% Bibliography database: software tools, blog posts and papers on source-code /
% programming-language classification, text classification, neural language
% models, incremental learning, and the Software Heritage archive.
%
% Conventions used in this file:
%   * one field per line, lowercase entry types and standard field names;
%   * BibDesk-managed fields (Date-Added, Date-Modified, Bdsk-Url-N) keep the
%     spelling BibDesk wrote them with;
%   * personal names are stored as "Last, First"; corporate authors are wrapped
%     in an extra pair of braces so they are not split into first/last parts;
%   * non-ASCII letters are written as LaTeX escapes in special-character form,
%     e.g. {\"e}, so the file stays usable with classic 8-bit BibTeX;
%   * months use the standard three-letter macros (jan ... dec), unquoted;
%   * page ranges use a double hyphen.

% ---------------------------------------------------------------------------
% Web resources: software tools and blog posts
% ---------------------------------------------------------------------------

@misc{Aylien16,
  author        = {Aylien},
  title         = {Source Code Classification Using Deep Learning [blog post]},
  howpublished  = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
  month         = aug,
  year          = {2016},
  keywords      = {data science, research},
  Date-Added    = {2018-02-17 20:56:11 +0000},
  Date-Modified = {2018-02-17 21:00:33 +0000},
}

@misc{universal-ctags,
  author        = {{Universal Ctags Team}},
  title         = {Universal {Ctags}},
  howpublished  = {\url{http://ctags.io/}},
  year          = {2001--2018},
  Date-Added    = {2018-02-17 20:53:07 +0000},
  Date-Modified = {2018-02-17 20:54:44 +0000},
}

@misc{sloccount,
  author        = {Wheeler, David A.},
  title         = {{SLOCCount}},
  howpublished  = {\url{https://www.dwheeler.com/sloccount/}},
  year          = {2004--2018},
  Date-Added    = {2018-02-17 20:47:15 +0000},
  Date-Modified = {2018-02-17 20:51:51 +0000},
}

@misc{cloc,
  author        = {Danial, Al},
  title         = {cloc},
  howpublished  = {\url{https://github.com/AlDanial/cloc}},
  year          = {2006--2018},
  Date-Added    = {2018-02-17 20:46:02 +0000},
  Date-Modified = {2018-02-17 20:46:38 +0000},
}

@misc{guesslang,
  author        = {Somda, Y.},
  title         = {Guesslang},
  howpublished  = {\url{http://guesslang.readthedocs.io/}},
  year          = {2017--2018},
  Date-Added    = {2018-02-17 20:27:54 +0000},
  Date-Modified = {2018-02-17 20:43:42 +0000},
}

@misc{linguist,
  author        = {{GitHub}},
  title         = {Linguist},
  howpublished  = {\url{https://github.com/github/linguist}},
  year          = {2011--2018},
  Date-Added    = {2018-02-17 20:21:27 +0000},
  Date-Modified = {2018-02-17 20:26:46 +0000},
}

@misc{ohcount,
  author        = {{Black Duck Software}},
  title         = {Ohcount},
  howpublished  = {\url{https://github.com/blackducksoftware/ohcount}},
  year          = {2008--2018},
  Date-Added    = {2018-02-17 20:11:31 +0000},
  Date-Modified = {2018-02-17 21:03:52 +0000},
}

% NOTE(review): the URL below is byte-identical to the one in Aylien16, although
% the author, title and month differ. It looks like a copy-paste leftover;
% confirm the real address of this post before publishing.
@misc{Heres16,
  author       = {Heres, Dani{\"e}l},
  title        = {Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post]},
  howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
  month        = jul,
  year         = {2016},
}

@misc{MLatB16,
  author       = {{Machine Learning at Berkeley}},
  title        = {{Github} Programming Language Classification [blog post]},
  howpublished = {\url{https://ml.berkeley.edu/blog/2016/12/03/github/}},
  month        = dec,
  year         = {2016},
  keywords     = {data science, research},
}

% ---------------------------------------------------------------------------
% Programming-language identification / source-code classification
% ---------------------------------------------------------------------------

@inproceedings{vanDam16,
  author     = {van Dam, J. K. and Zaytsev, V.},
  title      = {Software Language Identification with Natural Language Classifiers},
  booktitle  = {2016 IEEE 23rd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
  volume     = {1},
  pages      = {624--628},
  month      = mar,
  year       = {2016},
  doi        = {10.1109/SANER.2016.92},
  keywords   = {meta data;natural language processing;pattern classification;program diagnostics;software maintenance;text analysis;embedded code fragments;file extensions;grammar-based text analysis;keyword search;legacy code analysis;multinominal na{\"i}ve Bayes;n-grams;natural language classifiers;natural language processing field;normalised compression distance;skip-grams;software artefact metadata;software language identification;statistical language models;universal IDE support;Cascading style sheets;HTML;Java;Natural languages;Software;Training;Training data;language identification;natural language processing;software language engineering},
  Bdsk-Url-1 = {http://dx.doi.org/10.1109/SANER.2016.92},
}

@article{Klein11,
  author        = {Klein, David and Murray, Kyle and Weber, Simon},
  title         = {Algorithmic Programming Language Identification},
  journal       = {CoRR},
  volume        = {abs/1106.4064},
  year          = {2011},
  url           = {http://arxiv.org/abs/1106.4064},
  archiveprefix = {arXiv},
  eprint        = {1106.4064},
  timestamp     = {Wed, 07 Jun 2017 14:41:07 +0200},
  biburl        = {http://dblp.org/rec/bib/journals/corr/abs-1106-4064},
  bibsource     = {dblp computer science bibliography, http://dblp.org},
  Bdsk-Url-1    = {http://arxiv.org/abs/1106.4064},
}

@inproceedings{Gilda17,
  author     = {Gilda, S.},
  title      = {Source code classification using Neural Networks},
  booktitle  = {2017 14th International Joint Conference on Computer Science and Software Engineering (JCSSE)},
  pages      = {1--6},
  month      = jul,
  year       = {2017},
  doi        = {10.1109/JCSSE.2017.8025917},
  keywords   = {feature extraction;learning (artificial intelligence);neural nets;pattern classification;programming languages;software engineering;source code (software);artificial neural network;convolutional neural network;file extension;intelligent feature extraction;multilayer neural network;neural networks;programming languages;software development industry;source code classification;supervised learning;word embedding layers;Feature extraction;HTML;Syntactics;Training;Artificial neural network;Feature extraction;Multi-layer neural network;Supervised learning},
  Bdsk-Url-1 = {http://dx.doi.org/10.1109/JCSSE.2017.8025917},
}

@article{Zevin17,
  author        = {Zevin, Shaul and Holzem, Catherine},
  title         = {Machine Learning Based Source Code Classification Using Syntax Oriented Features},
  journal       = {CoRR},
  volume        = {abs/1703.07638},
  year          = {2017},
  url           = {http://arxiv.org/abs/1703.07638},
  archiveprefix = {arXiv},
  eprint        = {1703.07638},
  timestamp     = {Wed, 07 Jun 2017 14:41:28 +0200},
  biburl        = {http://dblp.org/rec/bib/journals/corr/ZevinH17},
  bibsource     = {dblp computer science bibliography, http://dblp.org},
  Bdsk-Url-1    = {http://arxiv.org/abs/1703.07638},
}

@inproceedings{Ugurel02,
  author     = {Ugurel, Secil and Krovetz, Robert and Giles, C. Lee},
  title      = {What's the Code?: Automatic Classification of Source Code Archives},
  booktitle  = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series     = {KDD '02},
  pages      = {632--638},
  numpages   = {7},
  year       = {2002},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Edmonton, Alberta, Canada},
  isbn       = {1-58113-567-X},
  acmid      = {775141},
  doi        = {10.1145/775047.775141},
  url        = {http://doi.acm.org/10.1145/775047.775141},
  Bdsk-Url-1 = {http://doi.acm.org/10.1145/775047.775141},
  Bdsk-Url-2 = {http://dx.doi.org/10.1145/775047.775141},
}

@inproceedings{Khasnabish14,
  author    = {Khasnabish, Jyotiska Nath and Sodhi, Mitali and Deshmukh, Jayati and Srinivasaraghavan, G.},
  title     = {Detecting Programming Language from Source Code Using {Bayesian} Learning Techniques},
  booktitle = {Machine Learning and Data Mining in Pattern Recognition - 10th International Conference, {MLDM} 2014, St. Petersburg, Russia, July 21-24, 2014. Proceedings},
  pages     = {513--522},
  year      = {2014},
  url       = {https://doi.org/10.1007/978-3-319-08979-9_39},
  doi       = {10.1007/978-3-319-08979-9_39},
  timestamp = {Wed, 17 May 2017 14:25:11 +0200},
  biburl    = {http://dblp.org/rec/bib/conf/mldm/KhasnabishSDS14},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}

% ---------------------------------------------------------------------------
% Text classification and neural language models
% ---------------------------------------------------------------------------

@inproceedings{Wang15,
  author    = {Wang, Peng and Xu, Jiaming and Xu, Bo and Liu, Cheng{-}Lin and Zhang, Heng and Wang, Fangyuan and Hao, Hongwei},
  title     = {Semantic Clustering and Convolutional Neural Network for Short Text Categorization},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing of the Asian Federation of Natural Language Processing, {ACL} 2015, July 26-31, 2015, Beijing, China, Volume 2: Short Papers},
  pages     = {352--357},
  year      = {2015},
  url       = {http://aclweb.org/anthology/P/P15/P15-2058.pdf},
  timestamp = {Mon, 03 Aug 2015 08:13:34 +0200},
  biburl    = {http://dblp.org/rec/bib/conf/acl/WangXXLZWH15},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}

% A titled chapter inside an edited book: classic BibTeX wants incollection
% here (inbook ignores booktitle and complains when both author and editor
% are given).
@incollection{Aggarwal12,
  author    = {Aggarwal, Charu C. and Zhai, ChengXiang},
  editor    = {Aggarwal, Charu C. and Zhai, ChengXiang},
  title     = {A Survey of Text Classification Algorithms},
  booktitle = {Mining Text Data},
  year      = {2012},
  publisher = {Springer US},
  address   = {Boston, MA},
  pages     = {163--222},
  abstract  = {The problem of classification has been widely studied in the data mining, machine learning, database, and information retrieval communities with applications in a number of diverse domains, such as target marketing, medical diagnosis, news group filtering, and document organization. In this paper we will provide a survey of a wide variety of text classification algorithms.},
  isbn      = {978-1-4614-3223-4},
  doi       = {10.1007/978-1-4614-3223-4_6},
  url       = {https://doi.org/10.1007/978-1-4614-3223-4_6},
}

@article{Chen09,
  author   = {Chen, Jingnian and Huang, Houkuan and Tian, Shengfeng and Qu, Youli},
  title    = {Feature selection for text classification with {Na{\"i}ve} {Bayes}},
  journal  = {Expert Systems with Applications},
  volume   = {36},
  number   = {3, Part 1},
  pages    = {5432--5435},
  year     = {2009},
  issn     = {0957-4174},
  doi      = {10.1016/j.eswa.2008.06.054},
  url      = {http://www.sciencedirect.com/science/article/pii/S0957417408003564},
  keywords = {Text classification, Feature selection, Text preprocessing, Na{\"i}ve Bayes},
}

% The scraped record had journal "Ann arbor mi" and volume 48113, which are
% pieces of a postal address, plus a spurious "and others". The paper appeared
% in the SDAIR-94 symposium proceedings.
@inproceedings{Cavnar94,
  author    = {Cavnar, William B. and Trenkle, John M.},
  title     = {N-gram-based text categorization},
  booktitle = {Proceedings of {SDAIR}-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
  pages     = {161--175},
  year      = {1994},
}

@article{Kim15,
  author        = {Kim, Yoon and Jernite, Yacine and Sontag, David and Rush, Alexander M.},
  title         = {Character-Aware Neural Language Models},
  journal       = {CoRR},
  volume        = {abs/1508.06615},
  year          = {2015},
  url           = {http://arxiv.org/abs/1508.06615},
  archiveprefix = {arXiv},
  eprint        = {1508.06615},
  timestamp     = {Wed, 07 Jun 2017 14:41:17 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/KimJSR15},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

@article{Kim14,
  author        = {Kim, Yoon},
  title         = {Convolutional Neural Networks for Sentence Classification},
  journal       = {CoRR},
  volume        = {abs/1408.5882},
  year          = {2014},
  url           = {http://arxiv.org/abs/1408.5882},
  archiveprefix = {arXiv},
  eprint        = {1408.5882},
  timestamp     = {Wed, 07 Jun 2017 14:40:07 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/Kim14f},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% ---------------------------------------------------------------------------
% Libraries and toolkits
% ---------------------------------------------------------------------------

@inproceedings{kenlm,
  author    = {Heafield, Kenneth},
  title     = {{KenLM}: Faster and Smaller Language Model Queries},
  booktitle = {Proceedings of the {EMNLP} 2011 Sixth Workshop on Statistical Machine Translation},
  address   = {Edinburgh, Scotland, United Kingdom},
  pages     = {187--197},
  month     = jul,
  year      = {2011},
  url       = {https://kheafield.com/papers/avenue/kenlm.pdf},
}

@article{scikit-learn,
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  title   = {Scikit-learn: Machine Learning in {Python}},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
}

@misc{keras,
  author       = {Chollet, Fran{\c{c}}ois and others},
  title        = {Keras},
  howpublished = {\url{https://keras.io}},
  year         = {2015},
}

@misc{tensorflow2015-whitepaper,
  author       = {Abadi, Mart{\'\i}n and Agarwal, Ashish and Barham, Paul and Brevdo, Eugene and Chen, Zhifeng and Citro, Craig and Corrado, Greg S. and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Goodfellow, Ian and Harp, Andrew and Irving, Geoffrey and Isard, Michael and Jia, Yangqing and Jozefowicz, Rafal and Kaiser, Lukasz and Kudlur, Manjunath and Levenberg, Josh and Man{\'e}, Dandelion and Monga, Rajat and Moore, Sherry and Murray, Derek and Olah, Chris and Schuster, Mike and Shlens, Jonathon and Steiner, Benoit and Sutskever, Ilya and Talwar, Kunal and Tucker, Paul and Vanhoucke, Vincent and Vasudevan, Vijay and Vi{\'e}gas, Fernanda and Vinyals, Oriol and Warden, Pete and Wattenberg, Martin and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
  title        = {{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
  howpublished = {\url{https://www.tensorflow.org/}},
  year         = {2015},
}

% ---------------------------------------------------------------------------
% Incremental learning
% ---------------------------------------------------------------------------

@article{Gepperth16,
  author     = {Gepperth, Alexander and Karaoguz, Cem},
  title      = {A Bio-Inspired Incremental Learning Architecture for Applied Perceptual Problems},
  journal    = {Cognitive Computation},
  volume     = {8},
  number     = {5},
  pages      = {924--934},
  day        = {01},
  month      = oct,
  year       = {2016},
  issn       = {1866-9964},
  doi        = {10.1007/s12559-016-9389-5},
  url        = {https://doi.org/10.1007/s12559-016-9389-5},
  abstract   = {We present a biologically inspired architecture for incremental learning that remains resource-efficient even in the face of very high data dimensionalities (>1000) that are typically associated with perceptual problems. In particular, we investigate how a new perceptual (object) class can be added to a trained architecture without retraining, while avoiding the well-known catastrophic forgetting effects typically associated with such scenarios. At the heart of the presented architecture lies a generative description of the perceptual space by a self-organized approach which at the same time approximates the neighborhood relations in this space on a two-dimensional plane. This approximation, which closely imitates the topographic organization of the visual cortex, allows an efficient local update rule for incremental learning even in the face of very high dimensionalities, which we demonstrate by tests on the well-known MNIST benchmark. We complement the model by adding a biologically plausible short-term memory system, allowing it to retain excellent classification accuracy even under incremental learning in progress. The short-term memory is additionally used to reinforce new data statistics by replaying previously stored samples during dedicated ``sleep'' phases.},
  Bdsk-Url-1 = {https://doi.org/10.1007/s12559-016-9389-5},
}

@article{RebuffiKL16,
  author        = {Rebuffi, Sylvestre{-}Alvise and Kolesnikov, Alexander and Lampert, Christoph H.},
  title         = {{iCaRL}: Incremental Classifier and Representation Learning},
  journal       = {CoRR},
  volume        = {abs/1611.07725},
  year          = {2016},
  url           = {http://arxiv.org/abs/1611.07725},
  archiveprefix = {arXiv},
  eprint        = {1611.07725},
  timestamp     = {Wed, 07 Jun 2017 14:42:11 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/RebuffiKL16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

@article{Kemker17,
  author        = {Kemker, Ronald and Kanan, Christopher},
  title         = {{FearNet}: Brain-Inspired Model for Incremental Learning},
  journal       = {CoRR},
  volume        = {abs/1711.10563},
  year          = {2017},
  url           = {http://arxiv.org/abs/1711.10563},
  archiveprefix = {arXiv},
  eprint        = {1711.10563},
  timestamp     = {Mon, 04 Dec 2017 18:34:59 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1711-10563},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% ---------------------------------------------------------------------------
% Software Heritage
% ---------------------------------------------------------------------------

@inproceedings{DiCosmo17,
  author    = {Di Cosmo, Roberto and Zacchiroli, Stefano},
  title     = {{Software Heritage}: Why and How to Preserve Software Source Code},
  booktitle = {iPRES 2017: 14th International Conference on Digital Preservation},
  year      = {2017},
  abstract  = {Software is now a key component present in all aspects of our society. Its preservation has attracted growing attention over the past years within the digital preservation community. We claim that source code ``the only representation of software that contains human readable knowledge'' is a precious digital object that needs special handling: it must be a first class citizen in the preservation landscape and we need to take action immediately, given the increasingly more frequent incidents that result in permanent losses of source code collections. In this paper we present Software Heritage, an ambitious initiative to collect, preserve, and share the entire corpus of publicly accessible software source code. We discuss the archival goals of the project, its use cases and role as a participant in the broader digital preservation ecosystem, and detail its key design decisions. We also report on the project road map and the current status of the Software Heritage archive that, as of early 2017, has collected more than 3 billion unique source code files and 700 million commits coming from more than 50 million software development projects.},
}