% Aylien blog post on classifying source code with deep learning.
% The month uses the standard BibTeX macro so each style can render it its own way.
@misc{Aylien16,
  author        = {Aylien},
  title         = {Source Code Classification Using Deep Learning [blog post]},
  howpublished  = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
  month         = aug,
  year          = {2016},
  keywords      = {data science, research},
  date-added    = {2018-02-17 20:56:11 +0000},
  date-modified = {2018-02-17 21:00:33 +0000},
}

% Universal Ctags project home page.
% The corporate author is double-braced so BibTeX treats it as one indivisible
% name rather than first "Universal Ctags" / last "Team".
@misc{universal-ctags,
  author        = {{Universal Ctags Team}},
  title         = {Universal Ctags},
  howpublished  = {\url{http://ctags.io/}},
  year          = {2001--2018},
  date-added    = {2018-02-17 20:53:07 +0000},
  date-modified = {2018-02-17 20:54:44 +0000},
}

% SLOCCount tool home page.
@misc{sloccount,
  author        = {Wheeler, David A.},
  title         = {SLOCCount},
  howpublished  = {\url{https://www.dwheeler.com/sloccount/}},
  year          = {2004--2018},
  date-added    = {2018-02-17 20:47:15 +0000},
  date-modified = {2018-02-17 20:51:51 +0000},
}

% cloc tool, GitHub repository.
@misc{cloc,
  author        = {Danial, Al},
  title         = {cloc},
  howpublished  = {\url{https://github.com/AlDanial/cloc}},
  year          = {2006--2018},
  date-added    = {2018-02-17 20:46:02 +0000},
  date-modified = {2018-02-17 20:46:38 +0000},
}

% Guesslang documentation site.
@misc{guesslang,
  author        = {Somda, Y.},
  title         = {Guesslang},
  howpublished  = {\url{http://guesslang.readthedocs.io/}},
  year          = {2017--2018},
  date-added    = {2018-02-17 20:27:54 +0000},
  date-modified = {2018-02-17 20:43:42 +0000},
}

% Linguist library, GitHub repository.
% The organisation name is corrected to its official spelling and double-braced
% as a corporate author.
@misc{linguist,
  author        = {{GitHub}},
  title         = {Linguist},
  howpublished  = {\url{https://github.com/github/linguist}},
  year          = {2011--2018},
  date-added    = {2018-02-17 20:21:27 +0000},
  date-modified = {2018-02-17 20:26:46 +0000},
}

% Ohcount tool, GitHub repository.
% The corporate author is double-braced so it is not split into first "Black Duck" /
% last "Software" (which abbreviating styles would print as "B. D. Software").
@misc{ohcount,
  author        = {{Black Duck Software}},
  title         = {Ohcount},
  howpublished  = {\url{https://github.com/blackducksoftware/ohcount}},
  year          = {2008--2018},
  date-added    = {2018-02-17 20:11:31 +0000},
  date-modified = {2018-02-17 21:03:52 +0000},
}

% SANER 2016 paper on software language identification.
% Author names are restored from the garbled export form "J. K. v. Dam" to full
% "von Last, First" form, the page range uses an en-dash (double hyphen), and
% the month uses the standard macro.
@inproceedings{vanDam16,
  author     = {van Dam, Juriaan Kennedy and Zaytsev, Vadim},
  title      = {Software Language Identification with Natural Language Classifiers},
  booktitle  = {2016 IEEE 23rd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
  volume     = {1},
  pages      = {624--628},
  month      = mar,
  year       = {2016},
  doi        = {10.1109/SANER.2016.92},
  keywords   = {meta data;natural language processing;pattern classification;program diagnostics;software maintenance;text analysis;embedded code fragments;file extensions;grammar-based text analysis;keyword search;legacy code analysis;multinominal naïve Bayes;n-grams;natural language classifiers;natural language processing field;normalised compression distance;skip-grams;software artefact metadata;software language identification;statistical language models;universal IDE support;Cascading style sheets;HTML;Java;Natural languages;Software;Training;Training data;language identification;natural language processing;software language engineering},
  bdsk-url-1 = {http://dx.doi.org/10.1109/SANER.2016.92},
}

% arXiv preprint (dblp CoRR record) on algorithmic programming language identification.
@article{Klein11,
  author        = {Klein, David and Murray, Kyle and Weber, Simon},
  title         = {Algorithmic Programming Language Identification},
  journal       = {CoRR},
  volume        = {abs/1106.4064},
  year          = {2011},
  archiveprefix = {arXiv},
  eprint        = {1106.4064},
  url           = {http://arxiv.org/abs/1106.4064},
  timestamp     = {Wed, 07 Jun 2017 14:41:07 +0200},
  biburl        = {http://dblp.org/rec/bib/journals/corr/abs-1106-4064},
  bibsource     = {dblp computer science bibliography, http://dblp.org},
  bdsk-url-1    = {http://arxiv.org/abs/1106.4064},
}

% JCSSE 2017 paper on source code classification with neural networks.
% The page range uses an en-dash (double hyphen), the month uses the standard
% macro, and the author's given name is stored in full.
@inproceedings{Gilda17,
  author     = {Gilda, Shlok},
  title      = {Source code classification using Neural Networks},
  booktitle  = {2017 14th International Joint Conference on Computer Science and Software Engineering (JCSSE)},
  pages      = {1--6},
  month      = jul,
  year       = {2017},
  doi        = {10.1109/JCSSE.2017.8025917},
  keywords   = {feature extraction;learning (artificial intelligence);neural nets;pattern classification;programming languages;software engineering;source code (software);artificial neural network;convolutional neural network;file extension;intelligent feature extraction;multilayer neural network;neural networks;programming languages;software development industry;source code classification;supervised learning;word embedding layers;Feature extraction;HTML;Syntactics;Training;Artificial neural network;Feature extraction;Multi-layer neural network;Supervised learning},
  bdsk-url-1 = {http://dx.doi.org/10.1109/JCSSE.2017.8025917},
}

% arXiv preprint (dblp CoRR record) on source code classification with syntax-oriented features.
@article{Zevin17,
  author        = {Zevin, Shaul and Holzem, Catherine},
  title         = {Machine Learning Based Source Code Classification Using Syntax Oriented Features},
  journal       = {CoRR},
  volume        = {abs/1703.07638},
  year          = {2017},
  archiveprefix = {arXiv},
  eprint        = {1703.07638},
  url           = {http://arxiv.org/abs/1703.07638},
  timestamp     = {Wed, 07 Jun 2017 14:41:28 +0200},
  biburl        = {http://dblp.org/rec/bib/journals/corr/ZevinH17},
  bibsource     = {dblp computer science bibliography, http://dblp.org},
  bdsk-url-1    = {http://arxiv.org/abs/1703.07638},
}

% KDD '02 paper on automatic classification of source code archives (ACM record).
@inproceedings{Ugurel02,
  author     = {Ugurel, Secil and Krovetz, Robert and Giles, C. Lee},
  title      = {What's the Code?: Automatic Classification of Source Code Archives},
  booktitle  = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series     = {KDD '02},
  location   = {Edmonton, Alberta, Canada},
  pages      = {632--638},
  numpages   = {7},
  year       = {2002},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  isbn       = {1-58113-567-X},
  doi        = {10.1145/775047.775141},
  url        = {http://doi.acm.org/10.1145/775047.775141},
  acmid      = {775141},
  bdsk-url-1 = {http://doi.acm.org/10.1145/775047.775141},
  bdsk-url-2 = {http://dx.doi.org/10.1145/775047.775141},
}
	
% ACL 2015 short paper on short text categorization (dblp record).
@inproceedings{Wang15,
  author    = {Wang, Peng and Xu, Jiaming and Xu, Bo and Liu, Cheng{-}Lin and Zhang, Heng and Wang, Fangyuan and Hao, Hongwei},
  title     = {Semantic Clustering and Convolutional Neural Network for Short Text Categorization},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing of the Asian Federation of Natural Language Processing, {ACL} 2015, July 26-31, 2015, Beijing, China, Volume 2: Short Papers},
  pages     = {352--357},
  year      = {2015},
  url       = {http://aclweb.org/anthology/P/P15/P15-2058.pdf},
  timestamp = {Mon, 03 Aug 2015 08:13:34 +0200},
  biburl    = {http://dblp.org/rec/bib/conf/acl/WangXXLZWH15},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}

% MLDM 2014 paper on detecting programming languages with Bayesian learning (dblp record).
% "Bayesian" is brace-protected so sentence-casing styles keep its capital.
@inproceedings{Khasnabish14,
  author    = {Khasnabish, Jyotiska Nath and Sodhi, Mitali and Deshmukh, Jayati and Srinivasaraghavan, G.},
  title     = {Detecting Programming Language from Source Code Using {Bayesian} Learning Techniques},
  booktitle = {Machine Learning and Data Mining in Pattern Recognition - 10th International Conference, {MLDM} 2014, St. Petersburg, Russia, July 21-24, 2014. Proceedings},
  pages     = {513--522},
  year      = {2014},
  doi       = {10.1007/978-3-319-08979-9_39},
  url       = {https://doi.org/10.1007/978-3-319-08979-9_39},
  timestamp = {Wed, 17 May 2017 14:25:11 +0200},
  biburl    = {http://dblp.org/rec/bib/conf/mldm/KhasnabishSDS14},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}

% Blog post on detecting the programming language of code snippets.
% The accented letter is written as a BibTeX special character so 8-bit BibTeX
% sorts and labels the name correctly; the month uses the standard macro.
% NOTE(review): howpublished is the same URL as the Aylien16 entry, which looks
% like a copy-paste slip; confirm the post's real address.
@misc{Heres16,
  author       = {Heres, Dani{\"e}l},
  title        = {Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post]},
  howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
  month        = jul,
  year         = {2016},
}
	
% Survey chapter in the edited volume "Mining Text Data".
% Typed as incollection (a titled chapter in an edited book): the inbook type
% in classic styles ignores booktitle and warns when both author and editor
% are given, so the book title would not be printed.
@incollection{Aggarwal12,
  author    = {Aggarwal, Charu C. and Zhai, ChengXiang},
  editor    = {Aggarwal, Charu C. and Zhai, ChengXiang},
  title     = {A Survey of Text Classification Algorithms},
  booktitle = {Mining Text Data},
  pages     = {163--222},
  year      = {2012},
  publisher = {Springer US},
  address   = {Boston, MA},
  isbn      = {978-1-4614-3223-4},
  doi       = {10.1007/978-1-4614-3223-4_6},
  url       = {https://doi.org/10.1007/978-1-4614-3223-4_6},
  abstract  = {The problem of classification has been widely studied in the data mining, machine learning, database, and information retrieval communities with applications in a number of diverse domains, such as target marketing, medical diagnosis, news group filtering, and document organization. In this paper we will provide a survey of a wide variety of text classification algorithms.},
}

% Journal article on feature selection for Naive Bayes text classification.
% The DOI is stored bare (no resolver prefix), the page range uses an en-dash
% with no surrounding spaces, and the title's accented, capitalised method
% name is escaped and brace-protected for classic BibTeX.
@article{Chen09,
  author   = {Chen, Jingnian and Huang, Houkuan and Tian, Shengfeng and Qu, Youli},
  title    = {Feature selection for text classification with {Na{\"\i}ve} {Bayes}},
  journal  = {Expert Systems with Applications},
  volume   = {36},
  number   = {3, Part 1},
  pages    = {5432--5435},
  year     = {2009},
  issn     = {0957-4174},
  doi      = {10.1016/j.eswa.2008.06.054},
  url      = {http://www.sciencedirect.com/science/article/pii/S0957417408003564},
  keywords = {Text classification, Feature selection, Text preprocessing, Naïve Bayes},
}

% Machine Learning at Berkeley blog post on GitHub language classification.
% The corporate author is double-braced: unbraced, the lowercase "at" is parsed
% as a von particle and the name is split into first "Machine Learning",
% von "at", last "Berkeley". The month uses the standard macro.
@misc{MLatB16,
  author       = {{Machine Learning at Berkeley}},
  title        = {Github Programming Language Classification [blog post]},
  howpublished = {\url{https://ml.berkeley.edu/blog/2016/12/03/github/}},
  month        = dec,
  year         = {2016},
  keywords     = {data science, research},
}
	
% Cavnar and Trenkle's N-gram text categorization paper.
% The previous record was an auto-export artefact: the "journal" and "volume"
% held the authors' postal address (Ann Arbor, MI 48113), the publisher was the
% indexing site, and a spurious "and others" was appended to a two-author
% paper. It is recorded here as the SDAIR-94 proceedings paper it is.
@inproceedings{Cavnar94,
  author    = {Cavnar, William B. and Trenkle, John M.},
  title     = {N-Gram-Based Text Categorization},
  booktitle = {Proceedings of {SDAIR}-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
  pages     = {161--175},
  year      = {1994},
}

% arXiv preprint (dblp CoRR record) on character-aware neural language models.
@article{Kim15,
  author        = {Kim, Yoon and Jernite, Yacine and Sontag, David and Rush, Alexander M.},
  title         = {Character-Aware Neural Language Models},
  journal       = {CoRR},
  volume        = {abs/1508.06615},
  year          = {2015},
  archiveprefix = {arXiv},
  eprint        = {1508.06615},
  url           = {http://arxiv.org/abs/1508.06615},
  timestamp     = {Wed, 07 Jun 2017 14:41:17 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/KimJSR15},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% arXiv preprint (dblp CoRR record) on CNNs for sentence classification.
@article{Kim14,
  author        = {Kim, Yoon},
  title         = {Convolutional Neural Networks for Sentence Classification},
  journal       = {CoRR},
  volume        = {abs/1408.5882},
  year          = {2014},
  archiveprefix = {arXiv},
  eprint        = {1408.5882},
  url           = {http://arxiv.org/abs/1408.5882},
  timestamp     = {Wed, 07 Jun 2017 14:40:07 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/Kim14f},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% KenLM paper from the EMNLP 2011 Workshop on Statistical Machine Translation.
% Only the name "KenLM" is brace-protected; the colon is left outside the
% braces so sentence-casing styles still see it and keep the subtitle's first
% word capitalised. The month uses the standard macro.
@inproceedings{kenlm,
  author    = {Heafield, Kenneth},
  title     = {{KenLM}: Faster and Smaller Language Model Queries},
  booktitle = {Proceedings of the {EMNLP} 2011 Sixth Workshop on Statistical Machine Translation},
  address   = {Edinburgh, Scotland, United Kingdom},
  pages     = {187--197},
  month     = jul,
  year      = {2011},
  url       = {https://kheafield.com/papers/avenue/kenlm.pdf},
}

% Scikit-learn reference paper (JMLR).
% The whole word "Python" is brace-protected instead of its first letter only,
% which keeps the capital without splitting the word for kerning and hyphenation.
@article{scikit-learn,
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and
             Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and
             Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  title   = {Scikit-learn: Machine Learning in {Python}},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
}

% Keras library home page.
@misc{keras,
  author       = {Chollet, Fran\c{c}ois and others},
  title        = {Keras},
  howpublished = {\url{https://keras.io}},
  year         = {2015},
}

% TensorFlow white paper / project site.
% The stray leading space in the title is removed. All names are in
% "Last, First" form, replacing the mix of tied and untied "First Last" forms.
% Accented letters are written as BibTeX special characters (a brace group
% starting with the accent command) so they sort and label as single letters.
@misc{tensorflow2015-whitepaper,
  author       = {Abadi, Mart{\'\i}n and
                  Agarwal, Ashish and
                  Barham, Paul and
                  Brevdo, Eugene and
                  Chen, Zhifeng and
                  Citro, Craig and
                  Corrado, Greg S. and
                  Davis, Andy and
                  Dean, Jeffrey and
                  Devin, Matthieu and
                  Ghemawat, Sanjay and
                  Goodfellow, Ian and
                  Harp, Andrew and
                  Irving, Geoffrey and
                  Isard, Michael and
                  Jia, Yangqing and
                  Jozefowicz, Rafal and
                  Kaiser, Lukasz and
                  Kudlur, Manjunath and
                  Levenberg, Josh and
                  Man{\'e}, Dandelion and
                  Monga, Rajat and
                  Moore, Sherry and
                  Murray, Derek and
                  Olah, Chris and
                  Schuster, Mike and
                  Shlens, Jonathon and
                  Steiner, Benoit and
                  Sutskever, Ilya and
                  Talwar, Kunal and
                  Tucker, Paul and
                  Vanhoucke, Vincent and
                  Vasudevan, Vijay and
                  Vi{\'e}gas, Fernanda and
                  Vinyals, Oriol and
                  Warden, Pete and
                  Wattenberg, Martin and
                  Wicke, Martin and
                  Yu, Yuan and
                  Zheng, Xiaoqiang},
  title        = {{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
  howpublished = {\url{https://www.tensorflow.org/}},
  year         = {2015},
}


% Cognitive Computation article on a bio-inspired incremental learning architecture.
% The month uses the standard macro instead of a hard-coded abbreviation, so it
% renders consistently with the other entries under any style.
@article{Gepperth16,
  author     = {Gepperth, Alexander and Karaoguz, Cem},
  title      = {A Bio-Inspired Incremental Learning Architecture for Applied Perceptual Problems},
  journal    = {Cognitive Computation},
  volume     = {8},
  number     = {5},
  pages      = {924--934},
  day        = {01},
  month      = oct,
  year       = {2016},
  issn       = {1866-9964},
  doi        = {10.1007/s12559-016-9389-5},
  url        = {https://doi.org/10.1007/s12559-016-9389-5},
  abstract   = {We present a biologically inspired architecture for incremental learning that remains resource-efficient even in the face of very high data dimensionalities (>1000) that are typically associated with perceptual problems. In particular, we investigate how a new perceptual (object) class can be added to a trained architecture without retraining, while avoiding the well-known catastrophic forgetting effects typically associated with such scenarios. At the heart of the presented architecture lies a generative description of the perceptual space by a self-organized approach which at the same time approximates the neighborhood relations in this space on a two-dimensional plane. This approximation, which closely imitates the topographic organization of the visual cortex, allows an efficient local update rule for incremental learning even in the face of very high dimensionalities, which we demonstrate by tests on the well-known MNIST benchmark. We complement the model by adding a biologically plausible short-term memory system, allowing it to retain excellent classification accuracy even under incremental learning in progress. The short-term memory is additionally used to reinforce new data statistics by replaying previously stored samples during dedicated ``sleep'' phases.},
  bdsk-url-1 = {https://doi.org/10.1007/s12559-016-9389-5},
}
	
% arXiv preprint (dblp CoRR record) introducing iCaRL.
% The mixed-case name "iCaRL" is brace-protected so sentence-casing styles do
% not flatten it to "icarl".
@article{RebuffiKL16,
  author        = {Rebuffi, Sylvestre{-}Alvise and Kolesnikov, Alexander and Lampert, Christoph H.},
  title         = {{iCaRL}: Incremental Classifier and Representation Learning},
  journal       = {CoRR},
  volume        = {abs/1611.07725},
  year          = {2016},
  archiveprefix = {arXiv},
  eprint        = {1611.07725},
  url           = {http://arxiv.org/abs/1611.07725},
  timestamp     = {Wed, 07 Jun 2017 14:42:11 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/RebuffiKL16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% arXiv preprint (dblp CoRR record) introducing FearNet.
% The camel-case name "FearNet" is brace-protected so sentence-casing styles do
% not flatten it to "Fearnet".
@article{Kemker17,
  author        = {Kemker, Ronald and Kanan, Christopher},
  title         = {{FearNet}: Brain-Inspired Model for Incremental Learning},
  journal       = {CoRR},
  volume        = {abs/1711.10563},
  year          = {2017},
  archiveprefix = {arXiv},
  eprint        = {1711.10563},
  url           = {http://arxiv.org/abs/1711.10563},
  timestamp     = {Mon, 04 Dec 2017 18:34:59 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1711-10563},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% iPRES 2017 paper presenting the Software Heritage archive.
@inproceedings{DiCosmo17,
  author    = {Di Cosmo, Roberto and Zacchiroli, Stefano},
  title     = {Software Heritage: Why and How to Preserve Software Source Code},
  booktitle = {iPRES 2017: 14th International Conference on Digital Preservation},
  year      = {2017},
  abstract  = {Software is now a key component present in all aspects of our society. Its preservation has attracted growing attention over the past years within the digital preservation community. We claim that source code ``the only representation of software that contains human readable knowledge'' is a precious digital object that needs special handling: it must be a first class citizen in the preservation landscape and we need to take action immediately, given the increasingly more frequent incidents that result in permanent losses of source code collections. In this paper we present Software Heritage, an ambitious initiative to collect, preserve, and share the entire corpus of publicly accessible software source code. We discuss the archival goals of the project, its use cases and role as a participant in the broader digital preservation ecosystem, and detail its key design decisions. We also report on the project road map and the current status of the Software Heritage archive that, as of early 2017, has collected more than 3 billion unique source code files and 700 million commits coming from more than 50 million software development projects.},
}