diff --git a/docs/report/bib-rapport.bib b/docs/report/bib-rapport.bib index 097410a..076208c 100644 --- a/docs/report/bib-rapport.bib +++ b/docs/report/bib-rapport.bib @@ -1,10 +1,225 @@ -@incollection{zhang2015, -title = {Character-level Convolutional Networks for Text Classification}, -author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, -booktitle = {Advances in Neural Information Processing Systems 28}, -editor = {C. Cortes and N. D. Lawrence and D. D. Lee and M. Sugiyama and R. Garnett}, -pages = {649--657}, -year = {2015}, -publisher = {Curran Associates, Inc.}, -url = {http://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf} +@misc{Aylien16, + Author = {Aylien}, + Date-Added = {2018-02-17 20:56:11 +0000}, + Date-Modified = {2018-02-17 21:00:33 +0000}, + Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}}, + Keywords = {data science, research}, + Month = {August}, + Title = {Source Code Classification Using Deep Learning [blog post]}, + Year = {2016}} + +@misc{universal-ctags, + Author = {Universal Ctags Team}, + Date-Added = {2018-02-17 20:53:07 +0000}, + Date-Modified = {2018-02-17 20:54:44 +0000}, + Howpublished = {\url{http://ctags.io/}}, + Title = {Universal Ctags}, + Year = {2001--2018}} + +@misc{sloccount, + Author = {David A. Wheeler}, + Date-Added = {2018-02-17 20:47:15 +0000}, + Date-Modified = {2018-02-17 20:51:51 +0000}, + Howpublished = {\url{https://www.dwheeler.com/sloccount/}}, + Title = {SLOCCount}, + Year = {2004--2018}} + +@misc{cloc, + Author = {Al Danial}, + Date-Added = {2018-02-17 20:46:02 +0000}, + Date-Modified = {2018-02-17 20:46:38 +0000}, + Howpublished = {\url{https://github.com/AlDanial/cloc}}, + Title = {cloc}, + Year = {2006--2018}} + +@misc{guesslang, + Author = {Y. 
Somda}, + Date-Added = {2018-02-17 20:27:54 +0000}, + Date-Modified = {2018-02-17 20:43:42 +0000}, + Howpublished = {\url{http://guesslang.readthedocs.io/}}, + Title = {Guesslang}, + Year = {2017--2018}} + +@misc{linguist, + Author = {{GitHub}}, + Date-Added = {2018-02-17 20:21:27 +0000}, + Date-Modified = {2018-02-17 20:26:46 +0000}, + Howpublished = {\url{https://github.com/github/linguist}}, + Title = {Linguist}, + Year = {2011--2018}} + +@misc{ohcount, + Author = {{Black Duck Software}}, + Date-Added = {2018-02-17 20:11:31 +0000}, + Date-Modified = {2018-02-17 21:03:52 +0000}, + Title = {Ohcount}, + Howpublished = {\url{https://github.com/blackducksoftware/ohcount}}, + Year = {2008--2018}} + +@inproceedings{vanDam16, + Author = {van Dam, Juriaan Kennedy and Zaytsev, Vadim}, + Booktitle = {2016 IEEE 23rd International Conference on Software Analysis, Evolution, and Reengineering (SANER)}, + Doi = {10.1109/SANER.2016.92}, + Keywords = {meta data;natural language processing;pattern classification;program diagnostics;software maintenance;text analysis;embedded code fragments;file extensions;grammar-based text analysis;keyword search;legacy code analysis;multinominal naïve Bayes;n-grams;natural language classifiers;natural language processing field;normalised compression distance;skip-grams;software artefact metadata;software language identification;statistical language models;universal IDE support;Cascading style sheets;HTML;Java;Natural languages;Software;Training;Training data;language identification;natural language processing;software language engineering}, + Month = mar, + Pages = {624--628}, + Title = {Software Language Identification with Natural Language Classifiers}, + Volume = {1}, + Year = {2016}, + Bdsk-Url-1 = {http://dx.doi.org/10.1109/SANER.2016.92}} + +@article{Klein11, + Archiveprefix = {arXiv}, + Author = {David Klein and Kyle Murray and Simon Weber}, + Bibsource = {dblp computer science bibliography, http://dblp.org}, + Biburl = 
{http://dblp.org/rec/bib/journals/corr/abs-1106-4064}, + Eprint = {1106.4064}, + Journal = {CoRR}, + Timestamp = {Wed, 07 Jun 2017 14:41:07 +0200}, + Title = {Algorithmic Programming Language Identification}, + Url = {http://arxiv.org/abs/1106.4064}, + Volume = {abs/1106.4064}, + Year = {2011}, + Bdsk-Url-1 = {http://arxiv.org/abs/1106.4064}} + +@inproceedings{Gilda17, + Author = {S. Gilda}, + Booktitle = {2017 14th International Joint Conference on Computer Science and Software Engineering (JCSSE)}, + Doi = {10.1109/JCSSE.2017.8025917}, + Keywords = {feature extraction;learning (artificial intelligence);neural nets;pattern classification;programming languages;software engineering;source code (software);artificial neural network;convolutional neural network;file extension;intelligent feature extraction;multilayer neural network;neural networks;programming languages;software development industry;source code classification;supervised learning;word embedding layers;Feature extraction;HTML;Syntactics;Training;Artificial neural network;Feature extraction;Multi-layer neural network;Supervised learning}, + Month = {July}, + Pages = {1-6}, + Title = {Source code classification using Neural Networks}, + Year = {2017}, + Bdsk-Url-1 = {http://dx.doi.org/10.1109/JCSSE.2017.8025917}} + +@article{Zevin17, + Archiveprefix = {arXiv}, + Author = {Shaul Zevin and Catherine Holzem}, + Bibsource = {dblp computer science bibliography, http://dblp.org}, + Biburl = {http://dblp.org/rec/bib/journals/corr/ZevinH17}, + Eprint = {1703.07638}, + Journal = {CoRR}, + Timestamp = {Wed, 07 Jun 2017 14:41:28 +0200}, + Title = {Machine Learning Based Source Code Classification Using Syntax Oriented Features}, + Url = {http://arxiv.org/abs/1703.07638}, + Volume = {abs/1703.07638}, + Year = {2017}, + Bdsk-Url-1 = {http://arxiv.org/abs/1703.07638}} + +@inproceedings{Ugurel02, + Acmid = {775141}, + Address = {New York, NY, USA}, + Author = {Ugurel, Secil and Krovetz, Robert and Giles, C. 
Lee}, + Booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + Doi = {10.1145/775047.775141}, + Isbn = {1-58113-567-X}, + Location = {Edmonton, Alberta, Canada}, + Numpages = {7}, + Pages = {632--638}, + Publisher = {ACM}, + Series = {KDD '02}, + Title = {What's the Code?: Automatic Classification of Source Code Archives}, + Url = {http://doi.acm.org/10.1145/775047.775141}, + Year = {2002}, + Bdsk-Url-1 = {http://doi.acm.org/10.1145/775047.775141}, + Bdsk-Url-2 = {http://dx.doi.org/10.1145/775047.775141}} + +@inproceedings{Wang15, + author = {Peng Wang and + Jiaming Xu and + Bo Xu and + Cheng{-}Lin Liu and + Heng Zhang and + Fangyuan Wang and + Hongwei Hao}, + title = {Semantic Clustering and Convolutional Neural Network for Short Text + Categorization}, + booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational + Linguistics and the 7th International Joint Conference on Natural + Language Processing of the Asian Federation of Natural Language Processing, + {ACL} 2015, July 26-31, 2015, Beijing, China, Volume 2: Short Papers}, + pages = {352--357}, + year = {2015}, + url = {http://aclweb.org/anthology/P/P15/P15-2058.pdf}, + timestamp = {Mon, 03 Aug 2015 08:13:34 +0200}, + biburl = {http://dblp.org/rec/bib/conf/acl/WangXXLZWH15}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@inproceedings{Khasnabish14, + author = {Jyotiska Nath Khasnabish and + Mitali Sodhi and + Jayati Deshmukh and + G. Srinivasaraghavan}, + title = {Detecting Programming Language from Source Code Using Bayesian Learning + Techniques}, + booktitle = {Machine Learning and Data Mining in Pattern Recognition - 10th International + Conference, {MLDM} 2014, St. Petersburg, Russia, July 21-24, 2014. 
+ Proceedings}, + pages = {513--522}, + year = {2014}, + url = {https://doi.org/10.1007/978-3-319-08979-9_39}, + doi = {10.1007/978-3-319-08979-9_39}, + timestamp = {Wed, 17 May 2017 14:25:11 +0200}, + biburl = {http://dblp.org/rec/bib/conf/mldm/KhasnabishSDS14}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@misc{Heres16, + Author = {Daniël Heres}, + Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}}, + Month = {July}, + Title = {Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post]}, + Year = {2016}} + +@Inbook{Aggarwal12, +author={Aggarwal, Charu C. +and Zhai, ChengXiang}, +editor={Aggarwal, Charu C. +and Zhai, ChengXiang}, +title={A Survey of Text Classification Algorithms}, +bookTitle={Mining Text Data}, +year={2012}, +publisher={Springer US}, +address={Boston, MA}, +pages={163--222}, +abstract={The problem of classification has been widely studied in the data mining, machine learning, database, and information retrieval communities with applications in a number of diverse domains, such as target marketing, medical diagnosis, news group filtering, and document organization. 
In this paper we will provide a survey of a wide variety of text classification algorithms.}, +isbn={978-1-4614-3223-4}, +doi={10.1007/978-1-4614-3223-4_6}, +url={https://doi.org/10.1007/978-1-4614-3223-4_6} +} + +@article{Chen09, +title = {Feature selection for text classification with {Naïve Bayes}}, +journal = {Expert Systems with Applications}, +volume = {36}, +number = {3, Part 1}, +pages = {5432--5435}, +year = {2009}, +issn = {0957-4174}, +doi = {10.1016/j.eswa.2008.06.054}, +url = {http://www.sciencedirect.com/science/article/pii/S0957417408003564}, +author = {Jingnian Chen and Houkuan Huang and Shengfeng Tian and Youli Qu}, +keywords = {Text classification, Feature selection, Text preprocessing, Naïve Bayes} +} + +@misc{MLatB16, + Author = {{Machine Learning at Berkeley}}, + Howpublished = {\url{https://ml.berkeley.edu/blog/2016/12/03/github/}}, + Keywords = {data science, research}, + Month = dec, + Title = {Github Programming Language Classification [blog post]}, + Year = {2016} + } + +@inproceedings{Cavnar94, + title={N-gram-based text categorization}, + author={Cavnar, William B. and Trenkle, John M.}, + booktitle={Proceedings of the 3rd Annual Symposium on Document Analysis and Information Retrieval}, + series={SDAIR-94}, + location={Las Vegas, NV, USA}, + pages={161--175}, + year={1994}, + publisher={Citeseer} } \ No newline at end of file diff --git a/docs/report/rapport.pdf b/docs/report/rapport.pdf index 809f4b7..954bac2 100644 Binary files a/docs/report/rapport.pdf and b/docs/report/rapport.pdf differ diff --git a/docs/report/report-en.pdf b/docs/report/report-en.pdf new file mode 100644 index 0000000..9f75d7a Binary files /dev/null and b/docs/report/report-en.pdf differ diff --git a/docs/report/report-en.tex b/docs/report/report-en.tex index c853899..b993565 100644 --- a/docs/report/report-en.tex +++ b/docs/report/report-en.tex @@ -1,110 +1,190 @@ \documentclass[a4paper,12pt]{article} \usepackage[english]{babel} \usepackage[parfill]{parskip} \usepackage{graphicx} \usepackage{amssymb} \usepackage{amsmath} 
\usepackage{amsthm} \usepackage{xunicode} \usepackage[T1]{fontenc} -%\usepackage{times} +\usepackage{charter} +\usepackage{url} +\usepackage{hyperref} \title{Large-scale Programming Language Detection} \author{Yuan YIN} \date{} \begin{document} \maketitle \begin{abstract} (to be completed) \end{abstract} \section{Introduction} Programming Language Detection is a problem of identifying which programming language is a piece of source code written in. We here define the piece of source code as a textual sequential representation of an artefact, which is normally in the form of character sequence or, more generally, byte sequence. More precisely, the objective is to build a model which could predict the language of a given byte sequence. The formal definition of the problem as follows: on the input, given an byte sequence $d$ and the number of languages $n$, \[l_d = \underset{l_i\in \{l_1, ..., l_n\}}{\arg \max}\ m(d, l_i),\] -where $l_d$ is a language, model $m$ calculates a value indicating the likelihood of a document written in language $l_i$ and the most likely one is chosen as the language of the document. +where $l_d$ is the projected language, model $m$ calculates a value indicating the likelihood of a document written in language $l_i$ and the most likely one is chosen as the language of the document. In general, Programming Language Detection could be utilised in different situations, here are several example applications: language composition of software project in version control systems. For example, GitHub team is developing the project Linguist to return which languages are the project written in; code searching in plain text, in order to track the popularity of a language; language detection helps also IDEs to choose the language whose support functionalities, like syntax highlighting, are implemented. We dive into this problem in the context of \emph{Software Heritage}. 
\emph{Software Heritage}, initiated by Inria, is an archive in which 4 billions source code files from 80 millions projects are stored. The reason why the language detection is requested by \emph{Software Heritage} is that the language of a file could not be found in its filename extension. In \emph{Software Heritage}, every source code file is a blob which contains raw content of the file, that means a sequence of bytes without any extra information, such as filename (including filename extension), metadata, \emph{etc}. Since each blob could be represented by an intrinsic identifier generated from the blob itself, the duplication of files is avoided. For this reason, all existing tools depending on filenames fail in our context, and the methods for recognising the language from a sequence of bytes is strongly demanded. -In this report, we introduce briefly the state-of-the-art methods in Section 2. In Section 3, the procedure of making a feasible data set is related. In Section 4, we explain the methods that we took in account. +In this report, we briefly introduce the state-of-the-art methods in Section 2. In Section 3, the procedure of building a feasible dataset is described. In Section 4, we explain the methods that we took into account for the evaluation. -\section{State of the art} +\section{State of the Art} The existing approaches could be divided into two groups: practical methods and machine learning methods. Practical methods are mostly based on several empirical or external information, basic ideas are presented as follows: \begin{itemize} - \item Judging from filename extension. The problem from this straightforward method is that some extensions are related to different languages, \emph{e.g.} \texttt{.m} refers to Objective-C and MATLAB, \texttt{.pl} points to Python and Prolog. - \item Grammar-based approches. + \item Judging from filename extension. Ohcount\cite{ohcount} and Linguist\cite{linguist} perform the detection by hashing the filename extension. 
The problem with this straightforward method is that some extensions are shared by several languages, \emph{e.g.} \texttt{*.m} refers to a file written in Objective-C or MATLAB, \texttt{*.pl} points to Perl or Prolog. + \item Grammar-based approaches. The principle is to parse through all languages, which is complex in modelling and demands a heavy consumption of calculation time. + \item Heuristic approaches. Most of them, such as SLOCCount\cite{sloccount}, use predefined regular expressions to capture empirically discovered features, \emph{e.g.} a file starting with ``\texttt{\#include}'' is probably written in C. Some others look for hints in the file, such as shebang lines, Vim modelines, Emacs modelines, \emph{etc}. \end{itemize} -\section{Data set} +In machine learning, the problem is regarded as a sub-problem of \emph{text categorisation} or \emph{text classification}, which means that given a piece of text, we find a function that predicts which category the text belongs to. The state-of-the-art methods build such a function based on example input-output pairs, and are therefore categorised as \emph{supervised learning}. -Etant donné qu'aucun jeu de données de tel genre n'est publiquement disponible de la part des auteurs des articles précédents, nous avons besoin d'en construire un nouveau nous permettant de disposer d'un ensemble de fichiers avec les étiquettes d'identité indiquant le langage utilisé. Comme GitHub est l'un des services de gestion de développement de logiciels les plus populaires, nous pouvons récupérer un nombre considérable de fichiers pour chaque langage pris en compte. +Ugurel \emph{et al.} \cite{Ugurel02} first select the features by Expected Entropy Loss for each language, then vectorise the tested document into a vector representing the presence of each selected feature. Since a Support Vector Machine (SVM) is a binary classifier, the $n$-class classification is resolved by training $\binom{n}{2}$ SVMs in the form of a decision tree. 
Van Dam and Zaytsev \cite{vanDam16} test several popular and well-performing methods in Natural Language Processing. Multinomial Naïve Bayes (MNB), one of the variants of Naïve Bayes Classifiers, utilises the unified frequency of a word or a sequence of words in a byte-sequence to decide the most likely corresponding programming language. The $N$-gram model and the skip-gram model calculate for each gram the probability of its appearance given the preceding grams. Normalised Compression Distance compares a piece of compressed code to the examples in the training set, then chooses the nearest language as the prediction. MNB and the $N$-gram model outperform the others according to the experimental results. Gilda\cite{Gilda17} adopts a general setup of Convolutional Neural Network (ConvNet) in NLP and demonstrates its performance. -\subsection{Supposition sur la vérité au sol} +In this report, we will test several NLP methods on a larger dataset. -\textsf{Linguist}, logiciel développé par l'équipe GitHub, est capable de reconnaître plus de 400 langages de programmation, de représentation de données et de balisage pour les fichiers contenus dans un dépôt Git. +\section{Dataset} -\subsection{Langages} +Supervised learning methods require a dataset containing labelled inputs to train and to evaluate the model. Nowadays, since Programming Language Detection is not seriously considered as an important subject in machine learning, for the reason that it could be resolved by adopting existing ML classifiers, the articles are rarely accompanied by a publicly available dataset. Therefore, we build a novel dataset ourselves for our experiments. -\subsection{Récupération de données} +GitHub is one of the most popular web-based hosting services for the Git version control system, reportedly hosting more than 57 million repositories. We decide to build the dataset using GitHub. 
-Pour ce faire, +\paragraph{Ground Truth Supposition} -\section{} +In the context of \emph{Software Heritage}, our aim is to cover as many languages as possible for classification, thus the dataset we build inevitably contains a large number of files, too many to be labelled manually. We thus seek help from automatic labelling tools. -\subsection{Méthode de référence : classification en comparant les fréquences de n-grams} +Linguist\cite{linguist} is the language detection tool developed by the GitHub team for unveiling the language composition of a Git repository, a service provided on GitHub through its API. There also exists a command-line version of Linguist that produces, for a repository, the list of files by language. Given that filename extensions are visible to Linguist and such features enormously boost the accuracy of classification (we will support this claim in a later experiment), we suppose that the language recognised by Linguist for a file is the ground-truth language attributed to it. -\subsection{Classification avec le model de langage n-grams} +\paragraph{Repository Fetching} -\subsection{Modèle bayésien naïf multinominal} +For each language, we fetch the first 75 repositories that top the list ordered by number of stars, which reflects the popularity of a repository. To avoid huge repositories, we ignore all repositories whose size exceeds 150 MiB. All lists -\subsection{Réseau neuronal convolutif au niveau de tokens} +We initially took the entire language list of Linguist (version 6.0.1) into account for repository fetching, then we eliminated some languages, \emph{i.e.} data description languages, for which we could not fetch any repository. -\subsection{Réseau neuronal convolutif au niveau de bytes} +\paragraph{Imbalance Among Classes} -\section{Résultats expérimentaux} +Despite our efforts to balance the number of repositories for each class, a significant imbalance is eventually observed among language classes. 
For example, C++ has 180,093 files, but Omgrofl has only 16 files. + +(Graph needed) + +\paragraph{Training Set And Test Set} + +Files of the training set are randomly picked at first, then the test set is built from the remaining samples. To avoid an imbalance of the training set that would impact the performance of several methods in Section 4, for each language, we limit the maximum number of training files to 500 and the maximum number of test files to 1000. + +\section{Methods for Evaluation} + +In this section, we describe several NLP methods tested on our dataset: +\begin{itemize} + \item $n$-gram-based frequency distance model, + \item $n$-gram model, + \item Multinomial Naïve Bayes (MNB), and + \item Convolutional Neural Networks (ConvNet). +\end{itemize} +The first approach is regarded as a baseline method for the evaluation of the accuracy and the efficiency of the model. + +\subsection{Baseline: $n$-gram-based frequency distance} + +Cavnar and Trenkle \cite{Cavnar94} introduce an early NLP method using the distance between two $n$-gram frequency profiles. + +\paragraph{$n$-gram} + +An $n$-gram is a slice of $n$ units taken from a larger sequence. In NLP, the sequence is naturally a string. Depending on the problem, a unit represents a character or a word. + +For example, the string ``print(n)'' with 8 characters generates the following character-based $n$-grams: +\begin{itemize} + \item unigrams: \texttt{<s>, p, r, ..., ), </s>} + \item bigrams: \texttt{<s> p, p r, r i, ..., n ), ) </s>} + \item trigrams: \texttt{<s> p r, p r i, r i n, ..., ( n ), n ) </s>} + \item ... +\end{itemize} +or word-based $n$-grams: +\begin{itemize} + \item unigrams: \texttt{<s>, print, (, n, ), </s>} + \item bigrams: \texttt{<s> print, print (, ( n, n ), ) </s>} + \item trigrams: \texttt{<s> print (, print ( n, ( n ), n ) </s>} + \item ... +\end{itemize} + +Strings are often padded with \texttt{<s>} and \texttt{</s>} to mark their start and end. 
In general, a $k$-unit sequence generates exactly $k-(n-1)$ $n$-grams. + +Zipf's law is an empirical observation stating that the $n$-th most common word in a human language occurs with a frequency inversely proportional to $n$. By retaining the most common words, it is therefore possible to obtain a list describing the characteristics of the language. + +Given a training set, at the training phase, a bag of $n$-grams is generated for each document in the training set. By gathering all bags of a language and counting the occurrences of each $n$-gram, a list of $n$-grams ordered by number of occurrences is created as the \emph{category profile} of the class. Only the 300 most frequent $n$-grams are kept, since they are highly correlated to the language. + +The \emph{distance} between a category profile and a document profile is defined as follows: + +Given trained category profiles $p_{l_1}, \ldots, p_{l_k}$ for $k$ languages, and document profile $p_{d}$ of test document $d$, +\[ +\operatorname{distance}(p_{l_i}, p_{d}) = \sum_{w\in p_{d}} | \operatorname{rankdist}(w, p_d, p_{l_i})| +\] +\[ +\operatorname{rankdist}(w, p_d, p_{l_i})= +\begin{cases} +|\operatorname{rank}(w, p_d) - \operatorname{rank}(w, p_{l_i})| & \text{if }\operatorname{rank}(w, p_{l_i}) \text{ exists,} \\ +|p_d| & \text{else} +\end{cases} +\] +where a profile $p$ is an ordered list of words and $\operatorname{rank}(w, p)$ returns the rank of $w$ in list $p$. $\operatorname{rankdist}(w, p_d, p_{l_i})$ returns the out-of-place distance between the two profiles if $w$ appears in $p_{l_i}$. If $w$ is an out-of-vocabulary word, the distance is the length of the document profile $p_d$. + +We then categorise the document as the language with the minimum distance. + +\subsection{$n$-gram model} + +Like the previous method, the $n$-gram model also utilises statistical properties of $n$-grams, but in another way. 
+ +\subsection{Multinomial Naïve Bayes} + +\subsection{Convolutional Neural Network} + +\subsubsection{Word-level Approach} + +\subsubsection{Byte-level Approach} + +\section{Experimental Results} \subsection{Evaluation des modeles} \subsubsection{Impact de nombre de classes} \subsubsection{Confusion d'inter-classes} \subsection{Benchmark} \section{Défis à grande échelle} \subsection{Déployabilité des approches dans les systèmes distribués} \subsection{Découverte de nouvelles classes} \subsubsection{Regroupement hiérarchique agglomératif} \subsection{Apprentissage incrémental (partie optionelle)} \section{Conclusion} \bibliography{bib-rapport} -\bibliographystyle{plain} +\bibliographystyle{unsrt} %Rapport % %Il doit faire de 15 à 30 pages et, dans la mesure du possible, doit être en grande part lisible par des non-spécialistes. Son plan peut être par exemple : %présentation du domaine de recherche (le jury n'est pas constitué seulement de spécialistes du domaine, tenez-en compte !) ; %énoncé et motivation du sujet ; %résultats existants s'y rapportant (état de l'art, commentaire d'article, ...) ; %vos résultats personnels (clairement identifiés comme tels). %Le rapport devra être assorti d'un résumé d'une page compréhensible par quiconque. \tableofcontents \end{document} \ No newline at end of file