diff --git a/docs/report/bib-rapport.bib b/docs/report/bib-rapport.bib
index 4f22d1f..8bde4a1 100644
--- a/docs/report/bib-rapport.bib
+++ b/docs/report/bib-rapport.bib
@@ -1,279 +1,333 @@
@misc{Aylien16,
Author = {Aylien},
Date-Added = {2018-02-17 20:56:11 +0000},
Date-Modified = {2018-02-17 21:00:33 +0000},
Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
Keywords = {data science, research},
Month = {August},
Title = {Source Code Classification Using Deep Learning [blog post]},
Year = {2016}}
@misc{universal-ctags,
Author = {Universal Ctags Team},
Date-Added = {2018-02-17 20:53:07 +0000},
Date-Modified = {2018-02-17 20:54:44 +0000},
Howpublished = {\url{http://ctags.io/}},
Title = {Universal Ctags},
Year = {2001--2018}}
@misc{sloccount,
Author = {David A. Wheeler},
Date-Added = {2018-02-17 20:47:15 +0000},
Date-Modified = {2018-02-17 20:51:51 +0000},
Howpublished = {\url{https://www.dwheeler.com/sloccount/}},
Title = {SLOCCount},
Year = {2004--2018}}
@misc{cloc,
Author = {Al Danial},
Date-Added = {2018-02-17 20:46:02 +0000},
Date-Modified = {2018-02-17 20:46:38 +0000},
Howpublished = {\url{https://github.com/AlDanial/cloc}},
Title = {cloc},
Year = {2006--2018}}
@misc{guesslang,
Author = {Y. Somda},
Date-Added = {2018-02-17 20:27:54 +0000},
Date-Modified = {2018-02-17 20:43:42 +0000},
Howpublished = {\url{http://guesslang.readthedocs.io/}},
Title = {Guesslang},
Year = {2017--2018}}
@misc{linguist,
Author = {Github},
Date-Added = {2018-02-17 20:21:27 +0000},
Date-Modified = {2018-02-17 20:26:46 +0000},
Howpublished = {\url{https://github.com/github/linguist}},
Title = {Linguist},
Year = {2011--2018}}
@misc{ohcount,
Author = {Black Duck Software},
Date-Added = {2018-02-17 20:11:31 +0000},
Date-Modified = {2018-02-17 21:03:52 +0000},
Title = {Ohcount},
Howpublished = {\url{https://github.com/blackducksoftware/ohcount}},
Year = {2008--2018}}
@inproceedings{vanDam16,
Author = {J. K. v. Dam and V. Zaytsev},
Booktitle = {2016 IEEE 23rd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
Doi = {10.1109/SANER.2016.92},
Keywords = {meta data;natural language processing;pattern classification;program diagnostics;software maintenance;text analysis;embedded code fragments;file extensions;grammar-based text analysis;keyword search;legacy code analysis;multinominal naïve Bayes;n-grams;natural language classifiers;natural language processing field;normalised compression distance;skip-grams;software artefact metadata;software language identification;statistical language models;universal IDE support;Cascading style sheets;HTML;Java;Natural languages;Software;Training;Training data;language identification;natural language processing;software language engineering},
Month = {March},
Pages = {624-628},
Title = {Software Language Identification with Natural Language Classifiers},
Volume = {1},
Year = {2016},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SANER.2016.92}}
@article{Klein11,
Archiveprefix = {arXiv},
Author = {David Klein and Kyle Murray and Simon Weber},
Bibsource = {dblp computer science bibliography, http://dblp.org},
Biburl = {http://dblp.org/rec/bib/journals/corr/abs-1106-4064},
Eprint = {1106.4064},
Journal = {CoRR},
Timestamp = {Wed, 07 Jun 2017 14:41:07 +0200},
Title = {Algorithmic Programming Language Identification},
Url = {http://arxiv.org/abs/1106.4064},
Volume = {abs/1106.4064},
Year = {2011},
Bdsk-Url-1 = {http://arxiv.org/abs/1106.4064}}
@inproceedings{Gilda17,
Author = {S. Gilda},
Booktitle = {2017 14th International Joint Conference on Computer Science and Software Engineering (JCSSE)},
Doi = {10.1109/JCSSE.2017.8025917},
Keywords = {feature extraction;learning (artificial intelligence);neural nets;pattern classification;programming languages;software engineering;source code (software);artificial neural network;convolutional neural network;file extension;intelligent feature extraction;multilayer neural network;neural networks;programming languages;software development industry;source code classification;supervised learning;word embedding layers;Feature extraction;HTML;Syntactics;Training;Artificial neural network;Feature extraction;Multi-layer neural network;Supervised learning},
Month = {July},
Pages = {1-6},
Title = {Source code classification using Neural Networks},
Year = {2017},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/JCSSE.2017.8025917}}
@article{Zevin17,
Archiveprefix = {arXiv},
Author = {Shaul Zevin and Catherine Holzem},
Bibsource = {dblp computer science bibliography, http://dblp.org},
Biburl = {http://dblp.org/rec/bib/journals/corr/ZevinH17},
Eprint = {1703.07638},
Journal = {CoRR},
Timestamp = {Wed, 07 Jun 2017 14:41:28 +0200},
Title = {Machine Learning Based Source Code Classification Using Syntax Oriented Features},
Url = {http://arxiv.org/abs/1703.07638},
Volume = {abs/1703.07638},
Year = {2017},
Bdsk-Url-1 = {http://arxiv.org/abs/1703.07638}}
@inproceedings{Ugurel02,
Acmid = {775141},
Address = {New York, NY, USA},
Author = {Ugurel, Secil and Krovetz, Robert and Giles, C. Lee},
Booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Doi = {10.1145/775047.775141},
Isbn = {1-58113-567-X},
Location = {Edmonton, Alberta, Canada},
Numpages = {7},
Pages = {632--638},
Publisher = {ACM},
Series = {KDD '02},
Title = {What's the Code?: Automatic Classification of Source Code Archives},
Url = {http://doi.acm.org/10.1145/775047.775141},
Year = {2002},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/775047.775141},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/775047.775141}}
@inproceedings{Wang15,
author = {Peng Wang and
Jiaming Xu and
Bo Xu and
Cheng{-}Lin Liu and
Heng Zhang and
Fangyuan Wang and
Hongwei Hao},
title = {Semantic Clustering and Convolutional Neural Network for Short Text
Categorization},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational
Linguistics and the 7th International Joint Conference on Natural
Language Processing of the Asian Federation of Natural Language Processing,
{ACL} 2015, July 26-31, 2015, Beijing, China, Volume 2: Short Papers},
pages = {352--357},
year = {2015},
url = {http://aclweb.org/anthology/P/P15/P15-2058.pdf},
timestamp = {Mon, 03 Aug 2015 08:13:34 +0200},
biburl = {http://dblp.org/rec/bib/conf/acl/WangXXLZWH15},
bibsource = {dblp computer science bibliography, http://dblp.org}
}
@inproceedings{Khasnabish14,
author = {Jyotiska Nath Khasnabish and
Mitali Sodhi and
Jayati Deshmukh and
G. Srinivasaraghavan},
title = {Detecting Programming Language from Source Code Using Bayesian Learning
Techniques},
booktitle = {Machine Learning and Data Mining in Pattern Recognition - 10th International
Conference, {MLDM} 2014, St. Petersburg, Russia, July 21-24, 2014.
Proceedings},
pages = {513--522},
year = {2014},
url = {https://doi.org/10.1007/978-3-319-08979-9_39},
doi = {10.1007/978-3-319-08979-9_39},
timestamp = {Wed, 17 May 2017 14:25:11 +0200},
biburl = {http://dblp.org/rec/bib/conf/mldm/KhasnabishSDS14},
bibsource = {dblp computer science bibliography, http://dblp.org}
}
@misc{Heres16,
Author = {Daniël Heres},
Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
Month = {July},
Title = {Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post]},
Year = {2016}}
@Inbook{Aggarwal12,
author={Aggarwal, Charu C.
and Zhai, ChengXiang},
editor={Aggarwal, Charu C.
and Zhai, ChengXiang},
title={A Survey of Text Classification Algorithms},
bookTitle={Mining Text Data},
year={2012},
publisher={Springer US},
address={Boston, MA},
pages={163--222},
abstract={The problem of classification has been widely studied in the data mining, machine learning, database, and information retrieval communities with applications in a number of diverse domains, such as target marketing, medical diagnosis, news group filtering, and document organization. In this paper we will provide a survey of a wide variety of text classification algorithms.},
isbn={978-1-4614-3223-4},
doi={10.1007/978-1-4614-3223-4_6},
url={https://doi.org/10.1007/978-1-4614-3223-4_6}
}
@article{Chen09,
title = {Feature selection for text classification with Naïve Bayes},
journal = {Expert Systems with Applications},
volume = {36},
number = {3, Part 1},
pages = {5432 - 5435},
year = {2009},
issn = {0957-4174},
doi = {https://doi.org/10.1016/j.eswa.2008.06.054},
url = {http://www.sciencedirect.com/science/article/pii/S0957417408003564},
author = {Jingnian Chen and Houkuan Huang and Shengfeng Tian and Youli Qu},
keywords = {Text classification, Feature selection, Text preprocessing, Naïve Bayes}
}
@misc{MLatB16,
Author = {Machine Learning at Berkeley},
Howpublished = {\url{https://ml.berkeley.edu/blog/2016/12/03/github/}},
Keywords = {data science, research},
Month = {December},
Title = {Github Programming Language Classification [blog post]},
Year = {2016}
}
@article{Cavnar94,
title={N-gram-based text categorization},
author={Cavnar, William B and Trenkle, John M and others},
journal={Ann arbor mi},
volume={48113},
number={2},
pages={161--175},
year={1994},
publisher={Citeseer}
}
@article{Kim15,
author = {Yoon Kim and
Yacine Jernite and
David Sontag and
Alexander M. Rush},
title = {Character-Aware Neural Language Models},
journal = {CoRR},
volume = {abs/1508.06615},
year = {2015},
url = {http://arxiv.org/abs/1508.06615},
archivePrefix = {arXiv},
eprint = {1508.06615},
timestamp = {Wed, 07 Jun 2017 14:41:17 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/KimJSR15},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Kim14,
author = {Yoon Kim},
title = {Convolutional Neural Networks for Sentence Classification},
journal = {CoRR},
volume = {abs/1408.5882},
year = {2014},
url = {http://arxiv.org/abs/1408.5882},
archivePrefix = {arXiv},
eprint = {1408.5882},
timestamp = {Wed, 07 Jun 2017 14:40:07 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/Kim14f},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{kenlm,
author = {Kenneth Heafield},
title = {{KenLM:} Faster and Smaller Language Model Queries},
year = {2011},
month = {July},
booktitle = {Proceedings of the {EMNLP} 2011 Sixth Workshop on Statistical Machine Translation},
address = {Edinburgh, Scotland, United Kingdom},
pages = {187--197},
url = {https://kheafield.com/papers/avenue/kenlm.pdf},
}
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
+}
+
+@misc{keras,
+ title={Keras},
+ author={Chollet, Fran\c{c}ois and others},
+ year={2015},
+ howpublished={\url{https://keras.io}},
+}
+
+@misc{tensorflow2015-whitepaper,
+title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
+howpublished={\url{https://www.tensorflow.org/}},
+author={
+ Mart\'{\i}n~Abadi and
+ Ashish~Agarwal and
+ Paul~Barham and
+ Eugene~Brevdo and
+ Zhifeng~Chen and
+ Craig~Citro and
+ Greg~S.~Corrado and
+ Andy~Davis and
+ Jeffrey~Dean and
+ Matthieu~Devin and
+ Sanjay~Ghemawat and
+ Ian~Goodfellow and
+ Andrew~Harp and
+ Geoffrey~Irving and
+ Michael~Isard and
+ Yangqing Jia and
+ Rafal~Jozefowicz and
+ Lukasz~Kaiser and
+ Manjunath~Kudlur and
+ Josh~Levenberg and
+ Dandelion~Man\'{e} and
+ Rajat~Monga and
+ Sherry~Moore and
+ Derek~Murray and
+ Chris~Olah and
+ Mike~Schuster and
+ Jonathon~Shlens and
+ Benoit~Steiner and
+ Ilya~Sutskever and
+ Kunal~Talwar and
+ Paul~Tucker and
+ Vincent~Vanhoucke and
+ Vijay~Vasudevan and
+ Fernanda~Vi\'{e}gas and
+ Oriol~Vinyals and
+ Pete~Warden and
+ Martin~Wattenberg and
+ Martin~Wicke and
+ Yuan~Yu and
+ Xiaoqiang~Zheng},
+ year={2015},
}
\ No newline at end of file
diff --git a/docs/report/report-en.pdf b/docs/report/report-en.pdf
index 8fe4843..88dede2 100644
Binary files a/docs/report/report-en.pdf and b/docs/report/report-en.pdf differ
diff --git a/docs/report/report-en.tex b/docs/report/report-en.tex
index c65e740..ae9d24f 100644
--- a/docs/report/report-en.tex
+++ b/docs/report/report-en.tex
@@ -1,461 +1,555 @@
\documentclass[a4paper,12pt]{article}
\usepackage[english]{babel}
\usepackage[parfill]{parskip}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{xunicode}
\usepackage[charter]{mathdesign}
\usepackage{url}
\usepackage{hyperref}
\usepackage{multirow}
\usepackage[toc,page]{appendix}
\usepackage{tabularx}
\usepackage{longtable}
\DeclareTextCommand{\nobreakspace}{T1}{\leavevmode\nobreak\ }
\title{Large-scale Programming Language Detection}
\author{Yuan YIN}
\date{}
\begin{document}
\maketitle
\begin{abstract}
(to be completed)
\end{abstract}
\tableofcontents
\section{Introduction}
Programming Language Detection is a problem of identifying which programming language is a piece of source code written in. We here define the piece of source code as a textual sequential representation of an artefact, which is normally in the form of character sequence or, more generally, byte sequence. More precisely, the objective is to build a model which could predict the language of a given byte sequence.
The formal definition of the problem as follows: on the input, given a byte sequence $d$ and the number of a languages $n$,
\[l_d = \underset{l_i\in \{l_1, ..., l_n\}}{\arg \max}\ m(d, l_i),\]
where $l_d$ is the projected language, model $m$ calculates a value indicating the likelihood of a document written in language $l_i$ and the most likely one is chosen as the recognised language of the document.
In general, Programming Language Detection could be utilised in different situations, here are several example applications: language composition of software project in version control systems. For example, GitHub team is developing the project Linguist to return which languages are the project written in; code searching in plain text, in order to track the popularity of a language; language detection helps also IDEs to choose the language whose support functionalities, like syntax highlighting, are implemented.
We dive into this problem in the context of \emph{Software Heritage}. \emph{Software Heritage}, initiated by Inria, is an archive in which 4 billions source code files from 80 millions projects are stored.
The reason why the language detection is requested by \emph{Software Heritage} is that the language of a file could not be found in its filename extension. In \emph{Software Heritage}, every source code file is a blob which contains raw content of the file, that means a sequence of bytes without any extra information, such as filename (including filename extension), metadata, \emph{etc}. Since each blob could be represented by an intrinsic identifier generated from the blob itself, the duplication of files is avoided. For this reason, all existing tools depending on filenames fail in our context, and the methods for recognising the language from a sequence of bytes is strongly demanded.
(To be fixed after the redaction)
In this report, we introduce briefly the state-of-the-art methods in Section 2. In Section 3, the procedure of making a feasible dataset is related. In Section 4, we explain the methods that we took in account for the evaluation.
We provide the implemented methods and more detailed results on Forge of \emph{Software Heritage} \footnote{\url{http://}}.
\section{Related Works}
The existing approaches could be divided into two categories: practical methods and machine learning methods.
Practical methods are mostly based on several empirical or external information, basic ideas are presented as follows:
\begin{itemize}
\item Judging from filename extension. Ohcount\cite{ohcount} and Linguist\cite{linguist} practice the detection by hashing filename extension. The problem from this straightforward method is that some extensions are related to different languages, \emph{e.g.} \texttt{*.m} refers to a file written in Objective-C or MATLAB, \texttt{*.pl} points to Python or Prolog.
\item Grammar-based approaches. The principal is to parse through all languages, which is complex in modelling and demand an heavy consumption of calculation time.
\item Heuristics approaches. Most of them, such as SLOCCount\cite{sloccount}, use predefined regular expressions to capture empirically discovered features, \emph{e.g.} a file start with ``\texttt{\#include}'' is probably written in C. Some other looks for hints in the file, such as shebang lines, Vim modelines, Emacs modelines, \emph{etc}.
\end{itemize}
In Machine learning, the problem is regarded as a sub-problem of \emph{text categorisation} or \emph{text classification}, which means that given a piece of text, we find a function that predicts which category the text belongs to. The state-of-the-art methods build such function based on example input-output pairs, which are categorised as \emph{supervised learning}.
Ugurel \emph{et al.} \cite{Ugurel02} selects firstly the features by Expected Entropy Loss for each language, then vectorise the tested document into a vector representing the presence of a selected feature. Since Support Vector Machine (SVM) is binary classifier, the $n$-class classification is resolved by training $n \choose 2$ SVMs in the form of decision tree. Van Dam and Zaytsev \cite{vanDam16} test several popular and performant methods in Natural Language Processing. Multi-nominal Naïve Bayes (MNB), one of the variants of Naïve Bayes Classifiers, utilises unified frequency of a word or a sequence of words in a byte-sequence to decide the most possibly corresponding programming language. $N$-gram model and skip-gram model calculate for each gram the possibility of its appearance after $N$ grams. Normalised Compression Distance compares a piece of compressed code to the examples in the training set, then chooses the nearest language on as projection. MNB and $N$-gram model outperform others according to the experimental results. Gilda\cite{Gilda17} adopts a general setup of Convolutional Neurone Network (ConvNet) in NLP and proofs its performance.
\section{Dataset}
+We considered either applying supervised learning and unsupervised learning for the problem. However, the usage of unsupervised learning is quite limited (we will talk about it later in Section 6). We then focus on supervised methods.
+
Supervised learning methods require a dataset containing labeled inputs to train and to evaluate the model. Nowadays, since Programming Language Detection is not seriously considered as an important subject in machine learning, for the reason that it could be resolved by adopting existing classifiers of ML, the articles are rarely accompanied by a publicly available dataset. Therefore, we natively build a novel dataset for our experiments.
GitHub\footnote{\url{https://www.github.com/}} is one of the most popular web-based hosting service for Git version control system, reporting having more than 57 million repositories. We decide to build the dataset using GitHub.
\paragraph{Ground Truth Supposition}
In the context of \emph{Software Heritage}, our aim is to cover as many languages as possible for classification, thus the dataset we build possesses inevitably a large amount of files, which is unaffordable to be labeled manually. We thus seek help from automatic labelling tools.
Linguist \cite{linguist} is the tool of language detection developed by the GitHub team for unveiling the language composition in git repository, service provided on GitHub through API. There exists a command line version Linguist producing list of files by language for repository. Given that filename extensions are visible for Linguist and such features boost enormously on accuracy of classification (we will show this claim in later experiment), we suppose that the language recognised by Linguist is the ground truth language attributed to it.
\paragraph{Source Code Recuperation and Languages Included}
The dataset is built in the context of \emph{Software Heritage}. Therefore, the list of languages we consider integrating in the system covers as many languages as possible.
We initially took the entire language list of Linguist (version 6.0.1) into account for repository fetching. For each language, we fetch the first 75 repositories which top on the list ordered by number of stars, manifesting the popularity of the repository. To avoid huge repositories, we ignore all repositories whose size is superior to 150 MiB.
We then eliminate some languages, \emph{i.e.} data description languages, which we could not fetch any repository from GitHub. We successfully fetched 323 valid languages, showed in Table~\ref{tab:lan}.
\section{Methods for Evaluation}
In this section, we describe several NLP methods here tested on our dataset:
\begin{itemize}
\item $n$-gram-based frequency distance model,
\item $n$-gram model,
\item Multinominal Naïve Bayes (MNB), and
\item Convolutional Neurone Networks (ConvNet).
\end{itemize}
The first approach is regarded as a baseline method for the evaluation of the accuracy and the efficiency of the model.
+Given that in \emph{Software Heritage} every file is only a sequence of bytes which we are not able to assert its encoding, even unable to judge whether it is a binary file or not, we are willing to discover the approaches at byte level.
+
\subsection{Baseline: $n$-gram-based frequency distance}
\paragraph{$n$-gram}
An $n$-gram is a slice of a larger sequence with $n$ units. In NLP, the sequence is naturally the string. Depending on different problems, an unit represents a character or a word.
For example, the string ``print(n)'' with 8 characters could generate following character based $n$-grams:
\begin{itemize}
\item unigrams: \texttt{p, r, ..., )}
\item bigrams: \texttt{\textvisiblespace p, pr, ri, ..., n), )\textvisiblespace}
\item trigrams: \texttt{\textvisiblespace\textvisiblespace p, \textvisiblespace pr, pri, rit, ..., n)\textvisiblespace, )\textvisiblespace\textvisiblespace}
\item ...
\end{itemize}
or word-based $n$-grams:
\begin{itemize}
\item unigrams: \texttt{, print, (, n, ), }
\item bigrams: \texttt{ print, print (, ( n, n ), ) }
\item trigrams: \texttt{ print (, print ( n, ( n ), n ) }
\item ...
\end{itemize}
-Strings are often padded with start mark \texttt{} and end mark \texttt{}. In general, a $k$-unity sequence generates exactly $k-(n-1)$ n-grams.
+Strings are often padded with start marker \texttt{} and end marker \texttt{}. In general, a $k$-unity sequence generates exactly $k-(n-1)$ n-grams.
Cavnar and Trenkle \cite{Cavnar94} introduce an early NLP method using the distance between two $n$-gram frequency profiles.
According to Zipf's law, an empirical observation expressing that the $n$-th most common word in a human language occurs with a frequency inversely proportional to $n$. By retaining the most common words, it is possible to obtain a list describing the characteristics of the language.
Given a training set, at the training phase, a bag of $n$-grams is generated for each document in the training set. By gathering all bags of a language and counting the occurrences of each $n$-gram, a list of $n$-grams ordered by number of occurrences is created as the \emph{category profile} of the class. Only the most frequent 300 $n$-grams are kept, since they are highly correlated to the language.
The \emph{distance} between category profile and document profile is defined as follows:
Given trained category profiles $p_{l_1}, ..., p_{l_k}$ for $k$ languages, and document profile $p_{d}$ of test document $d$,
\[
distance(p_{l_i}, p_{d}) = \sum_{w\in p_{d}} | rankdist(w, p_d, p_{l_i})|
\]
\[
rankdist(w, p_d, p_{l_i})=
\begin{cases}
|rank(w, p_d) - rank(w, p_{l_i})| & \text{if }rank(w, p_{l_i}) \text{ exists,} \\
|p_d| & \text{else}
\end{cases}
\]
where $p$ containing an ordered list of word, $rank(w, p)$ returns the rank of $w$ in list $p$. $rankdist(w, p_d, p_{l_i})$ returns the out-of-place distance between two profiles if $w$ appears in $p_{l_i}$. If $w$ is an out-of-vocabulary word, the distance is the length of document profile $p_d$.
We then categorise the document as language with minimum distance.
\subsection{Multinominal Naïve Bayes}
This approach is introduced by van Dam and Zaytsev \cite{vanDam16}.
We assume in Naïve Bayes model that each word of the document is independent from each other. According to Bayes' Theorem,
\begin{eqnarray*}
P(l|w_1w_2...w_n) & = & \frac{P(w_1w_2...w_n|l)P(l)}{P(w_1w_2...w_n)} \\
& = & c\cdot P(l) P(w_1w_2...w_n|l)\\
& = & c\cdot P(l) \prod_{i = 1}^n P(w_i|l)
\end{eqnarray*}
Probability of $w_i$ in language $l$ is estimated by its occurrences in language with bag-of-word assumption:
\[
P(w_i|l) = \frac{C_l(w_i) + 1}{\sum_{w\in V}C_l(w) + |V|}
\]
where $C_l$ gives frequency of a word, $V$ is the vocabulary all languages.
Assumption of independence of words is quite limited for classification, in practice we actually use unigrams to 5-grams to replace words in the original method for taking the context of words into account.
\subsection{$n$-gram model}
The approach is introduced by van Dam and Zaytsev\cite{vanDam16}. As the precedent method, $n$-gram model utilises also statistical properties of $n$-grams but in another way.
Originally, $n$-gram model aims at predicting the possibility of an unit after knowing $n-1$ units occurred before. Given an unit $w_i$, the probability of its occurrence in a sequence is defined as:
\[
P(w_i | w_1...w_{i-1})
\]
According to Markov assumption, we omit older context in the sequence,
\[
P(w_i | w_1...w_{i-1}) \approx P(w_i | w_{i-(n-1)}...w_{i-1})
\]
In reality, the probability could be estimated by maximum likelihood estimation (MLE):
\[
P(w_i | w_{i-(n-1)}...w_{i-1}) = \frac{C(w_{i-(n-1)}...w_{i-1}w_{i})}{C(w_{i-(n-1)}...w_{i-1})}
\]
where $C$ gives the count of given $n$-gram.
By chain rule of probability and precedent estimation,
\[
P(w_1w_2...w_n)\approx \prod_{i = 1}^n P(w_i|w_{i-(n-1)}...w_{i-1})
\]
Now we transform such model into a classifier. Given a sequence $w_1w_2...w_n$, we assume that each language $l$ appears with the same probability and the probability of a given sequence is fixed.
According to Bayes' Theorem,
\begin{eqnarray*}
P(l|w_1w_2...w_n) & = & \frac{P(w_1w_2...w_n|l)P(l)}{P(w_1w_2...w_n)} \\
& = & c\cdot P(w_1w_2...w_n|l)\\
& = & c\cdot \prod_{i = 1}^n P(w_i|w_{i-(n-1)}...w_{i-1}, l)
\end{eqnarray*}
Rather than counting $n$-grams in the document, the probability of $n$-gram is estimated from the $n$-gram frequency of language, obtained from training set.
\[
P(w_i | w_{i-(n-1)}...w_{i-1}, l) = \frac{C_l(w_{i-(n-1)}...w_{i-1}w_{i})}{C_l(w_{i-(n-1)}...w_{i-1})}
\]
where $l$ is $C_l$ gives the count of language $l$ in training set.
While estimating the probability of $n$-grams, the smoothing techniques are required because of possible occurrence of \emph{out-of-vocabulary (OOV)} $n$-gram. In our case, Modified Kneser-Ney is applied since it is one of the methods that gives better experimental results in \cite{vanDam16}.
\subsection{Convolutional Neural Network (ConvNet)}
Convolutional Neural Network is one of the most popular machine learning branch usually used for image classification. It is a class of deep feed-forward artificial neural networks.
The following two architectures are tested in Section 4.
\subsubsection{Word-level Approach (Ongoing)}
+\label{sec:word-conv}
+
Although Gilda \cite{Gilda17} shows the performance of his own architecture, we are not able to rebuild the same network due to the lack of network architecture details and hyper-parameter configuration. We move our vision to other architectures.
-Kim \cite{Kim14} introduces a ConvNet for natural language sentence classification.
+Kim \cite{Kim14} introduces a ConvNet for natural language sentence classification. Figure~\ref{fig:word-convnet} illustrates the architecture of the network.
+\paragraph{Word Embedding}
In this architecture, word is the unit of the input. The $i$-th word $w_i$ is transformed into a vector $\mathbf{x}_i \in \mathbb{R}^k$ by word embedding level using \texttt{word2vec}. Word vectors are then concatenated to form the representation of the document, an $n\times k$ matrix.
The number of words $n$ of the document is fixed by the model. Therefore, a document longer than $n$ words needs to be pruned and the shorter one needs padding, by concatenating zero-vectors at the beginning or the end of the matrix.
+\paragraph{Feature Extraction}
In the convolutional levels, by using a \emph{filter} $\mathbf{w_h} \in R^{hk}$, a \emph{feature} $c_i$ is then generated,
\[c_i = f(\mathbf{w_h}\cdot(\mathbf{x}_i\ ||\ \mathbf{x}_{i+1}\ ||\ ...\ ||\ \mathbf{x}_{i+h-1}) + b)\]
where $||$ is vector concatenate operator, $b\in \mathbb{R}$ is a bias term, $f$ is an \emph{activation function} outputting a feature from a set of inputs.
This procedure utilises the similar principle of $n$-gram model, but rather than extracting features from original words, ConvNet works on their vector representation.
Each filter produces a \emph{feature map}, a vector $\mathbf{c}^h\in \mathbb{R}^{n - h+1}$. A max-over-time-pooling is then applied on the feature map $\mathbf{c}^h$, aiming at choosing the most important features with the highest values and avoiding overfitting at training stage. We then obtain the final feature map of this $h\times k$ filter.
-Several filters are often applied to obtain the corresponding feature maps. Each filter represents a \emph{} They are then concatenated vertically into a final feature map $\mathbf{c}$.
+Several filters are often applied to obtain the corresponding feature map, representing a \emph{channel}. They are then concatenated vertically into a final feature map $\mathbf{c}$.
+
+\paragraph{Classification}
+
+\emph{Fully connected layer} is a traditional multi-layer perceptron whose neurons are all connected to every neurons of the precedent and following levels. It uses a softmax activation function in the output layer.
+
+Feature map $\mathbf{c}$ is then put into a fully connected layer for extracting higher level features preparing for final classification. The output of these fully connected layers gives a vector indicating the score obtained for each class. The higher the score is given, the more possible the document is categorised into this class.
+
+\paragraph{Hyperparameters}
+\begin{table}[h]
+\centering
+\begin{tabular}{|c|c|}
+\hline
+Hyperparameter & Value \\
+\hline
+Character embedding size & 128 \\
+Filter sizes & [3, 4, 5] \\
+Number of filter matrices & 10 \\
+Nonlinearity function & ReLU \\
+Max-pooling size & 3 \\
+\end{tabular}
+\caption{\label{tab:hyp-word}}
+\end{table}
-\subsubsection{Byte-level Approach (Ongoing)}
+\subsubsection{Byte-level Approach}
+
+Kim \cite{Kim15} introduces a character-level ConvNet for language modelling. The original architecture is adapted by Chaitanya Joshi \footnote{\url{https://github.com/chaitjo/character-level-cnn}} for achieving a classification model by replacing recurrent layers with same fully connected layers as word-level approach of Section~\ref{sec:word-conv}.
+
+\paragraph{Word Embedding} Instead of using word or token as feature, character-level approach could make use of character (or byte) without building a large vocabulary. Although the size of vocabulary is commonly considerably small, \emph{e.g.} 256 when we use every byte as character.
+
+\paragraph{Feature Extraction and Classification} These two parts are similar to word-level approach.
+
+\paragraph{Hyperparameters}
+
\section{Experimental Results}
In this section, we present several questions that we are willing to answer by experiments on our customised dataset.
-\subsection{Implementation and System Setup (Ongoing)}
+\subsection{Implementation and System Setup}
-We implement the methods described in Section 4 in Python 3, in order of finally integrating one of them in \emph{Software Heritage}.
+We implement the methods described in Section 4 in Python 3, in order to finally integrate one of them in \emph{Software Heritage}.
-Baseline method is implemented natively in Python. We implement MNB using Scikit-learn. $n$-gram model is implemented with KenLM \cite{kenlm}. The last two ConvNets are both implemented with Keras using Tensorflow as backend.
+Baseline method is implemented natively in Python. We implement MNB using Scikit-learn. $n$-gram model is implemented with KenLM \cite{kenlm}. The last two ConvNets are both implemented with Keras \cite{keras} using Tensorflow \cite{tensorflow2015-whitepaper} as backend.
-We execute principally the training and test phase on a portable computer with 2.7 GHz Intel Core i5 processor running macOS 10.3. The training phase of two ConvNet methods are executed in an instance with one Intel Sandy Bridge virtual CPU, equipped with one NVIDIA Tesla K80 GPU on Google Cloud Platform. The instance is configured for making use of Tensorflow backend with GPU acceleration using CUDA Deep Neural Network Library (cuDNN).
+We execute principally the training and test phase on a portable computer with 2.7 GHz Intel Core i5 processor running macOS 10.3. The training phase of two ConvNet methods are executed in an instance running Ubuntu 16.04 with one Intel Sandy Bridge virtual CPU, equipped with one NVIDIA Tesla K80 GPU on Google Cloud Platform. The instance is configured for making use of Tensorflow backend with GPU acceleration using CUDA Deep Neural Network Library (cuDNN).
-\subsection{Training Set And Test Set (Ongoing)}
+\subsection{Training Set And Test Set}
-Files of the training set are randomly picked at first, then the test set is built from the remaining samples. To avoid the imbalance of the training set that impacts the performance of several methods in Section 4, for each language, we restrain the maximum number of training files to 500 and the maximum number of test files to 1000.
+Files of the training set are randomly picked from the dataset at first time. To avoid the imbalance of the training set that impacts the performance of several methods in Section 4, we restrain the maximum number of training files to 500 for each language. The test set is then built from remaining samples, it includes up to 1000 files for testing.
\subsection{Model Quality Metrics}
-Given a class $c$, test results of documents could be regrouped into 4 categories:
+For a class $c$, test results of documents could be regrouped into 4 categories, we mark $\hat{y_i}$ as ground truth class label, $y_i$ as predicted label:
\begin{itemize}
- \item True Positive (TP): document in $c$ predicted as member of $c$
- \item False Positive (FP): document not in $c$ predicted as member of $c$
- \item True Negative (TN): document not in $c$ predicted as member of a class other than $c$
- \item False Negative (FN): document in $c$ predicted as member of a class other than $c$
+ \item True Positive (TP): when $\hat{y_i} = l$ and $y_i = l$, \emph{i.e.} document written in $l$ is recognised as the same language.
+ \item False Positive (FP): when $\hat{y_i} \neq l$ and $y_i = l$, \emph{i.e.} document not written in languag $l$ is incorrectly recognised as $l$.
+ \item True Negative (TN): when $\hat{y_i} \neq l$ and $y_i \neq l$, \emph{i.e.} document not written in $l$ is rejected by $l$.
+ \item False Negative (FN): when $\hat{y_i} = l$ and $y_i \neq l$, \emph{i.e.} document written in $l$ is incorrectly rejected by $l$.
\end{itemize}
In the context of classification, the quality of methods is measured by Precision, Recall and $F_1$ score.
Recall is also called True Positive Rate (TPR). It is the fraction of correctly classified samples over all samples should be predicted as in $c$:
\[\text{recall} = \frac{\text{\#TP}}{\text{\#TP}+\text{\#FN}}\]
Precision is also called Positive Predictive Value (PPV). It is the fraction of correctly classified samples over all samples predicted as in $c$:
\[\text{precision} = \frac{\text{\#TP}}{\text{\#TP}+\text{\#FP}}\]
The harmonic mean of precision and recall is called $F_1$ score, introduced for balancing two metrics:
\[
F_1 = \left(\frac{\text{precision}^{-1} + \text{recall}^{-1}}{2}\right)^{-1} = 2\cdot\frac{\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}
\]
-In following subsections, we use $F_1$ as the measurement of the model validity by language.
+In following subsections, we use $F_1$ as the measurement of the model quality of each class' performance.
+
+Global model quality is evaluated by accuracy score:
+\[
+\text{accuracy}(y,\hat{y}) = \frac{1}{n}\sum_{i=0}^{n-1}1(y_i = \hat{y}_i)
+\]
+where $y$ is the predicted labels, $\hat{y}$ is the ground truth labels, $n$ is the number of samples, $1(\cdot)$ is the indicator function. The score shows the ratio of the number of samples whose projected label is the same as its ground truth to the total number of samples.
+
+\subsection{Experimental Results}
+
+\subsubsection{Quality of Models}
+
+The evaluation of the quality of models utilises the entire list of 323 languages.
-\subsection{Evaluation of Models}
+\paragraph{Overall Quality}
+Table~\ref{tab:total-comp} shows that baseline method reaches only 46.14\% of accuracy. Byte-level ConvNet marks the best accuracy score at 87.26\% which is much higher than word-level ConvNet. Both MNB and $n$-gram model reach acceptable results respectively at 85.10\% and 83.39\%.
-\subsection{Benchmark}
+\begin{table}[h]
+\centering
+ \begin{tabular}{|c|c|}
+ \hline
+ & Accuracy Score / \% \\ \hline
+ Baseline & 46.14 \\
+ MNB & 85.10 \\
+ $n$-gram model & 83.39 \\
+ Word-level ConvNet & 69.24 \\
+ Byte-level ConvNet & 87.26 \\ \hline
+ \end{tabular}
+ \caption{\label{tab:total-comp} Comparison of accuracy between evaluation methods.}
+\end{table}
+
+\paragraph{Inequality Between Classes} Although the overall score of Byte-level ConvNet reaches 87.26\%, $F_1$ score of several classes is much lower than
+
+\paragraph{Interclass Confusion (Ongoing)}
-\subsection{Filename Extension Is Important}
+We visualised the confusion matrices of methods in our repository. There exist significant confusion between similar languages, \emph{i.e.} C and C++; Objective-C, Objective-C++ and Objective-J; Befunge and HyPhy; Java and Processing; NetLinx, PAWN and Ruby; Javascript and Cycript, \emph{etc}.
+
+\subsubsection{Benchmark (Ongoing)}
+
+Table~\ref{tab:ben-train} shows that the first three basic NLP methods could be rapidly trained on CPU even a large number of classes are considered. The ConvNet methods demand more computing power in training stage.
+
+\begin{table}[h]
+\centering
+\begin{tabular}{|c|c|c|}
+\hline
+Method & Training Time & Test Time (per file) \\
+\hline
+Baseline & 1.8 h & 0.12 s \\
+MNB & 0.7 h & 2 s \\
+$n$-gram model & 0.8 h & 1.2 s \\
+Word-level ConvNet & 15.6 h (GPU 7.2 h) & 0.01 s \\
+Byte-level ConvNet & 20.8 h (GPU 1.6 h) & 0.01 s \\
+\hline
+\end{tabular}
+\caption{\label{tab:ben-train} Training time and test time benchmark on the same computer. Training time of ConvNet approaches on server is also indicated.}
+\end{table}
+
+
+\subsubsection{Word or Byte?}
+
+\subsubsection{Impact of Number of Languages (Ongoing)}
+
+
+\subsubsection{Filename Extension Is Important}
We know empirically that filename extension is a critical feature of classification. However, we hope to figure out how important it is. Knowing that ConvNet is good at choosing features distinguishing inputs, we test the performance using Byte-level ConvNet. In order to make use of the extension as feature, we add the corresponding extension at the end of the test file.
For convenience, we test only for 20 languages in the list. Table~\ref{tab:ext} shows that by adding the extension into the code the detection accuracy could be dramatically improved.
\begin{table}[h]
\centering
- \begin{tabular}{|c|c|c|}
+ \begin{tabular}{|c|c|}
\hline
- & Without Extension & With Extension \\
+ & Accuracy Score / \%\\
\hline
- Overall Accuracy / \% & 91.27 & 97.39 \\
+ Without Extension & 91.97 \\
+ With Extension & \textbf{97.37} \\
\hline
\end{tabular}
\caption{\label{tab:ext} Comparison of accuracy with extension and accuracy without extension with Byte-level ConvNet Classification on 20 classes.}
\end{table}
-\subsubsection{Impact of Number of Languages (Ongoing)}
-
-\subsubsection{Interclass Confusion (Ongoing)}
\section{Application in \emph{Software Heritage} (Ongoing)}
\section{Challenge for Large-scale Deployment (Ongoing)}
\subsection{Imbalance Among Classes}
Despite of our efforts on balancing the number of repositories for each class, a significant imbalance is eventually observed among language classes. For example, C++ has 180,093 files, but Omgrofl has only 16 files.
(Graph needed)
\subsection{Deploying in Distributed Systems}
\subsection{Discovering New Languages}
\subsubsection{Agglomerative Hierarchical Clustering}
\subsection{Integrating New Languages}
\subsubsection{Retraining}
\subsubsection{Incremental Learning}
\section{Conclusion (Ongoing)}
\clearpage
\begin{appendices}
\section{Language List}
\begin{table*}[h!]
\centering
\tiny
\begin{tabularx}{\textwidth}{|X|X|X|X|X|X|}
\hline
1C Enterprise & ABAP & ActionScript & Ada & Agda & AGS Script \\
\hline
Alloy & AMPL & AngelScript & ANTLR & Apex & API Blueprint \\
\hline
APL & AppleScript & Arc & ASP & AspectJ & Assembly \\
\hline
ATS & Augeas & AutoHotkey & AutoIt & Awk & Ballerina \\
\hline
Batchfile & Befunge & BitBake & BlitzBasic & BlitzMax & Bluespec \\
\hline
Boo & Brainfuck & Brightscript & Bro & C & C\# \\
\hline
C++ & Cap'n Proto & CartoCSS & Ceylon & Chapel & Charity \\
\hline
ChucK & Cirru & Clarion & Clean & Click & CLIPS \\
\hline
Clojure & CMake & COBOL & CoffeeScript & ColdFusion & Common Lisp \\
\hline
Common Workflow Language & Component Pascal & Cool & Coq & Crystal & Csound \\
\hline
Csound Document & Csound Score & CSS & Cuda & CWeb & Cycript \\
\hline
D & Dart & DataWeave & DIGITAL Command Language & DM & Dogescript \\
\hline
DTrace & Dylan & E & eC & ECL & Eiffel \\
\hline
Elixir & Elm & Emacs Lisp & EmberScript & EQ & Erlang \\
\hline
F\# & Factor & Fancy & Fantom & Filebench WML & FLUX \\
\hline
Forth & Fortran & FreeMarker & Frege & Game Maker Language & GAMS \\
\hline
GAP & GDB & GDScript & Genie & Genshi & Gherkin \\
\hline
GLSL & Glyph & Gnuplot & Go & Golo & Gosu \\
\hline
Grace & Grammatical Framework & Groovy & Hack & Harbour & Haskell \\
\hline
Haxe & HCL & HLSL & HTML & Hy & HyPhy \\
\hline
IDL & Idris & IGOR Pro & Inform 7 & Inno Setup & Io \\
\hline
Ioke & Isabelle & J & Jasmin & Java & JavaScript \\
\hline
Jolie & JSONiq & Julia & Jupyter Notebook & Kit & Kotlin \\
\hline
KRL & LabVIEW & Lasso & Lean & Lex & LFE \\
\hline
LilyPond & Limbo & Liquid & LiveScript & LLVM & Logos \\
\hline
Logtalk & LOLCODE & LookML & LoomScript & LSL & Lua \\
\hline
M & M4 & Makefile & Mako & Markdown & Mask \\
\hline
Mathematica & Matlab & Max & MAXScript & Mercury & Meson \\
\hline
Metal & Mirah & Modelica & Modula-2 & Module Management System & Monkey \\
\hline
Moocode & MoonScript & MQL4 & MQL5 & MTML & mupad \\
\hline
NCL & Nearley & Nemerle & nesC & NetLinx & NetLinx+ERB \\
\hline
NetLogo & NewLisp & Nextflow & Nim & Nit & Nix \\
\hline
NSIS & Nu & Objective-C & Objective-C++ & Objective-J & OCaml \\
\hline
Omgrofl & ooc & Opa & Opal & OpenEdge ABL & OpenSCAD \\
\hline
Ox & Oxygene & Oz & P4 & Pan & Papyrus \\
\hline
Parrot & Pascal & PAWN & Pep8 & Perl & Perl 6 \\
\hline
PHP & PicoLisp & PigLatin & Pike & PLpgSQL & PLSQL \\
\hline
PogoScript & Pony & PostScript & POV-Ray SDL & PowerBuilder & PowerShell \\
\hline
Processing & Prolog & Propeller Spin & Puppet & PureBasic & PureScript \\
\hline
Python & QMake & QML & R & Racket & Ragel \\
\hline
RAML & Rascal & REALbasic & Rebol & Red & Redcode \\
\hline
Ren'Py & RenderScript & reStructuredText & REXX & Ring & RMarkdown \\
\hline
RobotFramework & Roff & Rouge & RPC & Ruby & Rust \\
\hline
SaltStack & SAS & Scala & Scheme & Scilab & Self \\
\hline
ShaderLab & Shell & ShellSession & Shen & Slash & Smali \\
\hline
Smalltalk & Smarty & SMT & Solidity & SourcePawn & SQF \\
\hline
SQLPL & Squirrel & SRecode Template & Stan & Standard ML & Stata \\
\hline
SuperCollider & Swift & SystemVerilog & Tcl & Tea & Terra \\
\hline
TeX & Thrift & TI Program & TLA & Turing & TXL \\
\hline
TypeScript & Uno & UnrealScript & UrWeb & Vala & VCL \\
\hline
Verilog & VHDL & Vim script & Visual Basic & Volt & Vue \\
\hline
wdl & WebAssembly & WebIDL & wisp & X10 & xBase \\
\hline
XC & Xojo & XProc & XQuery & XS & XSLT \\
\hline
Xtend & Yacc & YAML & Zephir & Zimpl \\
\cline{1-5}
\end{tabularx}
\caption{\label{tab:lan} Language List of the dataset, 323 languages engaged.}
\end{table*}
\end{appendices}
\bibliography{bib-rapport}
\bibliographystyle{unsrt}
%Rapport
%
%Il doit faire de 15 à 30 pages et, dans la mesure du possible, doit être en grande part lisible par des non-spécialistes. Son plan peut être par exemple :
%présentation du domaine de recherche (le jury n'est pas constitué seulement de spécialistes du domaine, tenez-en compte !) ;
%énoncé et motivation du sujet ;
%résultats existants s'y rapportant (état de l'art, commentaire d'article, ...) ;
%vos résultats personnels (clairement identifiés comme tels).
%Le rapport devra être assorti d'un résumé d'une page compréhensible par quiconque.
\end{document}
\ No newline at end of file