diff --git a/doc/review-en/Makefile b/doc/review-en/Makefile
new file mode 100644
index 0000000..5d57f04
--- /dev/null
+++ b/doc/review-en/Makefile
@@ -0,0 +1,12 @@
+.PHONY: all clean
+
+all: review-en.pdf clean
+
+clean:
+	rm -f *.aux *.bcf *.log *.xml *.toc */*.aux *.bbl *.blg
+
+%.pdf: %.tex %.bib
+	xelatex $*.tex
+	bibtex $*
+	xelatex $*.tex
+	xelatex $*.tex
diff --git a/doc/review-en/review-en.bib b/doc/review-en/review-en.bib
new file mode 100644
index 0000000..72211f1
--- /dev/null
+++ b/doc/review-en/review-en.bib
@@ -0,0 +1,214 @@
+@misc{Aylien16,
+	Author = {Aylien},
+	Date-Added = {2018-02-17 20:56:11 +0000},
+	Date-Modified = {2018-02-17 21:00:33 +0000},
+	Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
+	Keywords = {data science, research},
+	Month = {August},
+	Title = {Source Code Classification Using Deep Learning [blog post]},
+	Year = {2016}}
+
+@misc{universal-ctags,
+	Author = {Universal Ctags Team},
+	Date-Added = {2018-02-17 20:53:07 +0000},
+	Date-Modified = {2018-02-17 20:54:44 +0000},
+	Howpublished = {\url{http://ctags.io/}},
+	Title = {Universal Ctags},
+	Year = {2001--2018}}
+
+@misc{sloccount,
+	Author = {David A. Wheeler},
+	Date-Added = {2018-02-17 20:47:15 +0000},
+	Date-Modified = {2018-02-17 20:51:51 +0000},
+	Howpublished = {\url{https://www.dwheeler.com/sloccount/}},
+	Title = {SLOCCount},
+	Year = {2004--2018}}
+
+@misc{cloc,
+	Author = {Al Danial},
+	Date-Added = {2018-02-17 20:46:02 +0000},
+	Date-Modified = {2018-02-17 20:46:38 +0000},
+	Howpublished = {\url{https://github.com/AlDanial/cloc}},
+	Title = {cloc},
+	Year = {2006--2018}}
+
+@misc{guesslang,
+	Author = {Y. Somda},
+	Date-Added = {2018-02-17 20:27:54 +0000},
+	Date-Modified = {2018-02-17 20:43:42 +0000},
+	Howpublished = {\url{http://guesslang.readthedocs.io/}},
+	Title = {Guesslang},
+	Year = {2017--2018}}
+
+@misc{linguist,
+	Author = {GitHub},
+	Date-Added = {2018-02-17 20:21:27 +0000},
+	Date-Modified = {2018-02-17 20:26:46 +0000},
+	Howpublished = {\url{https://github.com/github/linguist}},
+	Title = {Linguist},
+	Year = {2011--2018}}
+
+@misc{ohcount,
+	Author = {Black Duck Software},
+	Date-Added = {2018-02-17 20:11:31 +0000},
+	Date-Modified = {2018-02-17 21:03:52 +0000},
+	Title = {Ohcount},
+	Howpublished = {\url{https://github.com/blackducksoftware/ohcount}},
+	Year = {2008--2018}}
+
+@inproceedings{vanDam16,
+	Author = {J. K. v. Dam and V. Zaytsev},
+	Booktitle = {2016 IEEE 23rd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
+	Doi = {10.1109/SANER.2016.92},
+	Keywords = {meta data;natural language processing;pattern classification;program diagnostics;software maintenance;text analysis;embedded code fragments;file extensions;grammar-based text analysis;keyword search;legacy code analysis;multinominal naïve Bayes;n-grams;natural language classifiers;natural language processing field;normalised compression distance;skip-grams;software artefact metadata;software language identification;statistical language models;universal IDE support;Cascading style sheets;HTML;Java;Natural languages;Software;Training;Training data;language identification;natural language processing;software language engineering},
+	Month = {March},
+	Pages = {624-628},
+	Title = {Software Language Identification with Natural Language Classifiers},
+	Volume = {1},
+	Year = {2016},
+	Bdsk-Url-1 = {http://dx.doi.org/10.1109/SANER.2016.92}}
+
+@article{Klein11,
+	Archiveprefix = {arXiv},
+	Author = {David Klein and Kyle Murray and Simon Weber},
+	Bibsource = {dblp computer science bibliography, http://dblp.org},
+	Biburl = {http://dblp.org/rec/bib/journals/corr/abs-1106-4064},
+	Eprint = {1106.4064},
+	Journal = {CoRR},
+	Timestamp = {Wed, 07 Jun 2017 14:41:07 +0200},
+	Title = {Algorithmic Programming Language Identification},
+	Url = {http://arxiv.org/abs/1106.4064},
+	Volume = {abs/1106.4064},
+	Year = {2011},
+	Bdsk-Url-1 = {http://arxiv.org/abs/1106.4064}}
+
+@inproceedings{Gilda17,
+	Author = {S. Gilda},
+	Booktitle = {2017 14th International Joint Conference on Computer Science and Software Engineering (JCSSE)},
+	Doi = {10.1109/JCSSE.2017.8025917},
+	Keywords = {feature extraction;learning (artificial intelligence);neural nets;pattern classification;programming languages;software engineering;source code (software);artificial neural network;convolutional neural network;file extension;intelligent feature extraction;multilayer neural network;neural networks;programming languages;software development industry;source code classification;supervised learning;word embedding layers;Feature extraction;HTML;Syntactics;Training;Artificial neural network;Feature extraction;Multi-layer neural network;Supervised learning},
+	Month = {July},
+	Pages = {1-6},
+	Title = {Source code classification using Neural Networks},
+	Year = {2017},
+	Bdsk-Url-1 = {http://dx.doi.org/10.1109/JCSSE.2017.8025917}}
+
+@article{Zevin17,
+	Archiveprefix = {arXiv},
+	Author = {Shaul Zevin and Catherine Holzem},
+	Bibsource = {dblp computer science bibliography, http://dblp.org},
+	Biburl = {http://dblp.org/rec/bib/journals/corr/ZevinH17},
+	Eprint = {1703.07638},
+	Journal = {CoRR},
+	Timestamp = {Wed, 07 Jun 2017 14:41:28 +0200},
+	Title = {Machine Learning Based Source Code Classification Using Syntax Oriented Features},
+	Url = {http://arxiv.org/abs/1703.07638},
+	Volume = {abs/1703.07638},
+	Year = {2017},
+	Bdsk-Url-1 = {http://arxiv.org/abs/1703.07638}}
+
+@inproceedings{Ugurel02,
+	Acmid = {775141},
+	Address = {New York, NY, USA},
+	Author = {Ugurel, Secil and Krovetz, Robert and Giles, C. Lee},
+	Booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+	Doi = {10.1145/775047.775141},
+	Isbn = {1-58113-567-X},
+	Location = {Edmonton, Alberta, Canada},
+	Numpages = {7},
+	Pages = {632--638},
+	Publisher = {ACM},
+	Series = {KDD '02},
+	Title = {What's the Code?: Automatic Classification of Source Code Archives},
+	Url = {http://doi.acm.org/10.1145/775047.775141},
+	Year = {2002},
+	Bdsk-Url-1 = {http://doi.acm.org/10.1145/775047.775141},
+	Bdsk-Url-2 = {http://dx.doi.org/10.1145/775047.775141}}
+
+@inproceedings{Wang15,
+  author    = {Peng Wang and
+               Jiaming Xu and
+               Bo Xu and
+               Cheng{-}Lin Liu and
+               Heng Zhang and
+               Fangyuan Wang and
+               Hongwei Hao},
+  title     = {Semantic Clustering and Convolutional Neural Network for Short Text
+               Categorization},
+  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational
+               Linguistics and the 7th International Joint Conference on Natural
+               Language Processing of the Asian Federation of Natural Language Processing,
+               {ACL} 2015, July 26-31, 2015, Beijing, China, Volume 2: Short Papers},
+  pages     = {352--357},
+  year      = {2015},
+  url       = {http://aclweb.org/anthology/P/P15/P15-2058.pdf},
+  timestamp = {Mon, 03 Aug 2015 08:13:34 +0200},
+  biburl    = {http://dblp.org/rec/bib/conf/acl/WangXXLZWH15},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@inproceedings{Khasnabish14,
+  author    = {Jyotiska Nath Khasnabish and
+               Mitali Sodhi and
+               Jayati Deshmukh and
+               G. Srinivasaraghavan},
+  title     = {Detecting Programming Language from Source Code Using Bayesian Learning
+               Techniques},
+  booktitle = {Machine Learning and Data Mining in Pattern Recognition - 10th International
+               Conference, {MLDM} 2014, St. Petersburg, Russia, July 21-24, 2014.
+               Proceedings},
+  pages     = {513--522},
+  year      = {2014},
+  url       = {https://doi.org/10.1007/978-3-319-08979-9_39},
+  doi       = {10.1007/978-3-319-08979-9_39},
+  timestamp = {Wed, 17 May 2017 14:25:11 +0200},
+  biburl    = {http://dblp.org/rec/bib/conf/mldm/KhasnabishSDS14},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@misc{Heres16,
+	Author = {Daniël Heres},
+	Howpublished = {\url{http://blog.aylien.com/source-code-classification-using-deep-learning/}},
+	Month = {July},
+	Title = {Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post]},
+	Year = {2016}}
+
+@Inbook{Aggarwal12,
+author={Aggarwal, Charu C.
+and Zhai, ChengXiang},
+editor={Aggarwal, Charu C.
+and Zhai, ChengXiang},
+title={A Survey of Text Classification Algorithms},
+bookTitle={Mining Text Data},
+year={2012},
+publisher={Springer US},
+address={Boston, MA},
+pages={163--222},
+abstract={The problem of classification has been widely studied in the data mining, machine learning, database, and information retrieval communities with applications in a number of diverse domains, such as target marketing, medical diagnosis, news group filtering, and document organization. In this paper we will provide a survey of a wide variety of text classification algorithms.},
+isbn={978-1-4614-3223-4},
+doi={10.1007/978-1-4614-3223-4_6},
+url={https://doi.org/10.1007/978-1-4614-3223-4_6}
+}
+
+@article{Chen09,
+title = {Feature selection for text classification with Naïve Bayes},
+journal = {Expert Systems with Applications},
+volume = {36},
+number = {3, Part 1},
+pages = {5432 - 5435},
+year = {2009},
+issn = {0957-4174},
+doi = {https://doi.org/10.1016/j.eswa.2008.06.054},
+url = {http://www.sciencedirect.com/science/article/pii/S0957417408003564},
+author = {Jingnian Chen and Houkuan Huang and Shengfeng Tian and Youli Qu},
+keywords = {Text classification, Feature selection, Text preprocessing, Naïve Bayes}
+}
+
+@misc{MLatB16,
+	Author = {Machine Learning at Berkeley},
+	Howpublished = {\url{https://ml.berkeley.edu/blog/2016/12/03/github/}},
+	Keywords = {data science, research},
+	Month = {December},
+	Title = {Github Programming Language Classification [blog post]},
+	Year = {2016}
+	}
\ No newline at end of file
diff --git a/doc/review-en/review-en.pdf b/doc/review-en/review-en.pdf
new file mode 100644
index 0000000..9a21583
Binary files /dev/null and b/doc/review-en/review-en.pdf differ
diff --git a/doc/review-en/review-en.tex b/doc/review-en/review-en.tex
new file mode 100644
index 0000000..e3591c1
--- /dev/null
+++ b/doc/review-en/review-en.tex
@@ -0,0 +1,301 @@
+\documentclass[10pt]{article}
+\usepackage[english]{babel}
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage{url}
+\usepackage{hyperref}
+\usepackage[style=numeric,backend=bibtex]{biblatex}
+\addbibresource{review-en.bib}
+
+\title{Review of Programming Language Identification Articles and Tools}
+\date{March 1, 2018}
+\author{Yuan Yin}
+\begin{document}
+\maketitle
+\tableofcontents
+
+\section{Introduction}
+In the context of Software Heritage, where the programming language of a given file cannot be identified from its filename extension, we need a practical approach to detect the language directly from the source code, which is normally available as a sequence of bytes.
+
+More precisely, this is the programming language identification problem: determining the language of a given source code fragment.
+
+In this review, we investigate approaches from two types of sources: research articles (section 2) and implemented free software (section 3).
+
+\section{Articles}
+
+\subsection{Software Language Identification with Natural Language Classifiers \cite{vanDam16}}
+
+In this article, the authors investigate whether the programming language identification problem can be solved by adopting existing natural language identification approaches.
+
+\subsubsection{Keywords}
+
+Natural Language Processing (NLP), Multinomial Naïve Bayes, $n$-gram, skip gram, Normalised Compression Distance.
+
+\subsubsection{Multinomial Naïve Bayes (MNB)}
+\paragraph{Description} MNB is a variant of the Naïve Bayes classifiers. Compared with other classifiers, MNB is simpler and well suited to the classification of documents.
+
+MNB uses the normalised frequencies of words or word sequences (generated as $n$-grams) in a byte sequence to decide the most probable programming language. To avoid zero probabilities, a smoothing constant is added to every frequency count.
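+
+Concretely, this decision rule can be written as the standard multinomial Naïve Bayes estimate with additive (Laplace) smoothing; the notation below is our reading of the description, not a formula quoted from the article:
+\[
+  \hat{c} = \arg\max_{c} \Big( \log P(c) + \sum_{w \in d} f_{w,d} \, \log \frac{\mathit{count}(w, c) + \alpha}{\sum_{w'} \mathit{count}(w', c) + \alpha \lvert V \rvert} \Big),
+\]
+where $f_{w,d}$ is the frequency of term $w$ in document $d$, $V$ is the vocabulary and $\alpha > 0$ is the smoothing constant mentioned above.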
+
+\paragraph{Advantages} The method is easy to implement. Once the training set is fixed, features are extracted simply by counting word $n$-grams; for each file, all counts can be computed in a single pass.
+
+\paragraph{Weaknesses} All collected features are local, even when they are collected as $n$-grams, because the features are assumed to be independent. Global contextual information is ignored.
+
+\subsubsection{$N$-gram and skip gram language model}
+\paragraph{Description} For a given $N$, the probability of the last word $w_i$ of the $N$-gram $w_{i-N+1}...w_i$ is estimated as the ratio of the number of occurrences of the full $N$-gram to the number of occurrences of its prefix $w_{i-N+1}...w_{i-1}$, i.e. the $N$-gram without the word $w_i$.
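+
+In symbols, this is the standard maximum-likelihood estimate (again our notation, not the article's):
+\[
+  P(w_i \mid w_{i-N+1} \ldots w_{i-1}) = \frac{\mathit{count}(w_{i-N+1} \ldots w_i)}{\mathit{count}(w_{i-N+1} \ldots w_{i-1})},
+\]
+combined in practice with a smoothing scheme such as the Kneser-Ney discounting evaluated in the article.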
+
+The only difference between the non-skip and skip versions of the $N$-gram model is that the skip version may skip over some of the following words after it considers a word, rather than using strictly contiguous sequences.
+
+\paragraph{Advantages} Compared with MNB, more contextual information is integrated into the model through the chain rule of probability. Rather than treating words as independent, the $N$-gram model captures how each word conditions the following ones. Furthermore, skip grams make the notion of context more elastic, which helps detect more complex structures, such as variable names.
+
+\paragraph{Weaknesses} Although the chain rule is applied, $N$-grams and skip grams, like MNB, still focus on local contextual information.
+
+\subsubsection{Normalised Compression Distance (NCD)}
+
+\paragraph{Description} NCD applies a simple criterion to compute a normalised distance between a compressed piece of source code and the compressed code in the training set. The predicted class is the nearest class in feature space.
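+
+For reference, the standard definition of NCD in terms of a compressor $C$, with $C(x)$ the compressed length of $x$ and $xy$ the concatenation of $x$ and $y$, is:
+\[
+  \mathrm{NCD}(x, y) = \frac{C(xy) - \min(C(x), C(y))}{\max(C(x), C(y))}.
+\]
+The intuition is that if $x$ and $y$ are written in the same language, compressing them together costs little more than compressing the larger of the two alone.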
+
+\paragraph{Advantages} This method depends entirely on the global similarity between two files.
+\paragraph{Weaknesses} This method ignores all local structure, which makes it less suitable for strongly structured content such as programming languages. Its time complexity is also questionable: it is likely to exceed that of the two other methods.
+
+\subsubsection{Experiments}
+
+\paragraph{Dataset}
+\begin{itemize}
+	\item Dataset source: GitHub
+	\item Language classes: 20 languages
+	\item Training set:
+	\begin{itemize}
+		\item Small set: 200 source code files, 10 per language
+		\item Large set: 10 000 source code files, 500 per language
+	\end{itemize}
+	\item Test set: 4 000 source code files, 200 per language
+	\item Availability: Not available
+\end{itemize}
+
+\paragraph{Experimental Results}
+
+The best-performing classifier is the Modified Kneser-Ney Discounting classifier, with an accuracy of 96.9\%.
+
+\subsubsection{Conclusion}
+
+The methods are evaluated by varying their parameters (gram length, probability smoothing technique, training set size, tokenisation method, \emph{etc}.). According to the experimental results, input files are classified more precisely by the $n$-gram methods as the training set grows. This shows that the classical methods of natural language identification are equally applicable to programming languages. Furthermore, all presented methods can be extended incrementally by simply adding classes to the training set.
+
+The time complexity of the listed methods, which matters for a large-scale application, is not considered in this article.
+
+\subsubsection{Related Approaches}
+\begin{itemize}
+	\item J. N. Khasnabish, \emph{et al.}, Detecting Programming Language from Source Code Using Bayesian Learning Techniques \cite{Khasnabish14}
+
+	Naïve Bayes Classifier, Bayesian Networks and Multinomial Naïve Bayes Classifier.
+\end{itemize}
+
+\subsection{Algorithmic Programming Language Identification \cite{Klein11}}
+
+\subsubsection{Keywords}
+
+Heuristics, statistical analysis, parsing expression grammars.
+
+\subsubsection{Approach}
+
+This method combines several small-scale analysis techniques to identify the programming language. The work consists of a completed part and an uncompleted one.
+
+In the first part, many heuristics are defined to model language features; they are mainly derived from prior experience. A preprocessing step removes strings and comments, which could otherwise weaken the classifier, since they commonly contain natural language. For the other statistical features, such as brackets, keywords and operators, the similarity scores between the unidentified file and the trained classes are computed as the reciprocal of a probability distance: the smaller the difference in probability, the higher the score.
+
+In the second part, grammatical features were intended to be integrated into the previous implementation. Rather than matching a complete grammar against the unknown source code, small grammar fragments are used. The idea is to count the number of tokens matched by each fragment and to combine the fragments' results.
+
+\subsubsection{Advantages}
+
+Setting aside the unrealised part, this method combines statistical analysis with hand-crafted features, that is, it identifies the language by extracting global information.
+
+\subsubsection{Weaknesses}
+Despite the effort spent on identifying strings and comments, the hand-crafted features do not seem to capture enough distinguishing information, compared with the Normalised Compression Distance method \cite{vanDam16}. The absence of the grammatical-analysis features further weakens the approach.
+
+\subsubsection{Experiments}
+
+\paragraph{Dataset}
+\begin{itemize}
+	\item Source: GitHub, via a custom web crawler
+	\item Language classes: 25 languages
+	\item Training set: over 41 000 source code files
+	\item Test set: 25 source code files chosen from the former dataset
+	\item Availability: Not available
+\end{itemize}
+
+\paragraph{Experimental Results} Accuracy: 48\%
+
+\subsubsection{Related Approaches}
+
+No related approaches. (This line of work appears to be a dead end.)
+
+\subsection{Source Code Classification Using Neural Networks \cite{Gilda17}}
+
+Interestingly, a similar method is presented in \cite{Aylien16} with the same experimental results. This subsection reviews both articles.
+
+\subsubsection{Keywords}
+
+Convolutional Neural Network (ConvNet), Supervised learning, Feature extraction.
+
+\subsubsection{Approach}
+
+\paragraph{Preprocessing}
+
+\begin{itemize}
+	\item Purification: at the training stage, all source code files are purified by excluding code in non-dominant languages, \emph{e.g.} JavaScript fragments are excluded from HTML files. To avoid code snippets embedded in strings, \emph{e.g.} raw SQL in Java, all string literals are replaced by a special token.
+	\item Tokenisation: the file stream is split into tokens of words, punctuation and other components to ease further processing.
+\end{itemize}
+
+\paragraph{Word Embeddings}
+Tokens in string form carry only syntactic detail, so it is important to transform them into a more semantic form: word embeddings. Tokens are mapped from the dictionary to a high-dimensional numeric space, usually represented as vectors. Word embeddings can be used to determine similarities and relations between words.
+
+\paragraph{Convolutional Neural Network}
+
+The input of the ConvNet is a (number of tokens) $\times$ (embedding vector length) matrix. Convolution filters are used to obtain 256 feature maps in each of 9 convolution layers. Max-pooling, a form of dimensionality reduction, is introduced after every three consecutive convolution layers. The high-level features at the output of the convolution layers serve as input for classification.
+
+Training consists of forward propagation of the training data and backward propagation of its errors.
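+
+To make the described architecture concrete, here is a minimal Keras sketch. The vocabulary size, sequence length, embedding width and kernel size are our own assumptions; the articles only fix the 256 feature maps, the 9 convolution layers and the pooling scheme:
+
+\begin{verbatim}
+import tensorflow as tf
+from tensorflow.keras import layers
+
+VOCAB, SEQ_LEN, EMBED, N_LANG = 20000, 1024, 128, 60  # assumed sizes
+
+model = tf.keras.Sequential(
+    [layers.Embedding(VOCAB, EMBED, input_length=SEQ_LEN)])
+for _ in range(3):          # three blocks of three conv layers each
+    for _ in range(3):
+        model.add(layers.Conv1D(256, 5, padding="same",
+                                activation="relu"))
+    model.add(layers.MaxPooling1D(2))   # pooling after every 3 layers
+model.add(layers.GlobalMaxPooling1D())  # high-level features
+model.add(layers.Dense(N_LANG, activation="softmax"))
+model.compile(optimizer="adam",
+              loss="sparse_categorical_crossentropy")
+\end{verbatim}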
+
+\subsubsection{Advantages} With deep learning techniques, the method can discover high-level features at different scales. A ConvNet of the typical structure used in NLP fits programming language identification equally well.
+
+\subsubsection{Weaknesses} The preprocessing needs to be extended to cope with other disturbing factors, such as comments, indentation, \emph{etc}. When a new language is introduced, the whole model needs to be retrained and replaced.
+
+\subsubsection{Experiments}
+
+\paragraph{Dataset}
+\begin{itemize}
+	\item Dataset source: GitHub, crawled via its API
+	\item Language classes: 60 languages
+	\item Training set: 1 064 918 source code files
+	\item Test set: 118 324 source code files chosen from the former dataset
+	\item Availability: Not available
+\end{itemize}
+
+\paragraph{Experimental Results} Global accuracy, precision and recall are respectively 97\%, 96\% and 96\%.
+
+\subsubsection{Related Approaches}
+\begin{itemize}
+	\item P. Wang, \emph{et al.}, Semantic Clustering and Convolutional Neural Network for Short Text Categorization \cite{Wang15}
+	\item D. Heres, Detecting the Programming Language of Source Code Snippets using Machine Learning and Neural Networks [blog post] \cite{Heres16}
+	\item C. C. Aggarwal, C. Zhai, A Survey of Text Classification Algorithms \cite{Aggarwal12}
+	\item Machine Learning at Berkeley, Github Programming Language Classification [blog post] \cite{MLatB16}
+\end{itemize}
+
+
+\subsection{Machine Learning Based Source Code Classification Using Syntax Oriented Features \cite{Zevin17}}
+
+\subsubsection{Keywords}
+Maximum Entropy Classifier, grammar production
+
+\subsubsection{Approach}
+\paragraph{Preprocessing} Preprocessing consists only of character replacements: for example, uppercase letters are mapped to lowercase, and numbers, newlines and EOF are mapped to special tokens. The tokens obtained after preprocessing are punctuation marks, identifiers and keyword tokens. Words whose frequency exceeds 0.01 are considered keywords; all other words are considered identifiers.
+
+\paragraph{Grammar Production} Grammar productions are a set of $n$-grams selected by a threshold on the Mutual Information (MI) index, which measures how much the presence or absence of a production contributes to the classification.
+
+\paragraph{Classifier Training} The objective is to find an optimal weight for each grammar production, so that the resulting model is the maximum-entropy distribution consistent with the production statistics observed for each class.
+
+\subsubsection{Advantages}
+The maximum entropy classifier makes no assumption about the relationships between features. This removes the feature-independence assumption of Naïve Bayes classifiers, which contradicts the contextual dependencies of programming languages.
+
+\subsubsection{Weaknesses}
+Several empirical parameters and thresholds, such as the 0.05 threshold on the MI index, are introduced, probably leading to instability of the method.
+
+\subsubsection{Experiments}
+\paragraph{Dataset}
+\begin{itemize}
+	\item Language classes: 29 languages
+	\item Training set: 145 476 source code files
+	\item Test set: 147 843 source code files chosen from the former dataset
+\end{itemize}
+
+\paragraph{Experimental Results} Global precision and recall are respectively 99.1\% and 99\%.
+
+\subsubsection{Related Approaches}
+\begin{itemize}
+	\item C. C. Aggarwal, C. Zhai, A Survey of Text Classification Algorithms \cite{Aggarwal12}
+\end{itemize}
+
+\subsection{What's the Code?: Automatic Classification of Source Code Archives \cite{Ugurel02}}
+
+\subsubsection{Keywords}
+Expected Entropy Loss, Support Vector Machine (SVM)
+
+\subsubsection{Approach}
+\paragraph{Feature Extraction} A valid token is defined as an alphabetical sequence of characters delimited by non-alphabetical characters. For each language, a set of top features is selected according to its Expected Entropy Loss.
+
+\paragraph{Vectorisation} The presence of features is encoded into a 0/1 vector, where 1 means that the corresponding feature is present in the file.
+
+\paragraph{Classification} Since an SVM separates samples into two classes, ${n \choose 2} = n(n-1) / 2$ pairwise classifiers are constructed for $n$ classes.
+
+\subsubsection{Advantages} A purely statistical model for distinguishing languages. Easy to implement.
+
+\subsubsection{Weaknesses} The classifier is not ideal for an $n$-class problem: when a new language has to be integrated into an $n$-language classifier, $n$ new pairwise classifiers have to be trained.
+
+\subsubsection{Experiments}
+
+\paragraph{Dataset}
+\begin{itemize}
+	\item Language classes: 10 languages
+	\item Training set: 1000 source code files, 100 per language
+	\item Test set: 300 source code files, 30 per language
+\end{itemize}
+
+\paragraph{Experimental Results} Global accuracy with comments: 89.04\%; comment free: 87.41\%.
+
+\subsubsection{Related Approaches}
+\begin{itemize}
+	\item S. Zevin and C. Holzem, Machine Learning Based Source Code Classification Using Syntax Oriented Features \cite{Zevin17}
+	\item Machine Learning at Berkeley, Github Programming Language Classification [blog post] \cite{MLatB16}
+\end{itemize}
+
+
+\section{Tools}
+\subsection{Ohcount \cite{ohcount}}
+Ohcount is an open-source library for counting lines of source code and categorising them by language.
+
+\subsubsection{Approach}
+The language of source code is detected first by hashing the filename extension, then by analysing the filename.
+
+The interesting part is the step that follows. When the language can be determined neither from the extension nor from the filename, the library checks whether the source code contains an \texttt{emacs} mode configuration; it then uses \texttt{libmagic}, a library for identifying files on UNIX systems, to detect the file language. \texttt{libmagic} returns a string with a textual description of the source code, such as ``\texttt{XML 1.0 document, ASCII text}''.
+
+Inside the \texttt{libmagic} library, we found that language detection happens at the buffer level. When a file is opened, a \texttt{match()} function goes through a list of language features defining heuristics for classifying a file as a certain type.
+
+In this list of features, regular expressions are widely used to match specific keywords or salient pieces of code, \emph{e.g.} \verb|\^#include, \^#[[:space:]]*pragma| for C,
+\verb|\^import.*;$| for Java, etc.
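+
+As an illustration of this style of detection, here is a small Python sketch in the same spirit; the rule table is a toy example built from the two patterns quoted above, not Ohcount's actual list:
+
+\begin{verbatim}
+import re
+
+# Toy heuristic table in the spirit of the Ohcount/libmagic rules.
+RULES = [
+    ("C",    re.compile(r"^#include|^#\s*pragma", re.M)),
+    ("Java", re.compile(r"^import .*;$", re.M)),
+]
+
+def guess_language(source):
+    # Return the first language whose pattern matches the buffer.
+    for lang, pattern in RULES:
+        if pattern.search(source):
+            return lang
+    return "unknown"
+
+print(guess_language("#include <stdio.h>\nint main(void) {}"))
+\end{verbatim}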
+
+\subsubsection{Advantages}
+
+Building a list of significant language features out of regular expressions is clever. This heuristic method is most efficient when matching languages with explicit, easily recognisable features.
+
+\subsubsection{Weakness}
+
+The source of the matching list is unknown for the moment. From our observation, we suspect that the list is based on empirical programming experience. The methodology is not as effective when a language has few unique keywords or feature words, as is the case for OCaml.
+
+\subsection{Linguist \cite{linguist}}
+
+\subsubsection{Approach}
+
+Apart from judging by the filename and its extension, Linguist applies a Naïve Bayes classifier to decide the programming language of a source code file. The classification is based on tokens.
+
+\subsubsection{Related Approaches}
+\begin{itemize}
+	\item J. K. van Dam and V. Zaytsev, Software Language Identification with Natural Language Classifiers \cite{vanDam16}
+	\item Feature selection for text classification with Naïve Bayes \cite{Chen09}
+\end{itemize}
+
+\subsection{Guesslang \cite{guesslang}}
+
+\subsubsection{Approach}
+\paragraph{Preprocessing} File streams are tokenised, and numbers and strings are replaced by special tokens. Bigrams and trigrams are computed as well. Tokens and $n$-grams are then hashed and put into a counter vector.
+
+\paragraph{Classifier} A Wide \& Deep Learning classifier, combining a linear model classifier with a feed-forward neural network. The model is implemented in TensorFlow, a deep learning library developed by Google.
+
+\subsection{cloc \cite{cloc}}
+No language detection based on source code.
+
+\subsection{SLOCCount \cite{sloccount}}
+Uses heuristics to decide the language of source code.
+
+\subsection{Universal Ctags \cite{universal-ctags}}
+No language detection based on source code.
+
+%\bibliographystyle{plain}
+%\bibliography{trebib}
+\printbibliography
+
+\end{document}