<?xml version="1.0" encoding="ISO-8859-1"?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id>1870-9044</journal-id>
<journal-title><![CDATA[Polibits]]></journal-title>
<abbrev-journal-title><![CDATA[Polibits]]></abbrev-journal-title>
<issn>1870-9044</issn>
<publisher>
<publisher-name><![CDATA[Instituto Politécnico Nacional, Centro de Innovación y Desarrollo Tecnológico en Cómputo]]></publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id>S1870-90442016000100049</article-id>
<article-id pub-id-type="doi">10.17562/PB-53-5</article-id>
<title-group>
<article-title xml:lang="en"><![CDATA[Improving Corpus Annotation Quality Using Word Embedding Models]]></article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname><![CDATA[Novák]]></surname>
<given-names><![CDATA[Attila]]></given-names>
</name>
<xref ref-type="aff" rid="Af1"/>
</contrib>
</contrib-group>
<aff id="Af1">
<institution><![CDATA[Pázmány Péter Catholic University, Faculty of Information Technology and Bionics]]></institution>
<addr-line><![CDATA[Budapest ]]></addr-line>
<country>Hungary</country>
</aff>
<pub-date pub-type="pub">
<day>00</day>
<month>06</month>
<year>2016</year>
</pub-date>
<pub-date pub-type="epub">
<day>00</day>
<month>06</month>
<year>2016</year>
</pub-date>
<numero>53</numero>
<fpage>49</fpage>
<lpage>53</lpage>
<copyright-statement/>
<copyright-year/>
<self-uri xlink:href="http://www.scielo.org.mx/scielo.php?script=sci_arttext&amp;pid=S1870-90442016000100049&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://www.scielo.org.mx/scielo.php?script=sci_abstract&amp;pid=S1870-90442016000100049&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://www.scielo.org.mx/scielo.php?script=sci_pdf&amp;pid=S1870-90442016000100049&amp;lng=en&amp;nrm=iso"></self-uri><abstract abstract-type="short" xml:lang="en"><p><![CDATA[Web-crawled corpora contain a significant amount of noise. Automatic corpus annotation tools introduce even more noise performing erroneous language identification or encoding detection, introducing tokenization and lemmatization errors and adding erroneous tags or analyses to the original words. Our goal with the methods presented in this article was to use word embedding models to reveal such errors and to provide correction procedures. The evaluation focuses on analyzing and validating noun compounds identifying bogus compound analyses, recognizing and concatenating fragmented words, detecting erroneously encoded text, restoring accents and handling the combination of these errors in a Hungarian web-crawled corpus.]]></p></abstract>
<kwd-group>
<kwd lng="en"><![CDATA[Corpus linguistics]]></kwd>
<kwd lng="en"><![CDATA[lexical resources]]></kwd>
<kwd lng="en"><![CDATA[corpus annotation]]></kwd>
<kwd lng="en"><![CDATA[word embeddings]]></kwd>
</kwd-group>
</article-meta>
</front><back>
<ref-list>
<ref id="B1">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Rangarajan Sridhar]]></surname>
<given-names><![CDATA[V. K.]]></given-names>
</name>
</person-group>
<source><![CDATA[Unsupervised text normalization using distributed representations of words and phrases]]></source>
<year>2015</year>
<month>06</month>
<conf-name><![CDATA[ 1st Workshop on Vector Space Modeling for Natural Language Processing]]></conf-name>
<conf-date>June 2015</conf-date>
<conf-loc>Denver, Colorado </conf-loc>
<page-range>8-16</page-range></nlm-citation>
</ref>
<ref id="B2">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Li]]></surname>
<given-names><![CDATA[C.]]></given-names>
</name>
<name>
<surname><![CDATA[Liu]]></surname>
<given-names><![CDATA[Y.]]></given-names>
</name>
</person-group>
<source><![CDATA[Improving text normalization via unsupervised model and discriminative reranking]]></source>
<year>2014</year>
<month>06</month>
<conf-name><![CDATA[ ACL 2014 Student Research Workshop]]></conf-name>
<conf-date>June 2014</conf-date>
<conf-loc>Baltimore, Maryland, USA </conf-loc>
<page-range>86-93</page-range></nlm-citation>
</ref>
<ref id="B3">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Tan]]></surname>
<given-names><![CDATA[L.]]></given-names>
</name>
<name>
<surname><![CDATA[Zhang]]></surname>
<given-names><![CDATA[H.]]></given-names>
</name>
<name>
<surname><![CDATA[Clarke]]></surname>
<given-names><![CDATA[C.]]></given-names>
</name>
<name>
<surname><![CDATA[Smucker]]></surname>
<given-names><![CDATA[M.]]></given-names>
</name>
</person-group>
<source><![CDATA[Lexical comparison between Wikipedia and Twitter corpora by using word embeddings]]></source>
<year>2015</year>
<month>07</month>
<conf-name><![CDATA[ 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing]]></conf-name>
<conf-date>July 2015</conf-date>
<conf-loc>Beijing, China </conf-loc>
<page-range>657-61</page-range></nlm-citation>
</ref>
<ref id="B4">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Dima]]></surname>
<given-names><![CDATA[C.]]></given-names>
</name>
<name>
<surname><![CDATA[Hinrichs]]></surname>
<given-names><![CDATA[E.]]></given-names>
</name>
</person-group>
<source><![CDATA[Automatic noun compound interpretation using deep neural networks and word embeddings]]></source>
<year>2015</year>
<month>04</month>
<conf-name><![CDATA[ 11th International Conference on Computational Semantics]]></conf-name>
<conf-date>April 2015</conf-date>
<conf-loc>London, UK </conf-loc>
<page-range>173-83</page-range></nlm-citation>
</ref>
<ref id="B5">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Salehi]]></surname>
<given-names><![CDATA[B.]]></given-names>
</name>
<name>
<surname><![CDATA[Cook]]></surname>
<given-names><![CDATA[P.]]></given-names>
</name>
<name>
<surname><![CDATA[Baldwin]]></surname>
<given-names><![CDATA[T.]]></given-names>
</name>
</person-group>
<source><![CDATA[A word embedding approach to predicting the compositionality of multiword expressions]]></source>
<year>2015</year>
<conf-name><![CDATA[ 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies]]></conf-name>
<conf-loc>Denver, Colorado </conf-loc>
<page-range>977-83</page-range></nlm-citation>
</ref>
<ref id="B6">
<nlm-citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Mikolov]]></surname>
<given-names><![CDATA[T.]]></given-names>
</name>
<name>
<surname><![CDATA[Sutskever]]></surname>
<given-names><![CDATA[I.]]></given-names>
</name>
<name>
<surname><![CDATA[Chen]]></surname>
<given-names><![CDATA[K.]]></given-names>
</name>
<name>
<surname><![CDATA[Corrado]]></surname>
<given-names><![CDATA[G. S.]]></given-names>
</name>
<name>
<surname><![CDATA[Dean]]></surname>
<given-names><![CDATA[J.]]></given-names>
</name>
</person-group>
<article-title xml:lang=""><![CDATA[Distributed representations of words and phrases and their compositionality]]></article-title>
<person-group person-group-type="editor">
<name>
<surname><![CDATA[Burges]]></surname>
<given-names><![CDATA[C. J. C.]]></given-names>
</name>
<name>
<surname><![CDATA[Bottou]]></surname>
<given-names><![CDATA[L.]]></given-names>
</name>
<name>
<surname><![CDATA[Welling]]></surname>
<given-names><![CDATA[M.]]></given-names>
</name>
<name>
<surname><![CDATA[Ghahramani]]></surname>
<given-names><![CDATA[Z.]]></given-names>
</name>
<name>
<surname><![CDATA[Weinberger]]></surname>
<given-names><![CDATA[K. Q.]]></given-names>
</name>
</person-group>
<source><![CDATA[Advances in Neural Information Processing Systems]]></source>
<year>2013</year>
<volume>26</volume>
<page-range>3111-9</page-range><publisher-name><![CDATA[Curran Associates, Inc.]]></publisher-name>
</nlm-citation>
</ref>
<ref id="B7">
<nlm-citation citation-type="journal">
<article-title xml:lang=""><![CDATA[Efficient estimation of word representations in vector space]]></article-title>
<person-group person-group-type="author">
<name>
<surname><![CDATA[Mikolov]]></surname>
<given-names><![CDATA[T.]]></given-names>
</name>
<name>
<surname><![CDATA[Chen]]></surname>
<given-names><![CDATA[K.]]></given-names>
</name>
<name>
<surname><![CDATA[Corrado]]></surname>
<given-names><![CDATA[G.]]></given-names>
</name>
<name>
<surname><![CDATA[Dean]]></surname>
<given-names><![CDATA[J.]]></given-names>
</name>
</person-group>
<source><![CDATA[CoRR]]></source>
<year>2013</year>
<volume>abs/1301.3781</volume>
</nlm-citation>
</ref>
<ref id="B8">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Orosz]]></surname>
<given-names><![CDATA[G.]]></given-names>
</name>
<name>
<surname><![CDATA[Novak]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
</person-group>
<source><![CDATA[PurePos 2.0: A hybrid tool for morphological disambiguation]]></source>
<year>2013</year>
<conf-name><![CDATA[ International Conference on Recent Advances in Natural Language Processing (RANLP 2013)]]></conf-name>
<conf-loc>Hissar, Bulgaria </conf-loc>
<page-range>539-45</page-range><publisher-loc><![CDATA[Shoumen, BULGARIA ]]></publisher-loc>
<publisher-name><![CDATA[INCOMA Ltd.]]></publisher-name>
</nlm-citation>
</ref>
<ref id="B9">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Novak]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
</person-group>
<source><![CDATA[Milyen a jo humor? [What is good humor like?]]]></source>
<year>2003</year>
<conf-name><![CDATA[ First Hungarian conference on computational linguistics]]></conf-name>
<conf-date>2003</conf-date>
<conf-loc>Szeged </conf-loc>
<page-range>138-44</page-range></nlm-citation>
</ref>
<ref id="B10">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Proszeky]]></surname>
<given-names><![CDATA[G.]]></given-names>
</name>
<name>
<surname><![CDATA[Kis]]></surname>
<given-names><![CDATA[B.]]></given-names>
</name>
</person-group>
<source><![CDATA[A unification-based approach to morpho-syntactic parsing of agglutinative and other (highly) inflectional languages]]></source>
<year>1999</year>
<conf-name><![CDATA[ 37th annual meeting of the Association for Computational Linguistics on Computational Linguistics, ser. ACL'99]]></conf-name>
<conf-date>1999</conf-date>
<conf-loc>Stroudsburg, PA, USA </conf-loc>
<page-range>261-8</page-range></nlm-citation>
</ref>
<ref id="B11">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Novak]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
<name>
<surname><![CDATA[Calzolari]]></surname>
<given-names><![CDATA[N.]]></given-names>
</name>
<name>
<surname><![CDATA[Choukri]]></surname>
<given-names><![CDATA[K.]]></given-names>
</name>
<name>
<surname><![CDATA[Declerck]]></surname>
<given-names><![CDATA[T.]]></given-names>
</name>
<name>
<surname><![CDATA[Loftsson]]></surname>
<given-names><![CDATA[H.]]></given-names>
</name>
<name>
<surname><![CDATA[Maegaard]]></surname>
<given-names><![CDATA[B.]]></given-names>
</name>
<name>
<surname><![CDATA[Mariani]]></surname>
<given-names><![CDATA[J.]]></given-names>
</name>
<name>
<surname><![CDATA[Moreno]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
<name>
<surname><![CDATA[Odijk]]></surname>
<given-names><![CDATA[J.]]></given-names>
</name>
<name>
<surname><![CDATA[Piperidis]]></surname>
<given-names><![CDATA[S.]]></given-names>
</name>
</person-group>
<source><![CDATA[A new form of humor - mapping constraint-based computational morphologies to a finite-state representation]]></source>
<year>2014</year>
<conf-name><![CDATA[ Ninth International Conference on Language Resources and Evaluation (LREC'14)]]></conf-name>
<conf-loc> </conf-loc>
<page-range>1068-73</page-range><publisher-loc><![CDATA[Reykjavik, Iceland ]]></publisher-loc>
<publisher-name><![CDATA[European Language Resources Association (ELRA)]]></publisher-name>
</nlm-citation>
</ref>
<ref id="B12">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Siklosi]]></surname>
<given-names><![CDATA[B.]]></given-names>
</name>
<name>
<surname><![CDATA[Gelbukh]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
</person-group>
<source><![CDATA[Using embedding models for lexical categorization in morphologically rich languages]]></source>
<year>2016</year>
<conf-name><![CDATA[ Computational Linguistics and Intelligent Text Processing: 17th International Conference, CICLing 2016]]></conf-name>
<conf-date>April 3-9, 2016</conf-date>
<conf-loc>Konya, Turkey </conf-loc>
<publisher-loc><![CDATA[Cham ]]></publisher-loc>
<publisher-name><![CDATA[Springer International Publishing]]></publisher-name>
</nlm-citation>
</ref>
<ref id="B13">
<nlm-citation citation-type="journal">
<article-title xml:lang=""><![CDATA[The textcat package for n-gram based text categorization in R]]></article-title>
<person-group person-group-type="author">
<name>
<surname><![CDATA[Hornik]]></surname>
<given-names><![CDATA[K.]]></given-names>
</name>
<name>
<surname><![CDATA[Mair]]></surname>
<given-names><![CDATA[P.]]></given-names>
</name>
<name>
<surname><![CDATA[Rauch]]></surname>
<given-names><![CDATA[J.]]></given-names>
</name>
<name>
<surname><![CDATA[Geiger]]></surname>
<given-names><![CDATA[W.]]></given-names>
</name>
<name>
<surname><![CDATA[Buchta]]></surname>
<given-names><![CDATA[C.]]></given-names>
</name>
<name>
<surname><![CDATA[Feinerer]]></surname>
<given-names><![CDATA[I.]]></given-names>
</name>
</person-group>
<source><![CDATA[Journal of Statistical Software]]></source>
<year>2013</year>
<volume>52</volume>
<numero>6</numero>
<issue>6</issue>
<page-range>1-17</page-range></nlm-citation>
</ref>
<ref id="B14">
<nlm-citation citation-type="confpro">
<person-group person-group-type="author">
<name>
<surname><![CDATA[Novak]]></surname>
<given-names><![CDATA[A.]]></given-names>
</name>
<name>
<surname><![CDATA[Siklosi]]></surname>
<given-names><![CDATA[B.]]></given-names>
</name>
</person-group>
<source><![CDATA[Automatic diacritics restoration for Hungarian]]></source>
<year>2015</year>
<month>09</month>
<conf-name><![CDATA[ 2015 Conference on Empirical Methods in Natural Language Processing]]></conf-name>
<conf-date>September 2015</conf-date>
<conf-loc>Lisbon, Portugal </conf-loc>
<page-range>2286-91</page-range></nlm-citation>
</ref>
</ref-list>
</back>
</article>
