├── .gitignore ├── BUILD_INDEX.md ├── LICENSE ├── Procfile ├── README.md ├── config ├── doc-English.txt ├── doc-EnglishTa.txt ├── doc-Farsi.txt ├── doc-Hebrew.txt ├── doc-Hindi.txt ├── doc-Russian.txt └── sent-Hindi.txt ├── data ├── column │ ├── fas │ │ ├── 1.txt │ │ ├── 2.txt │ │ └── 3.txt │ └── rus │ │ ├── 1.txt │ │ ├── 2.txt │ │ └── 3.txt ├── conll │ └── eng │ │ ├── 1.txt.conll │ │ ├── 2.txt.conll │ │ ├── 3.txt.conll │ │ ├── 4.txt.conll │ │ ├── coutinho.conll │ │ └── eng.conll ├── getindian.py ├── mturk.csv ├── ta │ └── eng │ │ ├── ta-0 │ │ ├── ta-1 │ │ ├── ta-10 │ │ ├── ta-11 │ │ ├── ta-12 │ │ ├── ta-13 │ │ ├── ta-14 │ │ ├── ta-15 │ │ ├── ta-16 │ │ ├── ta-17 │ │ ├── ta-18 │ │ ├── ta-19 │ │ ├── ta-2 │ │ ├── ta-20 │ │ ├── ta-21 │ │ ├── ta-22 │ │ ├── ta-23 │ │ ├── ta-24 │ │ ├── ta-3 │ │ ├── ta-4 │ │ ├── ta-5 │ │ ├── ta-6 │ │ ├── ta-7 │ │ ├── ta-8 │ │ └── ta-9 ├── tajson │ └── heb │ │ ├── 150 │ │ ├── 152 │ │ ├── 153 │ │ ├── 154 │ │ ├── 155 │ │ ├── 156 │ │ ├── 157 │ │ ├── 158 │ │ ├── 159 │ │ ├── 160 │ │ ├── 161 │ │ ├── 162 │ │ ├── 163 │ │ ├── 164 │ │ ├── 165 │ │ ├── 166 │ │ ├── 168 │ │ ├── 169 │ │ └── 170 ├── txt │ ├── eng │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ └── 4.txt │ ├── fas │ │ ├── 1.txt │ │ ├── 2.txt │ │ └── 3.txt │ └── rus │ │ ├── 1.txt │ │ ├── 2.txt │ │ └── 3.txt ├── txt2column.py └── txt2tajson.py ├── docs └── index.html ├── pom.xml ├── scripts ├── buildindex.sh ├── install-cli.sh ├── preparedata.py └── run.sh └── src └── main ├── java └── io │ └── github │ └── mayhewsw │ ├── Application.java │ ├── ConfigFile.java │ ├── Dictionary.java │ ├── FeatureExtractor.java │ ├── Group.java │ ├── KeyComparator.java │ ├── SessionData.java │ ├── SessionInterceptor.java │ ├── Suggestion.java │ ├── TextFileIndexer.java │ ├── User.java │ ├── WebSecurityConfig.java │ ├── classifier │ ├── CandParser.java │ └── Candidate.java │ ├── controllers │ ├── Common.java │ ├── DictionaryController.java │ ├── DocumentController.java │ ├── LoginController.java │ ├── 
SentenceController.java │ └── TextStatisticsController.java │ └── utils │ ├── ColumnReader.java │ ├── FinalSaver.java │ ├── HtmlGenerator.java │ ├── IO.java │ ├── Propagator.java │ ├── Sandbox.java │ ├── SentenceCache.java │ ├── TalenCLI.java │ └── Utils.java ├── lbjava └── CandClassifier.lbj └── resources ├── application.properties ├── log4j.properties ├── static ├── css │ └── style.css ├── img │ ├── disk.png │ ├── favicon.ico │ ├── favicon.xcf │ ├── loading.gif │ ├── logo-black-trans.png │ ├── logo-black-white.png │ ├── logo-grey-trans.png │ ├── logo-white-black.png │ ├── logo-white-trans.png │ ├── logo.xcf │ ├── screenshot.png │ └── selection.png └── js │ ├── annotate-local.js │ └── annotate.js └── templates ├── base.html ├── dict.html ├── document ├── annotation.html ├── doc-base.html ├── getstarted.html ├── home.html └── old.html ├── fragments └── nav.html ├── index.html ├── instructions.html ├── layout.html ├── login.html ├── mturk └── mturkTemplate.html ├── sentence ├── annotation.html ├── getstarted.html ├── group-anno-NOTUSED.html ├── home.html └── sent-base.html └── unified-annotation.html /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | users.txt 4 | labels.css 5 | *~ 6 | target/ 7 | *.iml 8 | config/folders.txt 9 | .idea/ 10 | annotation-cache/ 11 | tas/ 12 | config/myfolders.txt 13 | log-spring.out 14 | 15 | # automatically generated annotation folders 16 | *-annotation-* 17 | *-sentanno-* 18 | 19 | *DS_Store 20 | 21 | # Mobile Tools for Java (J2ME) 22 | .mtj.tmp/ 23 | 24 | # Package Files # 25 | *.jar 26 | *.war 27 | *.ear 28 | 29 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 30 | hs_err_pid* 31 | 32 | data/train* 33 | wishlist.txt 34 | *lbjava* -------------------------------------------------------------------------------- /BUILD_INDEX.md: -------------------------------------------------------------------------------- 1 | # How to 
build an index 2 | 3 | The sentences branch uses a Lucene index to retrieve sentences quickly. This document will show you how to 4 | build this index. 5 | 6 | To get started, you need a folder that contains CoNLL column NER format files. These are the files that you 7 | want to annotate. 8 | 9 | Open up TextFileIndexer.java. 10 | 11 | Run it! You are good to go. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Research and Academic Use License 2 | Cognitive Computation Group 3 | University of Illinois at Urbana-Champaign 4 | 5 | Downloading software implies that you accept the following license terms: 6 | 7 | Under this Agreement, The Board of Trustees of the University of Illinois ("University"), a body corporate and politic of the State of Illinois with its principal offices at 506 South Wright Street, Urbana, Illinois 61801, U.S.A., on behalf of its Department of Computer Science on the Urbana-Champaign Campus, provides the software ("Software") described in Appendix A, attached hereto and incorporated herein, to the Licensee identified below ("Licensee") subject to the following conditions: 8 | 9 | 1. Upon execution of this Agreement by Licensee below, the University grants, and Licensee accepts, a royalty-free, non-exclusive license: 10 | A. To use unlimited copies of the Software for its own academic and research purposes. 11 | B. To make derivative works. However, if Licensee distributes any derivative work based on or derived from the Software (with such distribution limited to binary form only), then Licensee will (1) notify the University (c/o Professor Dan Roth, e-mail: danr@cs.uiuc.edu) regarding its distribution of the derivative work and provide a copy if requested, and (2) clearly notify users that such derivative work is a modified version and not the original Software distributed by the University. 12 | C. 
To redistribute (sublicense) derivative works based on the Software in binary form only to third parties provided that (1) the copyright notice and any accompanying legends or proprietary notices are reproduced on all copies, (2) no royalty is charged for such copies, and (3) third parties are restricted to using the derivative work for academic and research purposes only, without further sublicensing rights. 13 | No license is granted herein that would permit Licensee to incorporate the Software into a commercial product, or to otherwise commercially exploit the Software. Should Licensee wish to make commercial use of the Software, Licensee should contact the University, c/o the Office of Technology Management ("OTM") to negotiate an appropriate license for such commercial use. To contact the OTM: otmmailaccount@ad.uiuc.edu; telephone: (217)333-3781; fax: (217) 265-5530. 14 | 2. THE UNIVERSITY GIVES NO WARRANTIES, EITHER EXPRESSED OR IMPLIED, FOR THE SOFTWARE AND/OR ASSOCIATED MATERIALS PROVIDED UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, WARRANTY OF MERCHANTABILITY AND WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE, AND ANY WARRANTY AGAINST INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS. 15 | 3. Licensee understands the Software is a research tool for which no warranties as to capabilities or accuracy are made, and Licensee accepts the Software on an "as is, with all defects" basis, without maintenance, debugging , support or improvement. Licensee assumes the entire risk as to the results and performance of the Software and/or associated materials. Licensee agrees that University shall not be held liable for any direct, indirect, consequential, or incidental damages with respect to any claim by Licensee or any third party on account of or arising from this Agreement or use of the Software and/or associated materials. 16 | 4. Licensee understands the Software is proprietary to the University. 
Licensee will take all reasonable steps to insure that the source code is protected and secured from unauthorized disclosure, use, or release and will treat it with at least the same level of care as Licensee would use to protect and secure its own proprietary computer programs and/or information, but using no less than reasonable care. 17 | 5. In the event that Licensee shall be in default in the performance of any material obligations under this Agreement, and if the default has not been remedied within sixty (60) days after the date of notice in writing of such default, University may terminate this Agreement by written notice. In the event of termination, Licensee shall promptly return to University the original and any copies of licensed Software in Licensee's possession. In the event of any termination of this Agreement, any and all sublicenses granted by Licensee to third parties pursuant to this Agreement (as permitted by this Agreement) prior to the date of such termination shall nevertheless remain in full force and effect. 18 | 6. The Software was developed, in part, with support from the National Science Foundation, and the Federal Government has certain license rights in the Software. 19 | 7. This Agreement shall be construed and interpreted in accordance with the laws of the State of Illinois, U.S.A.. 20 | 8. This Agreement shall be subject to all United States Government laws and regulations now and hereafter applicable to the subject matter of this Agreement, including specifically the Export Law provisions of the Departments of Commerce and State. Licensee will not export or re-export the Software without the appropriate United States or foreign government license. 21 | By its registration below, Licensee confirms that it understands the terms and conditions of this Agreement, and agrees to be bound by them. This Agreement shall become effective as of the date of execution by Licensee. 
22 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: java $JAVA_OPTS -Dserver.port=$PORT -jar target/ner-annotation-0.1.0.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | # TALEN: Tool for Annotation of Low-resource ENtities 7 | 8 | A lightweight web-based tool for annotating word sequences. 9 | 10 | ![Screenshot of web interface](/src/main/resources/static/img/selection.png?raw=true "Screenshot") 11 | 12 | 13 | 14 | ## Installation 15 | 16 | Requires Java 8 and Maven. Run: 17 | 18 | $ ./scripts/run.sh 19 | 20 | This will start the server on port 8009. Point a browser to [localhost:8009](http://localhost:8009). The port number is specified in [`application.properties`](./src/main/resources/application.properties). 21 | 22 | This reads from [`config/users.txt`](config/users.txt), which has a username and password pair on each line. You will 23 | log in using one of those pairs, and then that username is tied to your activities in that session. All annotations 24 | that you do will be written to a path called `<folderpath>-annotation-<username>`, where `<folderpath>` is the original path 25 | specified in the config file, and `<username>` is what you chose as username. 26 | 27 | Suppose you do some annotations, then leave the session, and come back again. If you log in with the same 28 | username as the previous session, it will reload all of the annotations right where you left off, so no 29 | work is lost. 30 | 31 | ## Usage 32 | 33 | You make annotations by clicking on words and selecting a label. If you want to remove a label, right click on a word. 34 | 35 | To annotate a phrase, highlight the phrase, ending with the mouse in the middle of the last word. The standard box will 36 | show up, and you can select the correct label. 
To dismiss the annotation box, click on the word it points to. 37 | 38 | A document is saved by pressing the Save button. If you navigate away using 39 | the links on the top of the page, the document is not saved. 40 | 41 | ## Configuration 42 | 43 | There are two kinds of config files, corresponding to the two annotation methods 44 | (see below). The document-based method looks for config files that start with 'doc-' 45 | and the sentence-based method looks for config files that start with 'sent-'. 46 | 47 | See the [example config files](config/) for the minimally required set of options. 48 | 49 | ## Annotation Methods 50 | 51 | There are two main annotation methods supported: document-based, and sentence-based. 52 | 53 | ### Document-based 54 | The document-based method is a common paradigm. You point the software to a folder of documents 55 | and each is displayed in turn, and you annotate them. 56 | 57 | ### Sentence-based 58 | The sentence-based method is intended to allow a rapid annotation process. First, you need to 59 | build an index using `TextFileIndexer.java`, then you supply some seed names 60 | in the config file. The system searches for these seed names in the index, and returns 61 | a small number of sentences containing them. The annotator is encouraged to annotate 62 | these correctly, and also annotate any other names which may appear. These new names then 63 | join the list of seed names, and annotation continues. 64 | 65 | For example, if the seed name is 'Pete Sampras', then we might hope that 'Andre Agassi' 66 | will show up in the same sentence. If the annotator chooses to annotate 67 | 'Andre Agassi' also, then the system will retrieve new sentences containing 'Andre Agassi'. 68 | Presumably these sentences will contain entities such as 'Wimbledon' and 'New York City'. In principle, 69 | this will continue until some cap on the number of entities has been reached. 
70 | 71 | #### Using the sentence-based method 72 | 73 | First, you need to download a corpus. We have used Hindi for this. Run: 74 | 75 | ```bash 76 | $ sudo pip install -U nltk  # only if you don't already have nltk 77 | $ python -m nltk.downloader indian 78 | ``` 79 | 80 | Now convert this: 81 | ```bash 82 | $ cd data 83 | $ python getindian.py 84 | $ cd .. 85 | ``` 86 | 87 | You'll notice that this created files in `data/txt/hindi` and in `data/tajson/hindi`. Now build the index: 88 | ```bash 89 | $ mvn dependency:copy-dependencies 90 | $ ./scripts/buildindex.sh data/tajson/hindi/ data/index_hindi 91 | ``` 92 | 93 | That's it! There is already a config file called `config/sent-Hindi.txt` that should get you started. 94 | 95 | 96 | ## Non-speaker Helps 97 | One major focus of the software is to allow non-speakers of a language to 98 | annotate text. Some features are: inline dictionary replacement, morphological 99 | awareness and coloring, entity propagation, entity suggestions, hints based on frequency and 100 | mutual information. 101 | 102 | ### How to build an index 103 | Use [`buildindex.sh`](scripts/buildindex.sh) to build a local index for the sentence-based mode. The `indexdir` variable 104 | will be put in the sentence-based config file. The script, in turn, calls `TextFileIndexer.java`. 105 | 106 | ## Command line tool 107 | We also ship a lightweight command line tool for TALEN. This tool will read a folder of JSON TextAnnotations (more formats coming soon) 108 | and spin up a Java-only server, serving static HTML versions of each document. This will be used only for examination and exploration. 109 | 110 | Install it as follows: 111 | ```bash 112 | $ ./scripts/install-cli.sh 113 | $ export PATH=$PATH:$HOME/software/talen/ 114 | ``` 115 | 116 | (You can change the `INSTALLDIR` in `install-cli.sh` if you want it installed somewhere else). 
Now that it is installed, you can run it 117 | from any folder in your terminal: 118 | 119 | ```bash 120 | $ talen-cli FolderOfTAFiles 121 | ``` 122 | 123 | This will serve static HTML documents at `localhost:PORT` (default `PORT` is 8008). You can run with additional options: 124 | 125 | ```bash 126 | $ talen-cli FolderOfTAFiles -roman -port 8888 127 | ``` 128 | 129 | Where the `-roman` option uses the `ROMANIZATION` view in the TextAnnotation for text (if available), and the `-port` option 130 | uses the specified port. 131 | 132 | 133 | ## Mechanical Turk 134 | Although the main function of this software is a server based system, there is also a lightweight version that runs 135 | entirely in Javascript, for the express purpose of creating Mechanical Turk jobs. 136 | 137 | The important files are [mturkTemplate.html](src/main/resources/templates/mturk/mturkTemplate.html) and [annotate-local.js](src/main/resources/static/js/annotate-local.js). The 138 | latter is a version of [annotate.js](src/main/resources/static/js/annotate.js), but the code to handle adding and 139 | removing spans is included in the Javascript instead of being sent to a Java controller. This is less powerful (because we have 140 | NLP libraries written in Java, not Javascript), but can be run with no server. 141 | 142 | 143 | All the scripts needed to create `docs/index.html` are included in this repository. It was created as follows: 144 | 145 | ```bash 146 | $ python scripts/preparedata.py preparedata data/txt tmp.csv 147 | $ python scripts/preparedata.py testfile tmp.csv docs/index.html 148 | ``` 149 | 150 | [mturkTemplate.html](src/main/resources/templates/mturk/mturkTemplate.html) has a lot of extra stuff (instructions, annotator test, etc) which 151 | can all be removed if desired. I found it was useful for mturk tasks. When you create the mturk task, there will be a 152 | submit button, and the answer will be put into the `#finalsubmission` field. 
The output string is a Javascript list of token spans along with 153 | their labels. 154 | 155 | 156 | ## Citation 157 | 158 | If you use this in your research paper, please cite us! 159 | 160 | ``` 161 | @inproceedings{talen2018, 162 | author = {Stephen Mayhew and Dan Roth}, 163 | title = {TALEN: Tool for Annotation of Low-resource ENtities}, 164 | booktitle = {ACL System Demonstrations}, 165 | year = {2018}, 166 | } 167 | ``` 168 | 169 | Read the paper here: [http://cogcomp.org/papers/MayhewRo18.pdf](http://cogcomp.org/papers/MayhewRo18.pdf) 170 | 171 | 172 | -------------------------------------------------------------------------------- /config/doc-English.txt: -------------------------------------------------------------------------------- 1 | name EnglishDefault 2 | folderpath data/conll/eng/ 3 | format conll 4 | labels PER ORG LOC MISC 5 | -------------------------------------------------------------------------------- /config/doc-EnglishTa.txt: -------------------------------------------------------------------------------- 1 | name EnglishTa 2 | folderpath data/ta/eng/ 3 | format ta 4 | labels PER ORG LOC MISC 5 | -------------------------------------------------------------------------------- /config/doc-Farsi.txt: -------------------------------------------------------------------------------- 1 | name Farsi 2 | folderpath data/column/fas/ 3 | format column 4 | labels PER ORG LOC GPE 5 | -------------------------------------------------------------------------------- /config/doc-Hebrew.txt: -------------------------------------------------------------------------------- 1 | name Hebrew 2 | folderpath data/tajson/heb/ 3 | format tajson 4 | labels PER ORG LOC GPE 5 | -------------------------------------------------------------------------------- /config/doc-Hindi.txt: -------------------------------------------------------------------------------- 1 | name Hindi 2 | folderpath data/tajson/hindi 3 | format tajson 4 | labels PER ORG LOC GPE 5 | 
-------------------------------------------------------------------------------- /config/doc-Russian.txt: -------------------------------------------------------------------------------- 1 | name Russian 2 | folderpath data/column/rus/ 3 | format column 4 | labels PER ORG LOC GPE 5 | -------------------------------------------------------------------------------- /config/sent-Hindi.txt: -------------------------------------------------------------------------------- 1 | mode sentence 2 | folderpath data/tajson/hindi/ 3 | terms भारत 4 | name hindi 5 | format tajson 6 | indexpath data/index_hindi 7 | labels PER ORG LOC GPE 8 | -------------------------------------------------------------------------------- /data/column/fas/1.txt: -------------------------------------------------------------------------------- 1 | در O 2 | جریان O 3 | حمله O 4 | شبه‌نظامیان O 5 | حامی O 6 | دولت O 7 | نیکاراگوئه O 8 | به O 9 | کلیسایی O 10 | که O 11 | ده‌ها O 12 | معترض O 13 | در O 14 | آن O 15 | پناه O 16 | گرفته O 17 | بودند O 18 | دست O 19 | کم O 20 | یک O 21 | دانشجو O 22 | کشته O 23 | شده O 24 | است O 25 | . O 26 | 27 | دانشجویان O 28 | پس O 29 | از O 30 | آن O 31 | وارد O 32 | کلیسا O 33 | شدند O 34 | که O 35 | در O 36 | جریان O 37 | تظاهرات O 38 | اعتراضی O 39 | خود O 40 | با O 41 | حمله O 42 | شبه‌نظامیان O 43 | مسلح O 44 | روبرو O 45 | شدند O 46 | و O 47 | به O 48 | کلیسا O 49 | پناه O 50 | بردند O 51 | اما O 52 | به O 53 | محاصره O 54 | در O 55 | آمدند O 56 | . O 57 | 58 | دانشجویان O 59 | از O 60 | دیشب O 61 | ( O 62 | جمعه O 63 | ) O 64 | در O 65 | این O 66 | کلیسا O 67 | که O 68 | در O 69 | یکی O 70 | از O 71 | مناطق O 72 | مسکونی O 73 | بومیان O 74 | واقع O 75 | شده O 76 | گیر O 77 | افتاده O 78 | بودند O 79 | . O 80 | 81 | آنها O 82 | به O 83 | همراه O 84 | هزاران O 85 | معترض O 86 | دیگر O 87 | خواستار O 88 | استعفای O 89 | دانیل O 90 | اورتگا O 91 | ، O 92 | رئیس O 93 | جمهوری O 94 | هستند O 95 | . 
O 96 | 97 | بیش O 98 | از O 99 | ۳۰۰ O 100 | نفر O 101 | تاکنون O 102 | در O 103 | جریان O 104 | این O 105 | اعتراض‌های O 106 | ضد O 107 | دولتی O 108 | کشته O 109 | شده‌اند O 110 | . O 111 | 112 | کلیسایی O 113 | که O 114 | دانشجویان O 115 | در O 116 | آن O 117 | محاصره O 118 | شده O 119 | بودند O 120 | در O 121 | نزدیکی O 122 | دانشگاه O 123 | اصلی O 124 | ماناگوئه O 125 | ، O 126 | پایتخت O 127 | است O 128 | و O 129 | گزارش O 130 | شده O 131 | که O 132 | دست O 133 | کم O 134 | ۱۵۰ O 135 | دانشجو O 136 | به O 137 | همراه O 138 | کشیش O 139 | و O 140 | روزنامه‌نگاران O 141 | در O 142 | داخل O 143 | این O 144 | کلیسا O 145 | پناه O 146 | گرفته O 147 | بودند O 148 | . O 149 | 150 | -------------------------------------------------------------------------------- /data/column/fas/2.txt: -------------------------------------------------------------------------------- 1 | این O 2 | روزها O 3 | و O 4 | با O 5 | گذشت O 6 | چند O 7 | دهه O 8 | از O 9 | سیاست O 10 | " O 11 | جهانی‌شدن O 12 | " O 13 | ، O 14 | ملی‌گرایی O 15 | و O 16 | دیگرهراسی O 17 | به O 18 | دستاویز O 19 | مناسبی O 20 | برای O 21 | برخی O 22 | سیاست‌مداران O 23 | جویای O 24 | قدرت O 25 | بدل O 26 | شده O 27 | است O 28 | . O 29 | 30 | در O 31 | اروپا O 32 | و O 33 | آمریکا، O 34 | برخی O 35 | سیاستمداران O 36 | عمدتا O 37 | راست‌گرا O 38 | پیکان O 39 | حمله O 40 | را O 41 | به O 42 | سوی O 43 | "آن O 44 | مردم O 45 | دیگر" O 46 | که O 47 | به O 48 | زعم O 49 | آنها O 50 | مقصر O 51 | تمامی O 52 | مشکلات O 53 | این O 54 | روزهای O 55 | "مردم O 56 | ما" O 57 | هستند، O 58 | نشانه O 59 | رفته‌اند. O 60 | نیاز O 61 | اولیه‌ای O 62 | مانند O 63 | غذا O 64 | و O 65 | اندازه O 66 | سفره O 67 | مردم، O 68 | یکی O 69 | از O 70 | موارد O 71 | مهم O 72 | این O 73 | تنش‌ها O 74 | است O 75 | که O 76 | "ما O 77 | مردم" O 78 | و O 79 | "آن O 80 | مردم O 81 | دیگر" O 82 | را O 83 | رو O 84 | در O 85 | روی O 86 | هم O 87 | قرار O 88 | داده O 89 | است O 90 | . 
O 91 | 92 | شاید O 93 | جالب O 94 | باشد O 95 | بدانید O 96 | که O 97 | همین O 98 | روزها O 99 | یکی O 100 | از O 101 | چالش‌های O 102 | پیش‌روی O 103 | برگزیت O 104 | ( O 105 | خروج O 106 | بریتانیا O 107 | از O 108 | اتحادیه O 109 | اروپا O 110 | ) O 111 | ، O 112 | عواقب O 113 | این O 114 | خروج O 115 | بر O 116 | ساز O 117 | و O 118 | کار O 119 | تامین O 120 | مواد O 121 | غذایی O 122 | مردم O 123 | بریتانیا O 124 | است. O 125 | مثلا O 126 | کمبود O 127 | منابع O 128 | و O 129 | نیروی O 130 | کار O 131 | که O 132 | از O 133 | قضای O 134 | روزگار O 135 | اغلب O 136 | توسط O 137 | همان O 138 | " O 139 | مردم O 140 | دیگر O 141 | " O 142 | تامین O 143 | می‌شود O 144 | ، O 145 | در O 146 | نهایت O 147 | موجب O 148 | افزایش O 149 | قیمت‌ها O 150 | خواهد O 151 | شد O 152 | . O 153 | 154 | -------------------------------------------------------------------------------- /data/column/fas/3.txt: -------------------------------------------------------------------------------- 1 | علی O 2 | اکبر O 3 | ولایتی O 4 | ، O 5 | مشاور O 6 | امور O 7 | بین O 8 | الملل O 9 | رهبر O 10 | ایران O 11 | می‌گوید O 12 | ولادیمیر O 13 | پوتین O 14 | ، O 15 | رئیس O 16 | جمهوری O 17 | روسیه O 18 | وعده O 19 | داده O 20 | که O 21 | در O 22 | صنعت O 23 | نفت O 24 | و O 25 | گاز O 26 | ایران O 27 | سرمایه‌گذاری O 28 | کند O 29 | . O 30 | 31 | آقای O 32 | ولایتی O 33 | مبلع O 34 | این O 35 | سرمایه O 36 | گذاری O 37 | را O 38 | تا O 39 | ۵۰ O 40 | میلیارد O 41 | دلار O 42 | اعلام O 43 | کرده O 44 | و O 45 | گفته O 46 | است O 47 | که O 48 | این O 49 | پول O 50 | می‌تواند O 51 | جایگزین O 52 | شرکت‌های O 53 | غربی O 54 | شود O 55 | که O 56 | به O 57 | دلیل O 58 | تحریم‌های O 59 | آمریکا O 60 | از O 61 | ایران O 62 | رفته‌اند O 63 | . 
O 64 | 65 | روسیه O 66 | در O 67 | دور O 68 | قبلی O 69 | تحریم‌ها O 70 | هم O 71 | با O 72 | ایران O 73 | مراودات O 74 | اقتصادی O 75 | داشت، O 76 | اما O 77 | شواهدی O 78 | از O 79 | تاثیر O 80 | قابل O 81 | توجه O 82 | روابط O 83 | اقتصادی O 84 | ایران O 85 | و O 86 | روسیه O 87 | بر O 88 | آثار O 89 | تحریم‌های O 90 | هسته‌ای O 91 | ایران O 92 | در O 93 | دست O 94 | نیست. O 95 | از O 96 | طرفی، O 97 | نگاهی O 98 | به O 99 | مقصد O 100 | سرمایه‌گذاری‌های O 101 | روسیه O 102 | در O 103 | سال‌های O 104 | اخیر O 105 | نشان O 106 | می‌دهد O 107 | که O 108 | این O 109 | کشور O 110 | سرمایه‌گذاری O 111 | قابل O 112 | توجهی O 113 | در O 114 | خاورمیانه O 115 | نداشته O 116 | و O 117 | بیشتر O 118 | پول O 119 | این O 120 | کشور O 121 | به O 122 | اروپا O 123 | سرازیر O 124 | می‌شود O 125 | . O 126 | 127 | -------------------------------------------------------------------------------- /data/column/rus/1.txt: -------------------------------------------------------------------------------- 1 | Матч O 2 | за O 3 | бронзу O 4 | нельзя O 5 | называть O 6 | утешительным O 7 | финалом O 8 | . O 9 | 10 | Утирать O 11 | слезы O 12 | и O 13 | делиться O 14 | носовыми O 15 | платочками O 16 | нужно O 17 | с O 18 | теми O 19 | , O 20 | кто O 21 | не O 22 | попал O 23 | на O 24 | элитную O 25 | вечеринку O 26 | , O 27 | кто O 28 | вылетел O 29 | до O 30 | стадии O 31 | полуфиналов O 32 | или O 33 | просто O 34 | не O 35 | доехал O 36 | до O 37 | России O 38 | . O 39 | 40 | В O 41 | Питере O 42 | же O 43 | прошел O 44 | малый O 45 | финал O 46 | , O 47 | прелюдия O 48 | к O 49 | основному O 50 | действу O 51 | , O 52 | которое O 53 | начнется O 54 | в O 55 | 18:00 O 56 | по O 57 | московскому O 58 | времени O 59 | 15 O 60 | июля O 61 | . 
O 62 | 63 | Мы O 64 | знаем O 65 | , O 66 | что O 67 | тысячи O 68 | англичан O 69 | после O 70 | четвертьфинала O 71 | в O 72 | восторге O 73 | ринулись O 74 | скупать O 75 | билеты O 76 | до O 77 | Москвы O 78 | , O 79 | на O 80 | исторический O 81 | полуфинал O 82 | чемпионата O 83 | мира O 84 | . O 85 | 86 | И O 87 | они O 88 | на O 89 | сто O 90 | процентов O 91 | рассчитывали O 92 | остаться O 93 | в O 94 | столице O 95 | , O 96 | не O 97 | уезжать O 98 | в O 99 | Питер O 100 | , O 101 | но O 102 | у O 103 | Марио O 104 | Манджукича O 105 | было O 106 | свое O 107 | мнение O 108 | на O 109 | этот O 110 | счет O 111 | . O 112 | 113 | Так O 114 | что O 115 | из-за O 116 | него O 117 | на O 118 | «Санкт-Петербурге» O 119 | белых O 120 | проплешин O 121 | было O 122 | больше O 123 | . O 124 | 125 | Почему O 126 | проплешин O 127 | ? O 128 | 129 | Потому O 130 | что O 131 | трибуны O 132 | были O 133 | заполнены O 134 | представителями O 135 | слишком O 136 | многих O 137 | национальностей O 138 | , O 139 | а O 140 | больше O 141 | всех O 142 | было O 143 | россиян O 144 | . O 145 | 146 | -------------------------------------------------------------------------------- /data/column/rus/2.txt: -------------------------------------------------------------------------------- 1 | Президент O 2 | России O 3 | Владимир O 4 | Путин O 5 | остался O 6 | удовлетворен O 7 | тем O 8 | , O 9 | как O 10 | чемпионат O 11 | мира O 12 | повысил O 13 | имидж O 14 | России O 15 | за O 16 | рубежом O 17 | , O 18 | и O 19 | готов O 20 | разработать O 21 | облегченный O 22 | визовый O 23 | режим O 24 | для O 25 | болельщиков O 26 | , O 27 | которые O 28 | захотят O 29 | вернуться O 30 | в O 31 | страну O 32 | . 
O 33 | 34 | Об O 35 | этом O 36 | он O 37 | рассказал O 38 | на O 39 | открытии O 40 | гала-концерта O 41 | « O 42 | Ночь O 43 | в O 44 | Большом O 45 | » O 46 | в O 47 | субботу O 48 | , O 49 | 14 O 50 | июля O 51 | , O 52 | в O 53 | Большом O 54 | театре O 55 | , O 56 | передает O 57 | ТАСС O 58 | . O 59 | 60 | 61 | « O 62 | Мы O 63 | признательны O 64 | за O 65 | миллионы O 66 | добрых O 67 | слов O 68 | , O 69 | сказанных O 70 | гостями O 71 | чемпионата O 72 | в O 73 | адрес O 74 | России O 75 | и O 76 | нашего O 77 | народа O 78 | , O 79 | рады O 80 | , O 81 | что O 82 | им O 83 | понравилось O 84 | его O 85 | гостеприимство O 86 | и O 87 | открытость O 88 | , O 89 | природа O 90 | , O 91 | культура O 92 | , O 93 | традиции O 94 | нашей O 95 | большой O 96 | страны O 97 | . O 98 | 99 | Мы O 100 | рады O 101 | , O 102 | что O 103 | наши O 104 | гости O 105 | все O 106 | увидели O 107 | своими O 108 | глазами O 109 | , O 110 | что O 111 | рухнули O 112 | мифы O 113 | и O 114 | предубеждения O 115 | » O 116 | , O 117 | — O 118 | заявил O 119 | он O 120 | . O 121 | 122 | -------------------------------------------------------------------------------- /data/column/rus/3.txt: -------------------------------------------------------------------------------- 1 | Американский O 2 | телеканал O 3 | CNBC O 4 | со O 5 | ссылкой O 6 | на O 7 | свои O 8 | источники O 9 | рассказал O 10 | об O 11 | успешных O 12 | испытаниях O 13 | в O 14 | России O 15 | гиперзвуковой O 16 | ракеты O 17 | « O 18 | Кинжал O 19 | » O 20 | . O 21 | 22 | По O 23 | данным O 24 | автора O 25 | материала O 26 | , O 27 | было O 28 | проведено O 29 | 12 O 30 | тестов O 31 | — O 32 | во O 33 | время O 34 | последнего O 35 | снаряд O 36 | поразил O 37 | цель O 38 | на O 39 | расстоянии O 40 | чуть O 41 | более O 42 | 800 O 43 | километров O 44 | . 
O 45 | 46 | Канал O 47 | уточняет O 48 | , O 49 | что O 50 | ракеты O 51 | , O 52 | вероятно O 53 | , O 54 | примут O 55 | на O 56 | вооружение O 57 | в O 58 | 2020 O 59 | году O 60 | . O 61 | 62 | Отмечается O 63 | также O 64 | , O 65 | что O 66 | США O 67 | пока O 68 | ничего O 69 | не O 70 | могут O 71 | противопоставить O 72 | « O 73 | Кинжалу O 74 | » O 75 | . O 76 | 77 | -------------------------------------------------------------------------------- /data/conll/eng/1.txt.conll: -------------------------------------------------------------------------------- 1 | O 0 1 x x US x x 0 2 | O 0 2 x x President x x 0 3 | O 0 3 x x Donald x x 0 4 | O 0 4 x x Trump x x 0 5 | O 0 5 x x is x x 0 6 | O 0 6 x x seeking x x 0 7 | O 0 7 x x to x x 0 8 | O 0 8 x x boost x x 0 9 | O 0 9 x x defence x x 0 10 | O 0 10 x x spending x x 0 11 | O 0 11 x x by x x 0 12 | O 0 12 x x 10 x x 0 13 | O 0 13 x x % x x 0 14 | O 0 14 x x in x x 0 15 | O 0 15 x x his x x 0 16 | O 0 16 x x proposed x x 0 17 | O 0 17 x x budget x x 0 18 | O 0 18 x x plan x x 0 19 | O 0 19 x x for x x 0 20 | O 0 20 x x 2018 x x 0 21 | O 0 21 x x . x x 0 22 | 23 | O 0 22 x x The x x 0 24 | O 0 23 x x blueprint x x 0 25 | O 0 24 x x will x x 0 26 | O 0 25 x x increase x x 0 27 | O 0 26 x x defence x x 0 28 | O 0 27 x x spending x x 0 29 | O 0 28 x x by x x 0 30 | O 0 29 x x $ x x 0 31 | O 0 30 x x 54bn x x 0 32 | O 0 31 x x ( x x 0 33 | O 0 32 x x £43bn x x 0 34 | O 0 33 x x ) x x 0 35 | O 0 34 x x but x x 0 36 | O 0 35 x x seeks x x 0 37 | O 0 36 x x to x x 0 38 | O 0 37 x x recoup x x 0 39 | O 0 38 x x that x x 0 40 | O 0 39 x x sum x x 0 41 | O 0 40 x x through x x 0 42 | O 0 41 x x deep x x 0 43 | O 0 42 x x cuts x x 0 44 | O 0 43 x x elsewhere x x 0 45 | O 0 44 x x , x x 0 46 | O 0 45 x x including x x 0 47 | O 0 46 x x to x x 0 48 | O 0 47 x x foreign x x 0 49 | O 0 48 x x aid x x 0 50 | O 0 49 x x . 
x x 0 51 | 52 | O 0 50 x x Mr x x 0 53 | O 0 51 x x Trump's x x 0 54 | O 0 52 x x plan x x 0 55 | O 0 53 x x leaves x x 0 56 | O 0 54 x x large x x 0 57 | O 0 55 x x welfare x x 0 58 | O 0 56 x x programmes x x 0 59 | O 0 57 x x untouched x x 0 60 | O 0 58 x x , x x 0 61 | O 0 59 x x despite x x 0 62 | O 0 60 x x Republican x x 0 63 | O 0 61 x x calls x x 0 64 | O 0 62 x x for x x 0 65 | O 0 63 x x reform x x 0 66 | O 0 64 x x . x x 0 67 | 68 | O 0 65 x x The x x 0 69 | O 0 66 x x president x x 0 70 | O 0 67 x x has x x 0 71 | O 0 68 x x consulted x x 0 72 | O 0 69 x x government x x 0 73 | O 0 70 x x agencies x x 0 74 | O 0 71 x x about x x 0 75 | O 0 72 x x his x x 0 76 | O 0 73 x x plans x x 0 77 | O 0 74 x x and x x 0 78 | O 0 75 x x will x x 0 79 | O 0 76 x x present x x 0 80 | O 0 77 x x his x x 0 81 | O 0 78 x x budget x x 0 82 | O 0 79 x x to x x 0 83 | O 0 80 x x Congress x x 0 84 | O 0 81 x x in x x 0 85 | O 0 82 x x May x x 0 86 | O 0 83 x x . x x 0 87 | 88 | O 0 84 x x Between x x 0 89 | O 0 85 x x now x x 0 90 | O 0 86 x x and x x 0 91 | O 0 87 x x then x x 0 92 | O 0 88 x x , x x 0 93 | O 0 89 x x he x x 0 94 | O 0 90 x x needs x x 0 95 | O 0 91 x x to x x 0 96 | O 0 92 x x identify x x 0 97 | O 0 93 x x where x x 0 98 | O 0 94 x x the x x 0 99 | O 0 95 x x agencies x x 0 100 | O 0 96 x x can x x 0 101 | O 0 97 x x make x x 0 102 | O 0 98 x x savings x x 0 103 | O 0 99 x x and x x 0 104 | O 0 100 x x work x x 0 105 | O 0 101 x x out x x 0 106 | O 0 102 x x what x x 0 107 | O 0 103 x x he x x 0 108 | O 0 104 x x does x x 0 109 | O 0 105 x x with x x 0 110 | O 0 106 x x tax x x 0 111 | O 0 107 x x reform x x 0 112 | O 0 108 x x . 
x x 0 113 | 114 | O 0 109 x x Republican x x 0 115 | O 0 110 x x John x x 0 116 | O 0 111 x x McCain x x 0 117 | O 0 112 x x said x x 0 118 | O 0 113 x x the x x 0 119 | O 0 114 x x $ x x 0 120 | O 0 115 x x 603bn x x 0 121 | O 0 116 x x defence x x 0 122 | O 0 117 x x budget x x 0 123 | O 0 118 x x - x x 0 124 | O 0 120 x x which x x 0 125 | O 0 121 x x White x x 0 126 | O 0 122 x x House x x 0 127 | O 0 123 x x officials x x 0 128 | O 0 124 x x outlined x x 0 129 | O 0 125 x x - x x 0 130 | O 0 127 x x would x x 0 131 | O 0 128 x x be x x 0 132 | O 0 129 x x insufficient x x 0 133 | O 0 130 x x . x x 0 134 | 135 | O 0 131 x x Speaking x x 0 136 | O 0 132 x x at x x 0 137 | O 0 133 x x the x x 0 138 | O 0 134 x x White x x 0 139 | O 0 135 x x House x x 0 140 | O 0 136 x x during x x 0 141 | O 0 137 x x a x x 0 142 | O 0 138 x x meeting x x 0 143 | O 0 139 x x with x x 0 144 | O 0 140 x x state x x 0 145 | O 0 141 x x governors x x 0 146 | O 0 142 x x on x x 0 147 | O 0 143 x x Monday x x 0 148 | O 0 144 x x morning x x 0 149 | O 0 145 x x , x x 0 150 | O 0 146 x x Mr x x 0 151 | O 0 147 x x Trump x x 0 152 | O 0 148 x x said x x 0 153 | O 0 149 x x : x x 0 154 | O 0 150 x x " x x 0 155 | O 0 151 x x We're x x 0 156 | O 0 152 x x going x x 0 157 | O 0 153 x x to x x 0 158 | O 0 154 x x do x x 0 159 | O 0 155 x x more x x 0 160 | O 0 156 x x with x x 0 161 | O 0 157 x x less x x 0 162 | O 0 158 x x and x x 0 163 | O 0 159 x x make x x 0 164 | O 0 160 x x the x x 0 165 | O 0 161 x x government x x 0 166 | O 0 162 x x lean x x 0 167 | O 0 163 x x and x x 0 168 | O 0 164 x x accountable x x 0 169 | O 0 165 x x . 
x x 0 170 | O 0 166 x x " x x 0 171 | 172 | O 0 167 x x The x x 0 173 | O 0 168 x x president x x 0 174 | O 0 169 x x , x x 0 175 | O 0 170 x x who x x 0 176 | O 0 171 x x vowed x x 0 177 | O 0 172 x x to x x 0 178 | O 0 173 x x increase x x 0 179 | O 0 174 x x military x x 0 180 | O 0 175 x x spending x x 0 181 | O 0 176 x x and x x 0 182 | O 0 177 x x preserve x x 0 183 | O 0 178 x x welfare x x 0 184 | O 0 179 x x programmes x x 0 185 | O 0 180 x x during x x 0 186 | O 0 181 x x his x x 0 187 | O 0 182 x x campaign x x 0 188 | O 0 183 x x , x x 0 189 | O 0 184 x x said x x 0 190 | O 0 185 x x the x x 0 191 | O 0 186 x x budget x x 0 192 | O 0 187 x x will x x 0 193 | O 0 188 x x focus x x 0 194 | O 0 189 x x on x x 0 195 | O 0 190 x x " x x 0 196 | O 0 191 x x military x x 0 197 | O 0 192 x x , x x 0 198 | O 0 193 x x safety x x 0 199 | O 0 194 x x , x x 0 200 | O 0 195 x x economic x x 0 201 | O 0 196 x x development x x 0 202 | O 0 197 x x " x x 0 203 | O 0 198 x x . x x 0 204 | 205 | O 0 199 x x " x x 0 206 | O 0 200 x x It x x 0 207 | O 0 201 x x will x x 0 208 | O 0 202 x x include x x 0 209 | O 0 203 x x an x x 0 210 | O 0 204 x x historic x x 0 211 | O 0 205 x x increase x x 0 212 | O 0 206 x x in x x 0 213 | O 0 207 x x defence x x 0 214 | O 0 208 x x spending x x 0 215 | O 0 209 x x to x x 0 216 | O 0 210 x x rebuild x x 0 217 | O 0 211 x x the x x 0 218 | O 0 212 x x depleted x x 0 219 | O 0 213 x x military x x 0 220 | O 0 214 x x of x x 0 221 | O 0 215 x x the x x 0 222 | O 0 216 x x United x x 0 223 | O 0 217 x x States x x 0 224 | O 0 218 x x of x x 0 225 | O 0 219 x x America x x 0 226 | O 0 220 x x at x x 0 227 | O 0 221 x x a x x 0 228 | O 0 222 x x time x x 0 229 | O 0 223 x x we x x 0 230 | O 0 224 x x most x x 0 231 | O 0 225 x x need x x 0 232 | O 0 226 x x it x x 0 233 | O 0 227 x x , x x 0 234 | O 0 228 x x " x x 0 235 | O 0 229 x x he x x 0 236 | O 0 230 x x said x x 0 237 | O 0 231 x x . 
x x 0 238 | 239 | -------------------------------------------------------------------------------- /data/conll/eng/3.txt.conll: -------------------------------------------------------------------------------- 1 | O 0 1 x x Olga x x 0 2 | O 0 2 x x Korbut x x 0 3 | O 0 3 x x , x x 0 4 | O 0 4 x x the x x 0 5 | O 0 5 x x Soviet x x 0 6 | O 0 6 x x gymnast x x 0 7 | O 0 7 x x who x x 0 8 | O 0 8 x x charmed x x 0 9 | O 0 9 x x the x x 0 10 | O 0 10 x x world x x 0 11 | O 0 11 x x at x x 0 12 | O 0 12 x x the x x 0 13 | O 0 13 x x 1972 x x 0 14 | O 0 14 x x Munich x x 0 15 | O 0 15 x x Olympics x x 0 16 | O 0 16 x x , x x 0 17 | O 0 17 x x has x x 0 18 | O 0 18 x x sold x x 0 19 | O 0 19 x x her x x 0 20 | O 0 20 x x medal x x 0 21 | O 0 21 x x haul x x 0 22 | O 0 22 x x and x x 0 23 | O 0 23 x x other x x 0 24 | O 0 24 x x trophies x x 0 25 | O 0 25 x x in x x 0 26 | O 0 26 x x a x x 0 27 | O 0 27 x x US x x 0 28 | O 0 28 x x auction x x 0 29 | O 0 29 x x . x x 0 30 | 31 | O 0 30 x x The x x 0 32 | O 0 31 x x sale x x 0 33 | O 0 32 x x of x x 0 34 | O 0 33 x x seven x x 0 35 | O 0 34 x x lots x x 0 36 | O 0 35 x x - x x 0 37 | O 0 37 x x including x x 0 38 | O 0 38 x x two x x 0 39 | O 0 39 x x golds x x 0 40 | O 0 40 x x and x x 0 41 | O 0 41 x x a x x 0 42 | O 0 42 x x silver x x 0 43 | O 0 43 x x from x x 0 44 | O 0 44 x x the x x 0 45 | O 0 45 x x Munich x x 0 46 | O 0 46 x x Games x x 0 47 | O 0 47 x x - x x 0 48 | O 0 49 x x fetched x x 0 49 | O 0 50 x x $ x x 0 50 | O 0 51 x x 183,300 x x 0 51 | O 0 52 x x ( x x 0 52 | O 0 53 x x £147,000 x x 0 53 | O 0 54 x x ) x x 0 54 | O 0 55 x x for x x 0 55 | O 0 56 x x Korbut x x 0 56 | O 0 57 x x . 
x x 0 57 | 58 | O 0 58 x x The x x 0 59 | O 0 59 x x most x x 0 60 | O 0 60 x x expensive x x 0 61 | O 0 61 x x item x x 0 62 | O 0 62 x x was x x 0 63 | O 0 63 x x her x x 0 64 | O 0 64 x x team x x 0 65 | O 0 65 x x gold x x 0 66 | O 0 66 x x ( x x 0 67 | O 0 67 x x $ x x 0 68 | O 0 68 x x 66,000 x x 0 69 | O 0 69 x x ) x x 0 70 | O 0 70 x x . x x 0 71 | 72 | O 0 71 x x Born x x 0 73 | O 0 72 x x in x x 0 74 | O 0 73 x x Belarus x x 0 75 | O 0 74 x x , x x 0 76 | O 0 75 x x she x x 0 77 | O 0 76 x x moved x x 0 78 | O 0 77 x x to x x 0 79 | O 0 78 x x the x x 0 80 | O 0 79 x x US x x 0 81 | O 0 80 x x in x x 0 82 | O 0 81 x x 1991 x x 0 83 | O 0 82 x x . x x 0 84 | O 0 83 x x Now x x 0 85 | O 0 84 x x 61 x x 0 86 | O 0 85 x x , x x 0 87 | O 0 86 x x she x x 0 88 | O 0 87 x x lives x x 0 89 | O 0 88 x x in x x 0 90 | O 0 89 x x Arizona x x 0 91 | O 0 90 x x . x x 0 92 | 93 | O 0 91 x x In x x 0 94 | O 0 92 x x 1972 x x 0 95 | O 0 93 x x , x x 0 96 | O 0 94 x x at x x 0 97 | O 0 95 x x the x x 0 98 | O 0 96 x x height x x 0 99 | O 0 97 x x of x x 0 100 | O 0 98 x x the x x 0 101 | O 0 99 x x Cold x x 0 102 | O 0 100 x x War x x 0 103 | O 0 101 x x , x x 0 104 | O 0 102 x x Korbut's x x 0 105 | O 0 103 x x breathtaking x x 0 106 | O 0 104 x x gymnastics x x 0 107 | O 0 105 x x won x x 0 108 | O 0 106 x x millions x x 0 109 | O 0 107 x x of x x 0 110 | O 0 108 x x admirers x x 0 111 | O 0 109 x x in x x 0 112 | O 0 110 x x the x x 0 113 | O 0 111 x x West x x 0 114 | O 0 112 x x when x x 0 115 | O 0 113 x x she x x 0 116 | O 0 114 x x was x x 0 117 | O 0 115 x x just x x 0 118 | O 0 116 x x 17 x x 0 119 | O 0 117 x x . 
x x 0 120 | 121 | O 0 118 x x Just x x 0 122 | O 0 119 x x 1.5m x x 0 123 | O 0 120 x x tall x x 0 124 | O 0 121 x x ( x x 0 125 | O 0 122 x x 4ft x x 0 126 | O 0 123 x x 11 x x 0 127 | O 0 124 x x ) x x 0 128 | O 0 125 x x , x x 0 129 | O 0 126 x x she x x 0 130 | O 0 127 x x was x x 0 131 | O 0 128 x x nicknamed x x 0 132 | O 0 129 x x " x x 0 133 | O 0 130 x x the x x 0 134 | O 0 131 x x Sparrow x x 0 135 | O 0 132 x x from x x 0 136 | O 0 133 x x Minsk x x 0 137 | O 0 134 x x " x x 0 138 | O 0 135 x x . x x 0 139 | 140 | O 0 136 x x Her x x 0 141 | O 0 137 x x captivating x x 0 142 | O 0 138 x x smile x x 0 143 | O 0 139 x x and x x 0 144 | O 0 140 x x quirky x x 0 145 | O 0 141 x x charm x x 0 146 | O 0 142 x x helped x x 0 147 | O 0 143 x x to x x 0 148 | O 0 144 x x turn x x 0 149 | O 0 145 x x her x x 0 150 | O 0 146 x x into x x 0 151 | O 0 147 x x an x x 0 152 | O 0 148 x x Olympic x x 0 153 | O 0 149 x x legend x x 0 154 | O 0 150 x x . x x 0 155 | 156 | O 0 151 x x She x x 0 157 | O 0 152 x x won x x 0 158 | O 0 153 x x three x x 0 159 | O 0 154 x x golds x x 0 160 | O 0 155 x x ( x x 0 161 | O 0 156 x x team x x 0 162 | O 0 157 x x , x x 0 163 | O 0 158 x x balance x x 0 164 | O 0 159 x x beam x x 0 165 | O 0 160 x x and x x 0 166 | O 0 161 x x floor x x 0 167 | O 0 162 x x exercise x x 0 168 | O 0 163 x x ) x x 0 169 | O 0 164 x x and x x 0 170 | O 0 165 x x a x x 0 171 | O 0 166 x x silver x x 0 172 | O 0 167 x x at x x 0 173 | O 0 168 x x the x x 0 174 | O 0 169 x x 1972 x x 0 175 | O 0 170 x x Munich x x 0 176 | O 0 171 x x Olympics x x 0 177 | O 0 172 x x . 
x x 0 178 | 179 | O 0 173 x x In x x 0 180 | O 0 174 x x 1976 x x 0 181 | O 0 175 x x she x x 0 182 | O 0 176 x x won x x 0 183 | O 0 177 x x another x x 0 184 | O 0 178 x x gold x x 0 185 | O 0 179 x x and x x 0 186 | O 0 180 x x a x x 0 187 | O 0 181 x x silver x x 0 188 | O 0 182 x x at x x 0 189 | O 0 183 x x the x x 0 190 | O 0 184 x x Montreal x x 0 191 | O 0 185 x x Games x x 0 192 | O 0 186 x x . x x 0 193 | 194 | O 0 187 x x Heritage x x 0 195 | O 0 188 x x Auctions x x 0 196 | O 0 189 x x , x x 0 197 | O 0 190 x x organiser x x 0 198 | O 0 191 x x of x x 0 199 | O 0 192 x x the x x 0 200 | O 0 193 x x sale x x 0 201 | O 0 194 x x , x x 0 202 | O 0 195 x x says x x 0 203 | O 0 196 x x " x x 0 204 | O 0 197 x x there x x 0 205 | O 0 198 x x is x x 0 206 | O 0 199 x x hardly x x 0 207 | O 0 200 x x a x x 0 208 | O 0 201 x x gymnast x x 0 209 | O 0 202 x x alive x x 0 210 | O 0 203 x x who x x 0 211 | O 0 204 x x doesn't x x 0 212 | O 0 205 x x credit x x 0 213 | O 0 206 x x this x x 0 214 | O 0 207 x x tiny x x 0 215 | O 0 208 x x force x x 0 216 | O 0 209 x x of x x 0 217 | O 0 210 x x nature x x 0 218 | O 0 211 x x for x x 0 219 | O 0 212 x x the x x 0 220 | O 0 213 x x explosion x x 0 221 | O 0 214 x x of x x 0 222 | O 0 215 x x the x x 0 223 | O 0 216 x x sport's x x 0 224 | O 0 217 x x popularity x x 0 225 | O 0 218 x x on x x 0 226 | O 0 219 x x a x x 0 227 | O 0 220 x x global x x 0 228 | O 0 221 x x level x x 0 229 | O 0 222 x x " x x 0 230 | O 0 223 x x . 
x x 0 231 | 232 | O 0 224 x x The x x 0 233 | O 0 225 x x sale x x 0 234 | O 0 226 x x items x x 0 235 | O 0 227 x x included x x 0 236 | O 0 228 x x one x x 0 237 | O 0 229 x x of x x 0 238 | O 0 230 x x her x x 0 239 | O 0 231 x x performance x x 0 240 | O 0 232 x x leotards x x 0 241 | O 0 233 x x , x x 0 242 | O 0 234 x x her x x 0 243 | O 0 235 x x 1972 x x 0 244 | O 0 236 x x BBC x x 0 245 | O 0 237 x x Sports x x 0 246 | O 0 238 x x Personality x x 0 247 | O 0 239 x x of x x 0 248 | O 0 240 x x the x x 0 249 | O 0 241 x x Year x x 0 250 | O 0 242 x x Award x x 0 251 | O 0 243 x x , x x 0 252 | O 0 244 x x various x x 0 253 | O 0 245 x x Soviet x x 0 254 | O 0 246 x x medals x x 0 255 | O 0 247 x x and x x 0 256 | O 0 248 x x a x x 0 257 | O 0 249 x x sports x x 0 258 | O 0 250 x x magazine x x 0 259 | O 0 251 x x cover x x 0 260 | O 0 252 x x signed x x 0 261 | O 0 253 x x by x x 0 262 | O 0 254 x x her x x 0 263 | O 0 255 x x . x x 0 264 | 265 | O 0 256 x x The x x 0 266 | O 0 257 x x Korbut x x 0 267 | O 0 258 x x Flip x x 0 268 | O 0 259 x x was x x 0 269 | O 0 260 x x a x x 0 270 | O 0 261 x x spectacular x x 0 271 | O 0 262 x x trick x x 0 272 | O 0 263 x x that x x 0 273 | O 0 264 x x she x x 0 274 | O 0 265 x x performed x x 0 275 | O 0 266 x x on x x 0 276 | O 0 267 x x the x x 0 277 | O 0 268 x x asymmetric x x 0 278 | O 0 269 x x bars x x 0 279 | O 0 270 x x - x x 0 280 | O 0 272 x x a x x 0 281 | O 0 273 x x trick x x 0 282 | O 0 274 x x now x x 0 283 | O 0 275 x x banned x x 0 284 | O 0 276 x x from x x 0 285 | O 0 277 x x the x x 0 286 | O 0 278 x x Olympics x x 0 287 | O 0 279 x x as x x 0 288 | O 0 280 x x it x x 0 289 | O 0 281 x x is x x 0 290 | O 0 282 x x considered x x 0 291 | O 0 283 x x too x x 0 292 | O 0 284 x x dangerous x x 0 293 | O 0 285 x x . 
x x 0 294 | 295 | O 0 286 x x The x x 0 296 | O 0 287 x x flip x x 0 297 | O 0 288 x x can x x 0 298 | O 0 289 x x be x x 0 299 | O 0 290 x x seen x x 0 300 | O 0 291 x x on x x 0 301 | O 0 292 x x YouTube x x 0 302 | O 0 293 x x - x x 0 303 | O 0 295 x x it x x 0 304 | O 0 296 x x begins x x 0 305 | O 0 297 x x with x x 0 306 | O 0 298 x x a x x 0 307 | O 0 299 x x somersault x x 0 308 | O 0 300 x x on x x 0 309 | O 0 301 x x the x x 0 310 | O 0 302 x x top x x 0 311 | O 0 303 x x bar x x 0 312 | O 0 304 x x . x x 0 313 | 314 | O 0 305 x x From x x 0 315 | O 0 306 x x 1978 x x 0 316 | O 0 307 x x to x x 0 317 | O 0 308 x x 2000 x x 0 318 | O 0 309 x x Korbut x x 0 319 | O 0 310 x x was x x 0 320 | O 0 311 x x married x x 0 321 | O 0 312 x x to x x 0 322 | O 0 313 x x Leonid x x 0 323 | O 0 314 x x Bortkevich x x 0 324 | O 0 315 x x , x x 0 325 | O 0 316 x x a x x 0 326 | O 0 317 x x famous x x 0 327 | O 0 318 x x Soviet-era x x 0 328 | O 0 319 x x folk x x 0 329 | O 0 320 x x singer x x 0 330 | O 0 321 x x , x x 0 331 | O 0 322 x x with x x 0 332 | O 0 323 x x whom x x 0 333 | O 0 324 x x she x x 0 334 | O 0 325 x x moved x x 0 335 | O 0 326 x x to x x 0 336 | O 0 327 x x the x x 0 337 | O 0 328 x x US x x 0 338 | O 0 329 x x after x x 0 339 | O 0 330 x x the x x 0 340 | O 0 331 x x USSR's x x 0 341 | O 0 332 x x collapse x x 0 342 | O 0 333 x x in x x 0 343 | O 0 334 x x 1991 x x 0 344 | O 0 335 x x . x x 0 345 | 346 | O 0 336 x x She x x 0 347 | O 0 337 x x has x x 0 348 | O 0 338 x x a x x 0 349 | O 0 339 x x son x x 0 350 | O 0 340 x x called x x 0 351 | O 0 341 x x Richard x x 0 352 | O 0 342 x x . 
x x 0 353 | 354 | -------------------------------------------------------------------------------- /data/conll/eng/coutinho.conll: -------------------------------------------------------------------------------- 1 | O 0 1 x x Liverpool x x 0 2 | O 0 2 x x forward x x 0 3 | O 0 3 x x Philippe x x 0 4 | O 0 4 x x Coutinho x x 0 5 | O 0 5 x x has x x 0 6 | O 0 6 x x signed x x 0 7 | O 0 7 x x a x x 0 8 | O 0 8 x x new x x 0 9 | O 0 9 x x five-year x x 0 10 | O 0 10 x x contract x x 0 11 | O 0 11 x x worth x x 0 12 | O 0 12 x x about x x 0 13 | O 0 13 x x £150,000 x x 0 14 | O 0 14 x x a x x 0 15 | O 0 15 x x week x x 0 16 | O 0 16 x x , x x 0 17 | O 0 17 x x making x x 0 18 | O 0 18 x x him x x 0 19 | O 0 19 x x the x x 0 20 | O 0 20 x x highest-paid x x 0 21 | O 0 21 x x player x x 0 22 | O 0 22 x x at x x 0 23 | O 0 23 x x the x x 0 24 | O 0 24 x x club x x 0 25 | O 0 25 x x . x x 0 26 | 27 | O 0 26 x x The x x 0 28 | O 0 27 x x 24-year-old x x 0 29 | O 0 28 x x Brazil x x 0 30 | O 0 29 x x international x x 0 31 | O 0 30 x x joined x x 0 32 | O 0 31 x x the x x 0 33 | O 0 32 x x Reds x x 0 34 | O 0 33 x x from x x 0 35 | O 0 34 x x Inter x x 0 36 | O 0 35 x x Milan x x 0 37 | O 0 36 x x for x x 0 38 | O 0 37 x x £8.5m x x 0 39 | O 0 38 x x in x x 0 40 | O 0 39 x x January x x 0 41 | O 0 40 x x 2013 x x 0 42 | O 0 41 x x , x x 0 43 | O 0 42 x x and x x 0 44 | O 0 43 x x his x x 0 45 | O 0 44 x x new x x 0 46 | O 0 45 x x deal x x 0 47 | O 0 46 x x will x x 0 48 | O 0 47 x x take x x 0 49 | O 0 48 x x him x x 0 50 | O 0 49 x x through x x 0 51 | O 0 50 x x to x x 0 52 | O 0 51 x x 2022 x x 0 53 | O 0 52 x x . x x 0 54 | 55 | O 0 53 x x Coutinho x x 0 56 | O 0 54 x x has x x 0 57 | O 0 55 x x scored x x 0 58 | O 0 56 x x 34 x x 0 59 | O 0 57 x x goals x x 0 60 | O 0 58 x x in x x 0 61 | O 0 59 x x 163 x x 0 62 | O 0 60 x x appearances x x 0 63 | O 0 61 x x for x x 0 64 | O 0 62 x x Liverpool x x 0 65 | O 0 63 x x . 
x x 0 66 | 67 | O 0 64 x x " x x 0 68 | O 0 65 x x It x x 0 69 | O 0 66 x x is x x 0 70 | O 0 67 x x a x x 0 71 | O 0 68 x x club x x 0 72 | O 0 69 x x that x x 0 73 | O 0 70 x x I x x 0 74 | O 0 71 x x am x x 0 75 | O 0 72 x x very x x 0 76 | O 0 73 x x grateful x x 0 77 | O 0 74 x x to x x 0 78 | O 0 75 x x and x x 0 79 | O 0 76 x x this x x 0 80 | O 0 77 x x shows x x 0 81 | O 0 78 x x my x x 0 82 | O 0 79 x x happiness x x 0 83 | O 0 80 x x here x x 0 84 | O 0 81 x x , x x 0 85 | O 0 82 x x " x x 0 86 | O 0 83 x x he x x 0 87 | O 0 84 x x told x x 0 88 | O 0 85 x x the x x 0 89 | O 0 86 x x club's x x 0 90 | O 0 87 x x website x x 0 91 | O 0 88 x x . x x 0 92 | 93 | O 0 89 x x There x x 0 94 | O 0 90 x x is x x 0 95 | O 0 91 x x no x x 0 96 | O 0 92 x x release x x 0 97 | O 0 93 x x clause x x 0 98 | O 0 94 x x in x x 0 99 | O 0 95 x x Coutinho's x x 0 100 | O 0 96 x x new x x 0 101 | O 0 97 x x contract x x 0 102 | O 0 98 x x , x x 0 103 | O 0 99 x x the x x 0 104 | O 0 100 x x terms x x 0 105 | O 0 101 x x of x x 0 106 | O 0 102 x x which x x 0 107 | O 0 103 x x come x x 0 108 | O 0 104 x x into x x 0 109 | O 0 105 x x effect x x 0 110 | O 0 106 x x from x x 0 111 | O 0 107 x x 1 x x 0 112 | O 0 108 x x July x x 0 113 | O 0 109 x x . 
x x 0 114 | 115 | O 0 110 x x Coutinho x x 0 116 | O 0 111 x x , x x 0 117 | O 0 112 x x who x x 0 118 | O 0 113 x x had x x 0 119 | O 0 114 x x been x x 0 120 | O 0 115 x x linked x x 0 121 | O 0 116 x x with x x 0 122 | O 0 117 x x a x x 0 123 | O 0 118 x x move x x 0 124 | O 0 119 x x to x x 0 125 | O 0 120 x x Spanish x x 0 126 | O 0 121 x x champions x x 0 127 | O 0 122 x x Barcelona x x 0 128 | O 0 123 x x , x x 0 129 | O 0 124 x x added x x 0 130 | O 0 125 x x : x x 0 131 | O 0 126 x x " x x 0 132 | O 0 127 x x I x x 0 133 | O 0 128 x x signed x x 0 134 | O 0 129 x x this x x 0 135 | O 0 130 x x new x x 0 136 | O 0 131 x x contract x x 0 137 | O 0 132 x x to x x 0 138 | O 0 133 x x stay x x 0 139 | O 0 134 x x here x x 0 140 | O 0 135 x x for x x 0 141 | O 0 136 x x a x x 0 142 | O 0 137 x x few x x 0 143 | O 0 138 x x more x x 0 144 | O 0 139 x x years x x 0 145 | O 0 140 x x because x x 0 146 | O 0 141 x x it's x x 0 147 | O 0 142 x x a x x 0 148 | O 0 143 x x great x x 0 149 | O 0 144 x x honour x x 0 150 | O 0 145 x x for x x 0 151 | O 0 146 x x me x x 0 152 | O 0 147 x x . x x 0 153 | 154 | O 0 148 x x " x x 0 155 | O 0 149 x x It x x 0 156 | O 0 150 x x gives x x 0 157 | O 0 151 x x me x x 0 158 | O 0 152 x x great x x 0 159 | O 0 153 x x happiness x x 0 160 | O 0 154 x x because x x 0 161 | O 0 155 x x I x x 0 162 | O 0 156 x x was x x 0 163 | O 0 157 x x welcomed x x 0 164 | O 0 158 x x here x x 0 165 | O 0 159 x x with x x 0 166 | O 0 160 x x open x x 0 167 | O 0 161 x x arms x x 0 168 | O 0 162 x x by x x 0 169 | O 0 163 x x everyone x x 0 170 | O 0 164 x x at x x 0 171 | O 0 165 x x the x x 0 172 | O 0 166 x x club x x 0 173 | O 0 167 x x and x x 0 174 | O 0 168 x x the x x 0 175 | O 0 169 x x supporters x x 0 176 | O 0 170 x x right x x 0 177 | O 0 171 x x from x x 0 178 | O 0 172 x x my x x 0 179 | O 0 173 x x first x x 0 180 | O 0 174 x x day x x 0 181 | O 0 175 x x . 
x x 0 182 | O 0 176 x x " x x 0 183 | 184 | O 0 177 x x Coutinho x x 0 185 | O 0 178 x x was x x 0 186 | O 0 179 x x brought x x 0 187 | O 0 180 x x to x x 0 188 | O 0 181 x x Anfield x x 0 189 | O 0 182 x x by x x 0 190 | O 0 183 x x former x x 0 191 | O 0 184 x x manager x x 0 192 | O 0 185 x x Brendan x x 0 193 | O 0 186 x x Rodgers x x 0 194 | O 0 187 x x , x x 0 195 | O 0 188 x x with x x 0 196 | O 0 189 x x Southampton x x 0 197 | O 0 190 x x also x x 0 198 | O 0 191 x x interested x x 0 199 | O 0 192 x x in x x 0 200 | O 0 193 x x signing x x 0 201 | O 0 194 x x him x x 0 202 | O 0 195 x x at x x 0 203 | O 0 196 x x the x x 0 204 | O 0 197 x x time x x 0 205 | O 0 198 x x . x x 0 206 | 207 | O 0 199 x x He x x 0 208 | O 0 200 x x has x x 0 209 | O 0 201 x x established x x 0 210 | O 0 202 x x himself x x 0 211 | O 0 203 x x as x x 0 212 | O 0 204 x x one x x 0 213 | O 0 205 x x of x x 0 214 | O 0 206 x x the x x 0 215 | O 0 207 x x Reds x x 0 216 | O 0 208 x x ' x x 0 217 | O 0 209 x x key x x 0 218 | O 0 210 x x players x x 0 219 | O 0 211 x x during x x 0 220 | O 0 212 x x his x x 0 221 | O 0 213 x x four x x 0 222 | O 0 214 x x years x x 0 223 | O 0 215 x x at x x 0 224 | O 0 216 x x Anfield x x 0 225 | O 0 217 x x . x x 0 226 | 227 | O 0 218 x x Liverpool x x 0 228 | O 0 219 x x manager x x 0 229 | O 0 220 x x Jurgen x x 0 230 | O 0 221 x x Klopp x x 0 231 | O 0 222 x x believes x x 0 232 | O 0 223 x x his x x 0 233 | O 0 224 x x decision x x 0 234 | O 0 225 x x to x x 0 235 | O 0 226 x x sign x x 0 236 | O 0 227 x x a x x 0 237 | O 0 228 x x new x x 0 238 | O 0 229 x x long-term x x 0 239 | O 0 230 x x contract x x 0 240 | O 0 231 x x sends x x 0 241 | O 0 232 x x out x x 0 242 | O 0 233 x x a x x 0 243 | O 0 234 x x " x x 0 244 | O 0 235 x x big x x 0 245 | O 0 236 x x statement x x 0 246 | O 0 237 x x " x x 0 247 | O 0 238 x x . 
x x 0 248 | 249 | O 0 239 x x " x x 0 250 | O 0 240 x x This x x 0 251 | O 0 241 x x is x x 0 252 | O 0 242 x x wonderful x x 0 253 | O 0 243 x x news x x 0 254 | O 0 244 x x , x x 0 255 | O 0 245 x x " x x 0 256 | O 0 246 x x said x x 0 257 | O 0 247 x x Klopp x x 0 258 | O 0 248 x x , x x 0 259 | O 0 249 x x whose x x 0 260 | O 0 250 x x side x x 0 261 | O 0 251 x x are x x 0 262 | O 0 252 x x fourth x x 0 263 | O 0 253 x x in x x 0 264 | O 0 254 x x the x x 0 265 | O 0 255 x x Premier x x 0 266 | O 0 256 x x League x x 0 267 | O 0 257 x x , x x 0 268 | O 0 258 x x 10 x x 0 269 | O 0 259 x x points x x 0 270 | O 0 260 x x behind x x 0 271 | O 0 261 x x leaders x x 0 272 | O 0 262 x x Chelsea x x 0 273 | O 0 263 x x . x x 0 274 | 275 | O 0 264 x x " x x 0 276 | O 0 265 x x He x x 0 277 | O 0 266 x x is x x 0 278 | O 0 267 x x truly x x 0 279 | O 0 268 x x world x x 0 280 | O 0 269 x x class x x 0 281 | O 0 270 x x - x x 0 282 | O 0 272 x x in x x 0 283 | O 0 273 x x that x x 0 284 | O 0 274 x x very x x 0 285 | O 0 275 x x top x x 0 286 | O 0 276 x x bracket x x 0 287 | O 0 277 x x . x x 0 288 | O 0 278 x x He x x 0 289 | O 0 279 x x knows x x 0 290 | O 0 280 x x he x x 0 291 | O 0 281 x x can x x 0 292 | O 0 282 x x fulfil x x 0 293 | O 0 283 x x his x x 0 294 | O 0 284 x x dreams x x 0 295 | O 0 285 x x and x x 0 296 | O 0 286 x x ambitions x x 0 297 | O 0 287 x x here x x 0 298 | O 0 288 x x at x x 0 299 | O 0 289 x x Liverpool x x 0 300 | O 0 290 x x . 
x x 0 301 | O 0 291 x x " x x 0 302 | 303 | O 0 292 x x Coutinho x x 0 304 | O 0 293 x x has x x 0 305 | O 0 294 x x recently x x 0 306 | O 0 295 x x returned x x 0 307 | O 0 296 x x from x x 0 308 | O 0 297 x x an x x 0 309 | O 0 298 x x ankle x x 0 310 | O 0 299 x x injury x x 0 311 | O 0 300 x x , x x 0 312 | O 0 301 x x prior x x 0 313 | O 0 302 x x to x x 0 314 | O 0 303 x x which x x 0 315 | O 0 304 x x he x x 0 316 | O 0 305 x x had x x 0 317 | O 0 306 x x scored x x 0 318 | O 0 307 x x six x x 0 319 | O 0 308 x x goals x x 0 320 | O 0 309 x x in x x 0 321 | O 0 310 x x 14 x x 0 322 | O 0 311 x x appearances x x 0 323 | O 0 312 x x this x x 0 324 | O 0 313 x x season x x 0 325 | O 0 314 x x . x x 0 326 | 327 | -------------------------------------------------------------------------------- /data/conll/eng/eng.conll: -------------------------------------------------------------------------------- 1 | B-GPE 0 0 O O West x x 0 2 | I-GPE 0 1 O O Indian x x 0 3 | O 0 2 O O all-rounder x x 0 4 | O 0 3 O O Phil x x 0 5 | B-PER 0 4 O O Simmons x x 0 6 | O 0 5 O O took x x 0 7 | O 0 6 O O four x x 0 8 | O 0 7 O O for x x 0 9 | O 0 8 O O 38 x x 0 10 | O 0 9 O O on x x 0 11 | O 0 10 O O Friday x x 0 12 | O 0 11 O O as x x 0 13 | O 0 12 O O Leicestershire x x 0 14 | O 0 13 O O beat x x 0 15 | B-GPE 0 14 O O Somerset x x 0 16 | O 0 15 O O by x x 0 17 | O 0 16 O O an x x 0 18 | O 0 17 O O innings x x 0 19 | O 0 18 O O and x x 0 20 | O 0 19 O O 39 x x 0 21 | O 0 20 O O runs x x 0 22 | O 0 21 O O in x x 0 23 | O 0 22 O O two x x 0 24 | O 0 23 O O days x x 0 25 | O 0 24 O O to x x 0 26 | O 0 25 O O take x x 0 27 | O 0 26 O O over x x 0 28 | O 0 27 O O at x x 0 29 | O 0 28 O O the x x 0 30 | O 0 29 O O head x x 0 31 | O 0 30 O O of x x 0 32 | O 0 31 O O the x x 0 33 | O 0 32 O O county x x 0 34 | O 0 33 O O championship x x 0 35 | O 0 34 O O . 
x x 0 36 | 37 | O 0 35 O O Their x x 0 38 | O 0 36 O O stay x x 0 39 | O 0 37 O O on x x 0 40 | O 0 38 O O top x x 0 41 | O 0 39 O O , x x 0 42 | O 0 40 O O though x x 0 43 | O 0 41 O O , x x 0 44 | O 0 42 O O may x x 0 45 | O 0 43 O O be x x 0 46 | O 0 44 O O short-lived x x 0 47 | O 0 45 O O as x x 0 48 | O 0 46 O O title x x 0 49 | O 0 47 O O rivals x x 0 50 | B-LOC 0 48 O O Essex x x 0 51 | O 0 49 O O , x x 0 52 | B-GPE 0 50 O O Derbyshire x x 0 53 | O 0 51 O O and x x 0 54 | O 0 52 O O Surrey x x 0 55 | O 0 53 O O all x x 0 56 | O 0 54 O O closed x x 0 57 | O 0 55 O O in x x 0 58 | O 0 56 O O on x x 0 59 | O 0 57 O O victory x x 0 60 | B-ORG 0 58 O O while x x 0 61 | I-ORG 0 59 O O Kent x x 0 62 | O 0 60 O O made x x 0 63 | O 0 61 O O up x x 0 64 | O 0 62 O O for x x 0 65 | O 0 63 O O lost x x 0 66 | O 0 64 O O time x x 0 67 | O 0 65 O O in x x 0 68 | O 0 66 O O their x x 0 69 | O 0 67 O O rain-affected x x 0 70 | O 0 68 O O match x x 0 71 | O 0 69 O O against x x 0 72 | O 0 70 O O Nottinghamshire x x 0 73 | O 0 71 O O . 
x x 0 74 | 75 | O 0 72 O O After x x 0 76 | O 0 73 O O bowling x x 0 77 | B-GPE 0 74 O O Somerset x x 0 78 | O 0 75 O O out x x 0 79 | O 0 76 O O for x x 0 80 | O 0 77 O O 83 x x 0 81 | O 0 78 O O on x x 0 82 | O 0 79 O O the x x 0 83 | B-ORG 0 80 O O opening x x 0 84 | O 0 81 O O morning x x 0 85 | O 0 82 O O at x x 0 86 | B-LOC 0 83 O O Grace x x 0 87 | I-LOC 0 84 O O Road x x 0 88 | O 0 85 O O , x x 0 89 | O 0 86 O O Leicestershire x x 0 90 | O 0 87 O O extended x x 0 91 | O 0 88 O O their x x 0 92 | O 0 89 O O first x x 0 93 | O 0 90 O O innings x x 0 94 | O 0 91 O O by x x 0 95 | O 0 92 O O 94 x x 0 96 | O 0 93 O O runs x x 0 97 | O 0 94 O O before x x 0 98 | O 0 95 O O being x x 0 99 | O 0 96 O O bowled x x 0 100 | O 0 97 O O out x x 0 101 | O 0 98 O O for x x 0 102 | O 0 99 O O 296 x x 0 103 | O 0 100 O O with x x 0 104 | O 0 101 O O England x x 0 105 | O 0 102 O O discard x x 0 106 | B-PER 0 103 O O Andy x x 0 107 | I-PER 0 104 O O Caddick x x 0 108 | O 0 105 O O taking x x 0 109 | O 0 106 O O three x x 0 110 | O 0 107 O O for x x 0 111 | O 0 108 O O 83 x x 0 112 | O 0 109 O O . x x 0 113 | 114 | O 0 110 O O Trailing x x 0 115 | O 0 111 O O by x x 0 116 | O 0 112 O O 213 x x 0 117 | O 0 113 O O , x x 0 118 | B-GPE 0 114 O O Somerset x x 0 119 | O 0 115 O O got x x 0 120 | O 0 116 O O a x x 0 121 | O 0 117 O O solid x x 0 122 | O 0 118 O O start x x 0 123 | O 0 119 O O to x x 0 124 | O 0 120 O O their x x 0 125 | O 0 121 O O second x x 0 126 | O 0 122 O O innings x x 0 127 | O 0 123 O O before x x 0 128 | B-PER 0 124 O O Simmons x x 0 129 | O 0 125 O O stepped x x 0 130 | O 0 126 O O in x x 0 131 | O 0 127 O O to x x 0 132 | O 0 128 O O bundle x x 0 133 | O 0 129 O O them x x 0 134 | O 0 130 O O out x x 0 135 | O 0 131 O O for x x 0 136 | O 0 132 O O 174 x x 0 137 | O 0 133 O O . 
x x 0 138 | 139 | B-LOC 0 134 O O Essex x x 0 140 | O 0 135 O O , x x 0 141 | O 0 136 O O however x x 0 142 | O 0 137 O O , x x 0 143 | O 0 138 O O look x x 0 144 | O 0 139 O O certain x x 0 145 | O 0 140 O O to x x 0 146 | O 0 141 O O regain x x 0 147 | O 0 142 O O their x x 0 148 | O 0 143 O O top x x 0 149 | O 0 144 O O spot x x 0 150 | O 0 145 O O after x x 0 151 | B-PER 0 146 O O Nasser x x 0 152 | I-PER 0 147 O O Hussain x x 0 153 | O 0 148 O O and x x 0 154 | O 0 149 O O Peter x x 0 155 | O 0 150 O O Such x x 0 156 | O 0 151 O O gave x x 0 157 | O 0 152 O O them x x 0 158 | O 0 153 O O a x x 0 159 | O 0 154 O O firm x x 0 160 | O 0 155 O O grip x x 0 161 | O 0 156 O O on x x 0 162 | O 0 157 O O their x x 0 163 | O 0 158 O O match x x 0 164 | O 0 159 O O against x x 0 165 | O 0 160 O O Yorkshire x x 0 166 | O 0 161 O O at x x 0 167 | O 0 162 O O Headingley x x 0 168 | O 0 163 O O . x x 0 169 | 170 | -------------------------------------------------------------------------------- /data/getindian.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | from nltk.corpus import indian 4 | import txt2tajson 5 | 6 | # can also choose from: marathi, bangla, telugu, hindi 7 | lang = "hindi" 8 | 9 | if not os.path.exists("txt/" + lang): 10 | os.mkdir("txt/" + lang) 11 | 12 | sents = indian.sents(lang + ".pos") 13 | 14 | # arbitrarily put 10 sentences per document. 15 | num = 0 16 | for i in range(0,len(sents),10): 17 | with open("txt/" + lang + "/" + str(i), "w") as out: 18 | for sent in sents[i:i+10]: 19 | out.write(" ".join(sent) + "\n") 20 | num += 1 21 | 22 | print("Wrote {} text files to {}".format(num, "txt/" + lang)) 23 | 24 | # Now convert txt to tajson. 
25 | txt2tajson.convert("txt/" + lang, "tajson/" + lang) 26 | 27 | print("Now run:\n $ ./scripts/buildindex.sh data/tajson/hindi/ data/index_hindi") 28 | -------------------------------------------------------------------------------- /data/mturk.csv: -------------------------------------------------------------------------------- 1 | DOCID,HTMLTEXT,LABEL1,LABEL2,LABEL3,LANGUAGE 2 | id1,This is some cool stuff right here .,LOC,PER,ORG,English -------------------------------------------------------------------------------- /data/ta/eng/ta-0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-0 -------------------------------------------------------------------------------- /data/ta/eng/ta-1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-1 -------------------------------------------------------------------------------- /data/ta/eng/ta-10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-10 -------------------------------------------------------------------------------- /data/ta/eng/ta-11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-11 -------------------------------------------------------------------------------- /data/ta/eng/ta-12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-12 -------------------------------------------------------------------------------- /data/ta/eng/ta-13: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-13 -------------------------------------------------------------------------------- /data/ta/eng/ta-14: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-14 -------------------------------------------------------------------------------- /data/ta/eng/ta-15: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-15 -------------------------------------------------------------------------------- /data/ta/eng/ta-16: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-16 -------------------------------------------------------------------------------- /data/ta/eng/ta-17: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-17 -------------------------------------------------------------------------------- /data/ta/eng/ta-18: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-18 -------------------------------------------------------------------------------- /data/ta/eng/ta-19: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-19 
-------------------------------------------------------------------------------- /data/ta/eng/ta-2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-2 -------------------------------------------------------------------------------- /data/ta/eng/ta-20: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-20 -------------------------------------------------------------------------------- /data/ta/eng/ta-21: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-21 -------------------------------------------------------------------------------- /data/ta/eng/ta-22: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-22 -------------------------------------------------------------------------------- /data/ta/eng/ta-23: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-23 -------------------------------------------------------------------------------- /data/ta/eng/ta-24: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-24 -------------------------------------------------------------------------------- /data/ta/eng/ta-3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-3 -------------------------------------------------------------------------------- /data/ta/eng/ta-4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-4 -------------------------------------------------------------------------------- /data/ta/eng/ta-5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-5 -------------------------------------------------------------------------------- /data/ta/eng/ta-6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-6 -------------------------------------------------------------------------------- /data/ta/eng/ta-7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-7 -------------------------------------------------------------------------------- /data/ta/eng/ta-8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-8 -------------------------------------------------------------------------------- /data/ta/eng/ta-9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/data/ta/eng/ta-9 -------------------------------------------------------------------------------- /data/txt/eng/1.txt: 
-------------------------------------------------------------------------------- 1 | US President Donald Trump is seeking to boost defence spending by 10% in his proposed budget plan for 2018 . 2 | The blueprint will increase defence spending by $54bn ( £43bn ) but seeks to recoup that sum through deep cuts elsewhere , including to foreign aid . 3 | Mr Trump's plan leaves large welfare programmes untouched , despite Republican calls for reform . 4 | The president has consulted government agencies about his plans and will present his budget to Congress in May . 5 | Between now and then , he needs to identify where the agencies can make savings and work out what he does with tax reform . 6 | Republican John McCain said the $603bn defence budget - which White House officials outlined - would be insufficient . 7 | Speaking at the White House during a meeting with state governors on Monday morning , Mr Trump said : " We're going to do more with less and make the government lean and accountable . " 8 | The president , who vowed to increase military spending and preserve welfare programmes during his campaign , said the budget will focus on " military , safety , economic development " . 9 | " It will include an historic increase in defence spending to rebuild the depleted military of the United States of America at a time we most need it , " he said . -------------------------------------------------------------------------------- /data/txt/eng/2.txt: -------------------------------------------------------------------------------- 1 | In early 2012, Irish journalist Jonathan Spollen was visiting India - and then he vanished. 2 | In a series of interviews in the build-up to the anniversary of his disappearance, his friends and family have given their take on a case that has become no less mysterious over time. 3 | "Some people say to me: 'Would it not be easier if you had closure?'" Lynda Spollen says. 4 | "But closure is my most hated word in the dictionary." 
5 | "Closure" is a way for people to ask her indirectly if she should move on, whether she should now give up hope of finding her son Jonathan. 6 | His last known conversation was on the phone with Lynda on 3 February 2012. 7 | She was in Dublin; he was travelling in northern India. 8 | They spoke for just under six minutes; not long, given the fact they could at times happily chat by phone for more than two hours. 9 | Jonathan told his mum he had changed his mind about travelling to Delhi. 10 | He would instead remain in Rishikesh, in the foothills of the Himalayas, to go on a short trek. 11 | He was reported missing on this date five years ago. 12 | In school, Jonathan Spollen was a natural joker, perhaps compensating for his being shorter than most of those around him. 13 | But much to Lynda's surprise, he later turned into, they both acknowledged, "a bit of a nerd" - one who went on to take an interest in politics and Middle East affairs. 14 | When he went missing, aged 28, he had not had much of a break since school. 15 | From a politics degree in Dublin, he went on to a Masters in Middle Eastern Studies in London, before moving to work as a journalist in Cairo, Abu Dhabi (where I worked with him), then Hong Kong. 16 | It was only natural, perhaps, that he became a journalist - his curiosity towards the world was reflected in his writing, and his affability meant he had no problem getting people to talk to him. 17 | In 2010, in an article written after a visit to Indian-administered Kashmir, he expressed his fear he was idealising the simple life he had witnessed there. 18 | "The whole experience, in fact, was as confounding to my world view as it was enjoyable," he wrote, "producing in me each day a lovely sort of confusion." 19 | When people speak about Jonathan - people who knew him at different points in his life - they all mention his compassion. 
20 | As a child growing up in Ranelagh, a middle-class suburb in the south of Dublin, he was especially caring towards his dying grandmother: as she hallucinated mice running along her curtains, he understood what he had to do, and carefully removed the invisible mice before putting them in a bin. 21 | While in Beirut airport in his mid-20s, he saw a Filipina maid being scolded by officials for bringing along too much luggage, so he paid her $300 fine on the spot. 22 | Later, when he had moved to Hong Kong, he encountered a woman sleeping on a park bench and in distress having lost her identity papers. 23 | Jonathan offered her his room without hesitation. 24 | Jonathan and Hong Kong were not a great fit, and he left his job as a copy editor there late in 2011. 25 | As he travelled first to Nepal, then on a journey through India, he was contemplating what he would do next. 26 | "This was very much a time for him to decide whether he was going to continue in print journalism or go into documentaries," Lynda says. 27 | "Time to reassess things and make a plan." 28 | By early February 2012, having been in the country a little more than two months, Jonathan's Indian visa was a few weeks from ending, and he was based in Rishikesh, a few hours north of Delhi. 29 | The fact that he disappeared here, of all places, may not have been pure chance. 30 | Rishikesh sells itself as the yoga capital of the world - and Westerners are among the biggest customers 31 | In early December 2011, Jonathan arrived in Rishikesh, a small, laid-back city of about 100,000 people surrounded by lush forests in the northern Indian state of Uttarakhand. 32 | He was not alone - the state attracts more than 300,000 visitors a year, a third of whom come from abroad. 33 | Rishikesh, which pitches itself as the yoga capital of the world, is its most-visited city and is dotted with hundreds of yoga studios and ashrams. 
34 | Its most famous visitors were The Beatles, in 1968, when they came to learn transcendental meditation under the Mahareshi Mahesh Yogi. 35 | The group's month in the city was one of the most creative periods in their history - while there, they wrote songs that later featured on The White Album and Abbey Road. 36 | Whether Jonathan was interested in a spiritual journey at this point in his life is one of the biggest questions at the centre of his case. 37 | Lynda is sure he was not going down this path; the subject never came up in their long conversations. 38 | Yes, Jonathan was interested in comparing different philosophies. 39 | Yes, he had once had a passing interest in Buddhism. 40 | But he had not expressed an interest to his mother in pursuing this journey in India, had not sought a guru, and had not been meditating, despite her encouraging him to do so. 41 | In their last conversation on 3 February 2012, he had told his mother he would be away on a short trek for a few weeks, one he wanted to do alone. 42 | "I want to do it on my own, kind of a spiritual thing," he said, in a manner Lynda describes as lighthearted. -------------------------------------------------------------------------------- /data/txt/eng/3.txt: -------------------------------------------------------------------------------- 1 | Olga Korbut, the Soviet gymnast who charmed the world at the 1972 Munich Olympics, has sold her medal haul and other trophies in a US auction. 2 | The sale of seven lots - including two golds and a silver from the Munich Games - fetched $183,300 (£147,000) for Korbut. 3 | The most expensive item was her team gold ($66,000). 4 | Born in Belarus, she moved to the US in 1991. Now 61, she lives in Arizona. 5 | In 1972, at the height of the Cold War, Korbut's breathtaking gymnastics won millions of admirers in the West when she was just 17. 6 | Just 1.5m tall (4ft 11), she was nicknamed "the Sparrow from Minsk". 
7 | Her captivating smile and quirky charm helped to turn her into an Olympic legend. 8 | She won three golds (team, balance beam and floor exercise) and a silver at the 1972 Munich Olympics. 9 | In 1976 she won another gold and a silver at the Montreal Games. 10 | Heritage Auctions, organiser of the sale, says "there is hardly a gymnast alive who doesn't credit this tiny force of nature for the explosion of the sport's popularity on a global level". 11 | The sale items included one of her performance leotards, her 1972 BBC Sports Personality of the Year Award, various Soviet medals and a sports magazine cover signed by her. 12 | The Korbut Flip was a spectacular trick that she performed on the asymmetric bars - a trick now banned from the Olympics as it is considered too dangerous. 13 | The flip can be seen on YouTube - it begins with a somersault on the top bar. 14 | From 1978 to 2000 Korbut was married to Leonid Bortkevich, a famous Soviet-era folk singer, with whom she moved to the US after the USSR's collapse in 1991. 15 | She has a son called Richard. -------------------------------------------------------------------------------- /data/txt/eng/4.txt: -------------------------------------------------------------------------------- 1 | This is my standout look of the night. 2 | We've all been so excited to see Raf Simons' first work with Calvin Klein, and this white dress just perfectly encapsulates his directional take on femininity and glamour. 3 | Here his signature cool clean lines, with unexpected details like the cut out and the squared off train, are heightened by the stunning crust of sequins. 4 | And Naomie hasn't gone overboard with the details - simple hair, asymmetric crystal suede sandals, also by Calvin Klein, and Bulgari jewellery. 5 | We often imagine that Oscar dresses should look like an old-fashioned fairy tale princess fantasy, and I love the idea that the modern fairy tale princess ideal is more pared back. 
6 | Naomie looks amazing, and also entirely like herself - not over the top but incredibly glamorous. 7 | I also love that Raf reached out to dress not just Naomi but her Moonlight co-stars, having seen the film and been blown away by it. 8 | It's a great example of fashion and Hollywood being inspired by each other. 9 | Obviously it's great that Emma Stone won an Oscar while dressed a bit like an Oscar, but this dress is also just perfect - for her, the film she is nominated for and for the ceremony. 10 | One of Riccardo Tisci's last designs before he stepped down from Givenchy, it exudes gilded era old-school glamour, just like La La Land, and the ombre fringing brings a playfulness that fits with Emma's whole vibe. 11 | Great jewellery too from Tiffany & Co, and a dash of politics with the addition of a gold Planned Parenthood pin. 12 | Pharrell is a guy who is always having fun with fashion, but after years where his style has been dominated by the big hat, it was really nice to see him graduate to looking unquestionably stylish - while still pushing the envelope. 13 | This look is head-to-toe-to-necklace Chanel - not a brand you often see men wearing on the red carpet - but entirely in keeping with Pharrell's creative energy. -------------------------------------------------------------------------------- /data/txt/fas/1.txt: -------------------------------------------------------------------------------- 1 | در جریان حمله شبه‌نظامیان حامی دولت نیکاراگوئه به کلیسایی که ده‌ها معترض در آن پناه گرفته بودند دست کم یک دانشجو کشته شده است . 2 | دانشجویان پس از آن وارد کلیسا شدند که در جریان تظاهرات اعتراضی خود با حمله شبه‌نظامیان مسلح روبرو شدند و به کلیسا پناه بردند اما به محاصره در آمدند . 3 | دانشجویان از دیشب ( جمعه ) در این کلیسا که در یکی از مناطق مسکونی بومیان واقع شده گیر افتاده بودند . 4 | آنها به همراه هزاران معترض دیگر خواستار استعفای دانیل اورتگا ، رئیس جمهوری هستند . 5 | بیش از ۳۰۰ نفر تاکنون در جریان این اعتراض‌های ضد دولتی کشته شده‌اند . 
6 | کلیسایی که دانشجویان در آن محاصره شده بودند در نزدیکی دانشگاه اصلی ماناگوئه ، پایتخت است و گزارش شده که دست کم ۱۵۰ دانشجو به همراه کشیش و روزنامه‌نگاران در داخل این کلیسا پناه گرفته بودند . -------------------------------------------------------------------------------- /data/txt/fas/2.txt: -------------------------------------------------------------------------------- 1 | این روزها و با گذشت چند دهه از سیاست " جهانی‌شدن " ، ملی‌گرایی و دیگرهراسی به دستاویز مناسبی برای برخی سیاست‌مداران جویای قدرت بدل شده است . 2 | در اروپا و آمریکا، برخی سیاستمداران عمدتا راست‌گرا پیکان حمله را به سوی "آن مردم دیگر" که به زعم آنها مقصر تمامی مشکلات این روزهای "مردم ما" هستند، نشانه رفته‌اند. نیاز اولیه‌ای مانند غذا و اندازه سفره مردم، یکی از موارد مهم این تنش‌ها است که "ما مردم" و "آن مردم دیگر" را رو در روی هم قرار داده است . 3 | شاید جالب باشد بدانید که همین روزها یکی از چالش‌های پیش‌روی برگزیت ( خروج بریتانیا از اتحادیه اروپا ) ، عواقب این خروج بر ساز و کار تامین مواد غذایی مردم بریتانیا است. مثلا کمبود منابع و نیروی کار که از قضای روزگار اغلب توسط همان " مردم دیگر " تامین می‌شود ، در نهایت موجب افزایش قیمت‌ها خواهد شد . -------------------------------------------------------------------------------- /data/txt/fas/3.txt: -------------------------------------------------------------------------------- 1 | علی اکبر ولایتی ، مشاور امور بین الملل رهبر ایران می‌گوید ولادیمیر پوتین ، رئیس جمهوری روسیه وعده داده که در صنعت نفت و گاز ایران سرمایه‌گذاری کند . 2 | آقای ولایتی مبلع این سرمایه گذاری را تا ۵۰ میلیارد دلار اعلام کرده و گفته است که این پول می‌تواند جایگزین شرکت‌های غربی شود که به دلیل تحریم‌های آمریکا از ایران رفته‌اند . 3 | روسیه در دور قبلی تحریم‌ها هم با ایران مراودات اقتصادی داشت، اما شواهدی از تاثیر قابل توجه روابط اقتصادی ایران و روسیه بر آثار تحریم‌های هسته‌ای ایران در دست نیست. از طرفی، نگاهی به مقصد سرمایه‌گذاری‌های روسیه در سال‌های اخیر نشان می‌دهد که این کشور سرمایه‌گذاری قابل توجهی در خاورمیانه نداشته و بیشتر پول این کشور به اروپا سرازیر می‌شود . 
-------------------------------------------------------------------------------- /data/txt/rus/1.txt: -------------------------------------------------------------------------------- 1 | Матч за бронзу нельзя называть утешительным финалом . 2 | Утирать слезы и делиться носовыми платочками нужно с теми , кто не попал на элитную вечеринку , кто вылетел до стадии полуфиналов или просто не доехал до России . 3 | В Питере же прошел малый финал , прелюдия к основному действу , которое начнется в 18:00 по московскому времени 15 июля . 4 | Мы знаем , что тысячи англичан после четвертьфинала в восторге ринулись скупать билеты до Москвы , на исторический полуфинал чемпионата мира . 5 | И они на сто процентов рассчитывали остаться в столице , не уезжать в Питер , но у Марио Манджукича было свое мнение на этот счет . 6 | Так что из-за него на «Санкт-Петербурге» белых проплешин было больше . 7 | Почему проплешин ? 8 | Потому что трибуны были заполнены представителями слишком многих национальностей , а больше всех было россиян . -------------------------------------------------------------------------------- /data/txt/rus/2.txt: -------------------------------------------------------------------------------- 1 | Президент России Владимир Путин остался удовлетворен тем , как чемпионат мира повысил имидж России за рубежом , и готов разработать облегченный визовый режим для болельщиков , которые захотят вернуться в страну . 2 | Об этом он рассказал на открытии гала-концерта « Ночь в Большом » в субботу , 14 июля , в Большом театре , передает ТАСС . 3 | 4 | « Мы признательны за миллионы добрых слов , сказанных гостями чемпионата в адрес России и нашего народа , рады , что им понравилось его гостеприимство и открытость , природа , культура , традиции нашей большой страны . 5 | Мы рады , что наши гости все увидели своими глазами , что рухнули мифы и предубеждения » , — заявил он . 
-------------------------------------------------------------------------------- /data/txt/rus/3.txt: -------------------------------------------------------------------------------- 1 | Американский телеканал CNBC со ссылкой на свои источники рассказал об успешных испытаниях в России гиперзвуковой ракеты « Кинжал » . 2 | По данным автора материала , было проведено 12 тестов — во время последнего снаряд поразил цель на расстоянии чуть более 800 километров . 3 | Канал уточняет , что ракеты , вероятно , примут на вооружение в 2020 году . 4 | Отмечается также , что США пока ничего не могут противопоставить « Кинжалу » . -------------------------------------------------------------------------------- /data/txt2column.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | infolder = sys.argv[1] 4 | outfolder = sys.argv[2] 5 | 6 | os.mkdir(outfolder) 7 | 8 | for fname in os.listdir(infolder): 9 | with open(infolder + "/" + fname) as f: 10 | lines = f.readlines() 11 | with open(outfolder + "/" + fname, "w") as out: 12 | for sent in lines: 13 | toks = sent.split() 14 | for tok in toks: 15 | out.write("{} O\n".format(tok)) 16 | out.write("\n") 17 | 18 | -------------------------------------------------------------------------------- /data/txt2tajson.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import codecs 3 | import json 4 | import os 5 | import sys 6 | 7 | # This file converts a folder full of text files (one sentence per line, whitespace tokenized) 8 | # into a folder of tajson files. 9 | 10 | # Usage: 11 | # $ txt2tajson.py input_folder output_folder 12 | 13 | 14 | def lines2json(lines, fname): 15 | """ This takes a set of lines (read from some text file) 16 | and converts them into a JSON TextAnnotation. This assumes 17 | that there is one sentence per line, whitespace tokenized. 
""" 18 | 19 | doc = {} 20 | doc["corpusId"] = "" 21 | doc["id"] = fname 22 | 23 | sents = {} 24 | sentends = [] 25 | tokens = [] 26 | 27 | for sent in lines: 28 | toks = sent.split() 29 | tokens.extend(toks) 30 | sentends.append(len(tokens)) 31 | 32 | doc["text"] = " ".join(tokens) 33 | doc["tokens"] = tokens 34 | 35 | sents["sentenceEndPositions"] = sentends 36 | sents["score"] = 1.0 37 | sents["generator"] = "txt2tajson.py" 38 | doc["sentences"] = sents 39 | doc["views"] = [] 40 | 41 | return doc 42 | 43 | 44 | def convert(infolder, outfolder): 45 | if not os.path.exists(outfolder): 46 | os.mkdir(outfolder) 47 | 48 | for fname in os.listdir(infolder): 49 | with open(infolder + "/" + fname) as f: 50 | lines = f.readlines() 51 | with codecs.open(outfolder + "/" + fname, "w", encoding="utf-8") as out: 52 | doc = lines2json(lines, fname) 53 | json.dump(doc, out, sort_keys=True, indent=4, ensure_ascii=False) 54 | 55 | 56 | if __name__ == "__main__": 57 | if len(sys.argv) < 2: 58 | print("Usage: txt2tajson.py input_folder output_folder") 59 | exit(1) 60 | 61 | infolder = sys.argv[1] 62 | outfolder = sys.argv[2] 63 | convert(infolder, outfolder) 64 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | edu.illinois.cs.cogcomp 7 | talen 8 | 0.1.0 9 | 10 | 11 | 1.8 12 | 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 2.0.1.RELEASE 18 | 19 | 20 | 21 | 22 | CogcompSoftware 23 | CogcompSoftware 24 | http://cogcomp.org/m2repo/ 25 | 26 | 27 | 28 | 29 | 30 | 31 | org.springframework.boot 32 | spring-boot-starter-web 33 | 34 | 35 | 36 | org.springframework.boot 37 | spring-boot-starter-thymeleaf 38 | 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-devtools 43 | true 44 | 45 | 46 | 47 | nz.net.ultraq.thymeleaf 48 | thymeleaf-layout-dialect 49 | 2.0.5 50 | 51 | 52 | 53 | org.thymeleaf 54 | 
thymeleaf-spring5 55 | 3.0.13.RELEASE 56 | 57 | 58 | 59 | org.thymeleaf 60 | thymeleaf 61 | 3.0.9.RELEASE 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | edu.illinois.cs.cogcomp 73 | illinois-core-utilities 74 | 4.0.4 75 | 76 | 77 | slf4j-log4j12 78 | org.slf4j 79 | 80 | 81 | 82 | 83 | edu.illinois.cs.cogcomp 84 | illinois-corpusreaders 85 | 3.1.19 86 | 87 | 88 | slf4j-log4j12 89 | org.slf4j 90 | 91 | 92 | 93 | 94 | 95 | 96 | org.apache.lucene 97 | lucene-core 98 | 7.1.0 99 | 100 | 101 | 102 | org.apache.lucene 103 | lucene-analyzers-common 104 | 6.4.1 105 | 106 | 107 | 108 | org.apache.lucene 109 | lucene-queryparser 110 | 6.4.1 111 | 112 | 113 | 114 | org.apache.commons 115 | commons-lang3 116 | 3.0 117 | 118 | 119 | 120 | org.apache.commons 121 | commons-text 122 | 1.2 123 | 124 | 125 | 126 | org.springframework.boot 127 | spring-boot-starter-security 128 | 129 | 130 | 131 | commons-cli 132 | commons-cli 133 | 1.4 134 | 135 | 136 | 137 | cz.jirutka.unidecode 138 | unidecode 139 | 1.0.1 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | org.springframework.boot 149 | spring-boot-maven-plugin 150 | 2.0.2.RELEASE 151 | 152 | true 153 | 154 | 155 | 156 | 157 | 158 | repackage 159 | 160 | 161 | spring-boot 162 | 163 | io.github.mayhewsw.utils.TalenCLI 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-compiler-plugin 174 | 3.5.1 175 | 176 | 1.8 177 | 1.8 178 | 179 | 180 | 181 | org.apache.maven.plugins 182 | maven-dependency-plugin 183 | 2.4 184 | 185 | 186 | copy-dependencies 187 | package 188 | copy-dependencies 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /scripts/buildindex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # generate/update the binary files and dependencies 3 | 4 | # Classpath 5 | cpath="target/classes:target/dependency/*:config" 6 | 7 | INPATH=$1 8 | OUTPATH=$2 
9 | 10 | java -classpath ${cpath} -Xmx16g io.github.mayhewsw.TextFileIndexer -infolder $INPATH -indexfolder $OUTPATH -test 11 | -------------------------------------------------------------------------------- /scripts/install-cli.sh: -------------------------------------------------------------------------------- 1 | mvn package spring-boot:repackage 2 | 3 | INSTALL_DIR=$HOME/software/talen/ 4 | 5 | mkdir -p $INSTALL_DIR 6 | 7 | cp target/talen-0.1.0-spring-boot.jar $INSTALL_DIR 8 | 9 | echo "java -jar $INSTALL_DIR/talen-0.1.0-spring-boot.jar -indir \$@" > $INSTALL_DIR/talen-cli 10 | 11 | chmod +x $INSTALL_DIR/talen-cli 12 | 13 | echo "Don't forget to add $INSTALL_DIR to your path!" 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /scripts/preparedata.py: -------------------------------------------------------------------------------- 1 | #!/home/stephen/anaconda3/bin/python 2 | import csv 3 | import os 4 | 5 | LABELS = ["LOC", "PER", "ORG"] 6 | LANGUAGE = "English" 7 | 8 | 9 | def testfile(infile, outfile): 10 | """This method takes an ordinary input csv file 11 | and writes to a test html file. 
This is just 12 | to test locally before submitting to mechanical turk""" 13 | 14 | with open("src/main/resources/templates/mturk/mturkTemplate.html") as f: 15 | template = f.read() 16 | 17 | print("Reading just the header and first row of", infile) 18 | with open(infile) as csvfile: 19 | reader = csv.reader(csvfile, dialect="excel") 20 | 21 | header = next(reader) 22 | row1 = next(reader) 23 | 24 | d = dict(zip(header, row1)) 25 | 26 | for k in header: 27 | v = d[k] 28 | template = template.replace("${{{}}}".format(k), v) 29 | 30 | print("Writing to", outfile) 31 | with open(outfile, "w") as out: 32 | out.write("") 33 | out.write("\n") 34 | out.write(template) 35 | out.write("\n") 36 | out.write("") 37 | 38 | 39 | def preparedata(folder, outname): 40 | """This method takes as input a folder of text files, 41 | each with one sentence per line, and 42 | writes to a csv file with appropriate html in each 43 | field.""" 44 | 45 | header = ["DOCID","HTMLTEXT","LABEL1","LABEL2","LABEL3","LANGUAGE"] 46 | 47 | fnames = sorted(os.listdir(folder)) 48 | 49 | print("Writing to", outname) 50 | out = open(outname, "w", newline='') 51 | out.write(",".join(header) + "\n") 52 | 53 | for fname in fnames: 54 | with open(os.path.join(folder, fname)) as f: 55 | lines = f.readlines() 56 | 57 | # First DOCID 58 | outwrite = [] 59 | docid = fname 60 | for c in [".", ":"]: 61 | docid = docid.replace(c, "_") 62 | outwrite.append(docid) 63 | 64 | HTMLTEXT = "" 65 | i = 0 66 | for line in lines: 67 | line = line.strip() 68 | line = line.replace("\"", "\"\"") 69 | 70 | htmlformat = "{1}" 71 | 72 | HTMLTEXT += "

".format(i) 73 | for tok in line.split(): 74 | tokid = "tok-{}-{}".format(docid, i) 75 | HTMLTEXT += htmlformat.format(tokid, tok) 76 | i += 1 77 | HTMLTEXT += "

" 78 | 79 | outwrite.append("\"" + HTMLTEXT + "\"") 80 | outwrite.extend(LABELS) 81 | outwrite.append(LANGUAGE) 82 | out.write(",".join(outwrite) + "\n") 83 | 84 | 85 | out.close() 86 | 87 | 88 | if __name__ == "__main__": 89 | import argparse 90 | parser = argparse.ArgumentParser(description="Prepare data for mechanical turk submission") 91 | parser.add_argument("method", choices=["preparedata", "testfile"]) 92 | parser.add_argument("infile") 93 | parser.add_argument("outfile") 94 | 95 | args = parser.parse_args() 96 | 97 | locals()[args.method](args.infile, args.outfile) 98 | 99 | -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ ! -f config/users.txt ]; then 3 | echo "config/users.txt not found... creating with default username and password (user/user)"; 4 | echo "user user" > config/users.txt; 5 | fi 6 | 7 | mkdir -p logs 8 | mkdir -p dicts 9 | 10 | mvn spring-boot:run 11 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/Application.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class Application { 8 | 9 | public static void main(String[] args) { 10 | 11 | SpringApplication app = new SpringApplication(Application.class); 12 | app.run(args); 13 | 14 | 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/ConfigFile.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | 5 | import 
/**
 * This class represents the exact names of each element in a config file.
 *
 * The values are held twice: once in the inherited {@link Properties} table
 * (filled by {@link #loadProperties(BufferedReader)}) and once in the typed
 * fields below, which is what the getters and setters operate on.
 */
public class ConfigFile extends Properties {
    private String folderpath;
    private String name;
    private String labels;
    private String dict;
    private String format;
    private String mode;
    private String indexpath;
    private String terms;

    public List<String> allowableentries;

    public ConfigFile() {
        // Nothing to do: fields are filled by loadProperties(...) or by the setters.
    }

    /**
     * Reads a properties-formatted config file and copies the known keys into
     * the typed fields. Missing required keys (folderpath, labels, name, mode)
     * are reported on stderr but do not abort loading.
     *
     * @param var1 reader positioned at the start of the config file
     * @throws IOException if the underlying reader fails
     */
    public void loadProperties(BufferedReader var1) throws IOException {
        super.load(var1);

        // Echo the keys that were found in the file.
        System.out.println(String.join(" ", this.stringPropertyNames()));

        // Required entries. Warnings are emitted in the same order as before:
        // folderpath, labels, name, mode.
        folderpath = readRequired("folderpath");
        labels = readRequired("labels");
        name = readRequired("name");
        mode = readRequired("mode");

        // Optional entries.
        dict = this.getProperty("dict");
        format = this.getProperty("format");
        indexpath = this.getProperty("indexpath");
        terms = this.getProperty("terms");
    }

    /** Looks up a required key and warns on stderr when it is absent. */
    private String readRequired(String key) {
        String value = this.getProperty(key);
        if (value == null) {
            System.err.println(key + " must be non-null!");
        }
        return value;
    }

    public String getIndexpath() {
        return indexpath;
    }

    public String getTerms() {
        return terms;
    }

    public void setIndexpath(String indexpath) {
        this.indexpath = indexpath;
    }

    public void setTerms(String terms) {
        this.terms = terms;
    }

    /**
     * Builds the config file name for this dataset: "doc-NAME.txt" when the
     * mode is "document", "sent-NAME.txt" for every other mode.
     *
     * The comparison is null-safe: a config whose mode was never set (which
     * loadProperties tolerates) no longer throws a NullPointerException here
     * and is treated like any other non-document mode.
     */
    public String getFname() {
        String prefix = "document".equals(mode) ? "doc-" : "sent-";
        return prefix + name + ".txt";
    }

    public String getFolderpath() {
        return folderpath;
    }

    public String getName() {
        return name;
    }

    public String getLabels() {
        return this.labels;
    }

    public void setLabels(String labels) {
        this.labels = labels;
    }

    public String getDict() {
        return dict;
    }

    public String getFormat() {
        return format;
    }

    public String getMode() {
        return mode;
    }

    public void setFolderpath(String folderpath) {
        this.folderpath = folderpath;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setDict(String dict) {
        this.dict = dict;
    }

    public void setFormat(String format) {
        this.format = format;
    }

    public void setMode(String mode) {
        this.mode = mode;
    }

    /**
     * Serializes the typed fields as "key TAB value" lines, skipping entries
     * that are null or blank. A LinkedHashMap keeps the lines in a fixed,
     * predictable order instead of relying on HashMap iteration order.
     */
    @Override
    public String toString() {
        Map<String, String> entries = new LinkedHashMap<>();
        entries.put("folderpath", folderpath);
        entries.put("name", name);
        entries.put("labels", labels);
        entries.put("dict", dict);
        entries.put("format", format);
        entries.put("mode", mode);
        entries.put("indexpath", indexpath);
        entries.put("terms", terms);

        StringJoiner sj = new StringJoiner("\n");
        for (Map.Entry<String, String> entry : entries.entrySet()) {
            String value = entry.getValue();
            if (value != null && value.trim().length() > 0) {
                sj.add(entry.getKey() + "\t" + value);
            }
        }
        return sj.toString();
    }
}
import edu.illinois.cs.cogcomp.core.io.LineIO; 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.Comparator; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.stream.Collectors; 13 | 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import static java.util.Comparator.comparing; 18 | import static java.util.stream.Collectors.toList; 19 | 20 | /** 21 | * Created by mayhew2 on 1/27/17. 22 | */ 23 | public class Dictionary extends HashMap> { 24 | 25 | private static Logger logger = LoggerFactory.getLogger(Dictionary.class); 26 | public String dictpath; 27 | public String dictname; 28 | private List> newpairs; 29 | 30 | /** 31 | * Get the all pairs of elements from two lists. In python this is an import. :( 32 | * @param alist 33 | * @param blist 34 | * @return a list of pairs. 35 | */ 36 | private static List> product(String[] alist, String[] blist){ 37 | 38 | List> prod = new ArrayList<>(); 39 | 40 | for(String a : alist){ 41 | for(String b : blist){ 42 | prod.add(new Pair<>(a,b)); 43 | } 44 | } 45 | return prod; 46 | } 47 | 48 | public boolean isEmpty(){ 49 | return this.keySet().isEmpty(); 50 | } 51 | 52 | public String getName(){ 53 | return this.dictname; 54 | } 55 | 56 | /** 57 | * Because I got tired of writing this so many times. 58 | * @param m 59 | * @param p 60 | */ 61 | private static void addOrIncrement(HashMap, Integer> m, Pair p){ 62 | // In python, this is a defaultdict. 63 | if(!m.containsKey(p)){ 64 | m.put(p, 0); 65 | } 66 | m.put(p, m.get(p)+1); 67 | } 68 | 69 | /** 70 | * Syntactic sugar. Just calls add(key, def, isnew) with isnew set to true 71 | * @param key 72 | * @param def 73 | */ 74 | public void add(String key, String def){ 75 | add(key, def, true); 76 | } 77 | 78 | /** 79 | * Add a definition to this dictionary. 
80 | * @param key 81 | * @param def 82 | * @param isnew defines whether or not it should be added to the user dictionary list 83 | */ 84 | public void add(String key, String def, boolean isnew){ 85 | if(!this.containsKey(key)){ 86 | this.put(key, new ArrayList<>()); 87 | } 88 | // always add to the front of the list. 89 | this.get(key).add(0, def); 90 | 91 | if(isnew) { 92 | this.newpairs.add(new Pair<>(key, def)); 93 | } 94 | 95 | } 96 | 97 | /** 98 | * This saves the user-generated pairs to file. 99 | */ 100 | public void save(String dataname, String username) throws IOException { 101 | List outlines = newpairs.stream().map(p -> p.getFirst() + "\t" + p.getSecond()).collect(toList()); 102 | LineIO.write(getUserDictPath(dataname, username), outlines); 103 | } 104 | 105 | /** 106 | * This defines a standard for the user dictionary path 107 | * @param dataname 108 | * @param username 109 | * @return 110 | */ 111 | public String getUserDictPath(String dataname, String username){ 112 | String userdictpath = String.format("dicts/%s-%s.txt", dictname, username); 113 | return userdictpath; 114 | } 115 | 116 | /** 117 | * This creates a default name based on the dataname and username. 118 | */ 119 | public Dictionary(String dataname, String username){ 120 | this(dataname, null, username); 121 | } 122 | 123 | public Dictionary(String dictname, String dictpath, String username) { 124 | 125 | 126 | this.newpairs = new ArrayList<>(); 127 | 128 | this.dictpath = dictpath; 129 | this.dictname = dictname; 130 | 131 | ArrayList dictlines = null; 132 | 133 | if(dictpath != null) { 134 | try { 135 | dictlines = LineIO.read(dictpath); 136 | 137 | // I want a dictionary that maps from foreign->english. 138 | 139 | // This keeps track of pair counts, so we can sort according to popularity. 
140 | HashMap, Integer> pairs = new HashMap<>(); 141 | 142 | logger.info("Loading dictionary..."); 143 | for (String line : dictlines) { 144 | String[] sline = line.split("\t"); 145 | if(sline.length < 2) continue; 146 | 147 | String f = sline[0]; 148 | String e = sline[1]; 149 | 150 | Pair ef = new Pair<>(e, f); 151 | Pair eflower = new Pair<>(e.toLowerCase(), f.toLowerCase()); 152 | 153 | addOrIncrement(pairs, ef); 154 | addOrIncrement(pairs, eflower); 155 | 156 | for (Pair p : product(e.split(" "), f.split(" "))) { 157 | addOrIncrement(pairs, p); 158 | Pair plower = new Pair<>(p.getFirst().toLowerCase(), p.getSecond().toLowerCase()); 159 | addOrIncrement(pairs, plower); 160 | } 161 | 162 | // actually add to dictionary. 163 | this.add(f, e, false); 164 | } 165 | 166 | // this is now f2e. 167 | for (String k : this.keySet()) { 168 | // scores gathers 169 | 170 | Comparator> comparator = Comparator.comparing(Pair::getSecond); 171 | 172 | List sortedpairs = this.get(k).stream() 173 | .map(w -> new Pair<>(w, k)) 174 | .map(p -> new Pair<>(p.getFirst(), pairs.get(p))) 175 | .sorted(comparator.reversed()) 176 | .map(p -> p.getFirst()) 177 | .collect(toList()); 178 | 179 | this.put(k, sortedpairs); 180 | } 181 | } catch (IOException e) { 182 | // an empty dictionary is a graceful failure. 183 | logger.info("Dictionary file not found: " + dictpath + ". Dictionary is empty."); 184 | } 185 | } 186 | 187 | // Also read the user generated pairs. 188 | ArrayList userlines = new ArrayList<>(); 189 | String userdictpath = this.getUserDictPath(dictname, username); 190 | try { 191 | userlines = LineIO.read(userdictpath); 192 | } catch (IOException e) { 193 | // an empty dictionary is a graceful failure. 194 | logger.info("User dictionary file not found: " + dictpath + "." + username +". 
User dictionary is empty."); 195 | } 196 | for(String line : userlines) { 197 | String[] sline = line.split("\t"); 198 | String f = sline[0]; 199 | String e = sline[1]; 200 | 201 | this.add(f, e, true); 202 | } 203 | 204 | 205 | logger.info("Done loading dictionary."); 206 | } 207 | 208 | 209 | public static void main(String[] args) throws IOException { 210 | //Dictionary d = new Dictionary("whatevs", "/shared/experiments/mayhew2/lexicons/spa-eng.masterlex.txt.gz"); 211 | Dictionary d = new Dictionary("whatevs", "/home/mayhew/IdeaProjects/ner-annotation/bendict.txt", "testuser"); 212 | } 213 | 214 | 215 | } 216 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/FeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.IntPair; 4 | import edu.illinois.cs.cogcomp.core.datastructures.Pair; 5 | import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; 6 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; 7 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; 8 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 9 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | import java.util.*; 13 | 14 | /** 15 | * Created by mayhew2 on 3/7/17. 16 | */ 17 | public class FeatureExtractor { 18 | 19 | /** 20 | * This adds a view called "feats" to the TextAnnotation. This view is based on 21 | * constituents from the NER view, which may change a good deal in other code. 
22 | * @param ta 23 | */ 24 | public static void extract(TextAnnotation ta){ 25 | 26 | View ner = ta.getView(ViewNames.NER_CONLL); 27 | 28 | View myfeats = new SpanLabelView("feats", ta); 29 | 30 | // TODO: this should all be in lowercase, but it messes up the pattern matching back in the ta 31 | for(Constituent c : ner.getConstituents()){ 32 | String surface = c.getTokenizedSurfaceForm(); 33 | 34 | IntPair span = c.getSpan(); 35 | 36 | // deal with padding. 37 | String prevword = "_"; 38 | String prevprevword = "_"; 39 | if(span.getFirst() > 0) { 40 | prevword = ta.getToken(span.getFirst() - 1); 41 | } 42 | if(span.getFirst() > 1){ 43 | prevprevword = ta.getToken(span.getFirst()-2); 44 | } 45 | 46 | myfeats.addConstituent(new Constituent("context-before=" + prevword, "feats", ta, span.getFirst(), span.getSecond())); 47 | myfeats.addConstituent(new Constituent("context-before=" + prevprevword + "_" + prevword, "feats", ta, span.getFirst(), span.getSecond())); 48 | 49 | // deal with padding. 
50 | String nextword = "_"; 51 | String nextnextword = "_"; 52 | if(span.getSecond() < ta.size() -1 ) { 53 | nextword = ta.getToken(span.getSecond() + 1); 54 | } 55 | if(span.getSecond() < ta.size()-2 ){ 56 | nextnextword = ta.getToken(span.getSecond()+2); 57 | } 58 | 59 | myfeats.addConstituent(new Constituent("context-after=" + nextword, "feats", ta, span.getFirst(), span.getSecond())); 60 | myfeats.addConstituent(new Constituent("context-after=" + nextword + "_" + nextnextword, "feats", ta, span.getFirst(), span.getSecond())); 61 | 62 | myfeats.addConstituent(new Constituent("full-string=" + surface.replaceAll(" ", "_"), "feats", ta, span.getFirst(), span.getSecond())); 63 | 64 | 65 | // for(String token : surface.split(" ")) { 66 | // myfeats.addConstituent(new Constituent("contains=" + token, "feats", ta, span.getFirst(), span.getSecond())); 67 | // } 68 | 69 | // if(!StringUtils.isAlpha(surface)){ 70 | // //feats.add(); 71 | // myfeats.addConstituent(new Constituent("nonalpha=" + surface, "feats", ta, span.getFirst(), span.getSecond())); 72 | // } 73 | 74 | 75 | } 76 | ta.addView("feats", myfeats); 77 | } 78 | 79 | /** 80 | * Given a TextAnnotation and the list of patterns (which is??), return suggestions. 81 | * @param ta 82 | * @param patterns 83 | * @return 84 | */ 85 | public static List findfeatfires(TextAnnotation ta, HashMap, Double> patterns){ 86 | List suggestions = new ArrayList<>(); 87 | 88 | for(Pair feat : patterns.keySet()){ 89 | String featname = feat.getFirst(); 90 | String label = feat.getSecond(); 91 | if(featname.startsWith("context-before")){ 92 | String context = featname.split("=")[1]; 93 | if(context.contains("_")){ 94 | context = context.replaceAll("_", " "); 95 | } 96 | 97 | // now find occurrences of this context in the ta. 
/**
 * A set of sentences that additionally remembers which of its members have
 * been annotated and how often each label type was recorded.
 *
 * Created by stephen on 6/29/17.
 */
public class Group extends HashSet<String> {

    // Members of this group that were added through addAnno.
    private HashSet<String> anno = new HashSet<>();

    // Label type -> number of times it was recorded via addType.
    private HashMap<String, Integer> typemap = new HashMap<>();

    /**
     * Add an annotated sentence: it becomes a member of the group, is marked
     * as annotated, and its label type is counted.
     * @param sent the sentence
     * @param type the label type recorded for it
     */
    public void addAnno(String sent, String type){
        add(sent);
        anno.add(sent);
        addType(type);
    }

    /** @return the sentences that were added through addAnno. */
    public HashSet<String> getAnno() {
        return anno;
    }

    /** Counts one more occurrence of the given label type. */
    public void addType(String tag){
        int seen = typemap.getOrDefault(tag, 0);
        typemap.put(tag, seen + 1);
    }

    /**
     * @return the label type with the highest count, or the string "null"
     *         when no type has been recorded.
     */
    public String maxType(){
        return typemap.keySet()
                .stream()
                .max((left, right) -> Integer.compare(typemap.get(left), typemap.get(right)))
                .orElse("null");
    }

}
/**
 * Orders string keys so that keys which parse as integers sort numerically
 * ("2" before "10") and come before all keys which do not parse; the
 * remaining keys sort lexicographically.
 *
 * The previous version fell back to plain string comparison whenever either
 * key was non-numeric. That is not a consistent ordering: "2" < "10"
 * (numeric), "10" < "1a" (string) and "1a" < "2" (string) form a cycle, which
 * breaks the Comparator contract that sorted collections rely on. Comparisons
 * between two numeric keys or between two non-numeric keys are unchanged.
 *
 * Created by mayhew2 on 5/4/17.
 */
public class KeyComparator implements Comparator<String> {

    @Override
    public int compare(String o1, String o2) {
        Integer n1 = parseOrNull(o1);
        Integer n2 = parseOrNull(o2);

        if (n1 != null && n2 != null) {
            return Integer.compare(n1, n2);
        }
        // Exactly one side is numeric: numeric keys sort first.
        if (n1 != null) {
            return -1;
        }
        if (n2 != null) {
            return 1;
        }
        return o1.compareTo(o2);
    }

    /** Returns the integer value of s, or null when s is not a valid integer. */
    private static Integer parseOrNull(String s) {
        try {
            return Integer.valueOf(s);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
17 | */ 18 | public class SessionData { 19 | 20 | public HashMap datasets; 21 | public Dictionary dict; 22 | public String username; 23 | public Boolean showdefs; 24 | public Boolean showroman; 25 | public Boolean allowcopy; 26 | public Boolean showgoogle; 27 | 28 | 29 | public TreeMap tas; 30 | public String dataname; 31 | public HashMap, Double> patterns; 32 | public RAMDirectory ramDirectory; 33 | 34 | public ArrayList suffixes; 35 | public Properties prop; 36 | 37 | public SentenceCache cache; 38 | //HashMap annosents; 39 | 40 | public HashMap> contexts; 41 | public List labels; 42 | public String indexpath; 43 | 44 | public HashMap groups; 45 | public String logfile; 46 | 47 | public SessionData(HttpSession hs){ 48 | 49 | tas = (TreeMap) hs.getAttribute("tas"); 50 | dict = (Dictionary) hs.getAttribute("dict"); 51 | patterns = (HashMap, Double>) hs.getAttribute("patterns"); 52 | 53 | username = (String) hs.getAttribute("username"); 54 | dataname = (String) hs.getAttribute("dataname"); 55 | // folderpath = (String) hs.getAttribute("folderpath"); 56 | indexpath = (String) hs.getAttribute("indexpath"); 57 | 58 | showdefs = (Boolean) hs.getAttribute("showdefs"); 59 | if(showdefs == null){ 60 | showdefs = false; 61 | } 62 | 63 | showroman = (Boolean) hs.getAttribute("showroman"); 64 | if(showroman == null){ 65 | showroman = false; 66 | } 67 | 68 | showgoogle = (Boolean) hs.getAttribute("showgoogle"); 69 | if(showgoogle == null){ 70 | showgoogle = false; 71 | } 72 | 73 | allowcopy = (Boolean) hs.getAttribute("allowcopy"); 74 | if(allowcopy == null){ 75 | allowcopy = false; 76 | } 77 | 78 | suffixes = (ArrayList) hs.getAttribute("suffixes"); 79 | 80 | prop = (Properties) hs.getAttribute("prop"); 81 | labels = (List) hs.getAttribute("labels"); 82 | ramDirectory = (RAMDirectory) hs.getAttribute("ramdirectory"); 83 | 84 | contexts = (HashMap>) hs.getAttribute("contexts"); 85 | 86 | cache = (SentenceCache) hs.getAttribute("cache"); 87 | //annosents = (HashMap>) 
hs.getAttribute("annosents"); 88 | 89 | groups = (HashMap) hs.getAttribute("groups"); 90 | 91 | logfile = (String) hs.getAttribute("logfile"); 92 | 93 | datasets = (HashMap) hs.getAttribute("datasets"); 94 | 95 | } 96 | 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/SessionInterceptor.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import org.springframework.web.servlet.HandlerInterceptor; 6 | import org.springframework.web.servlet.ModelAndView; 7 | import org.springframework.web.servlet.handler.HandlerInterceptorAdapter; 8 | 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | import javax.servlet.http.HttpSession; 12 | 13 | /** 14 | * Created by mayhew2 on 2/3/17. 15 | */ 16 | public class SessionInterceptor implements HandlerInterceptor { 17 | 18 | private static Logger logger = LoggerFactory.getLogger(SessionInterceptor.class); 19 | 20 | @Override 21 | public boolean preHandle(HttpServletRequest request, HttpServletResponse response, Object o) throws Exception { 22 | // Don't create a new session if it doesn't exist. 
23 | HttpSession session = request.getSession(false); 24 | 25 | logger.info("REQUESTURI: " + request.getRequestURI()); 26 | 27 | if(!request.getRequestURI().startsWith("/setname") && 28 | !request.getRequestURI().equals("/") && session != null && 29 | session.getAttribute("username") == null) { 30 | logger.info("Username is null, redirecting to home page."); 31 | response.sendRedirect("/"); 32 | return false; 33 | } 34 | 35 | return true; 36 | } 37 | 38 | @Override 39 | public void postHandle(HttpServletRequest httpServletRequest, HttpServletResponse httpServletResponse, Object o, ModelAndView modelAndView) throws Exception { 40 | //System.out.println("POSTHANDLE"); 41 | 42 | } 43 | 44 | @Override 45 | public void afterCompletion(HttpServletRequest httpServletRequest, HttpServletResponse httpServletResponse, Object o, Exception e) throws Exception { 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/Suggestion.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.IntPair; 4 | 5 | /** 6 | * 7 | * A suggestion object holds a span, a label, and a reason for this suggestion. 8 | * 9 | * Created by stephen on 2/27/17. 
/**
 * Plain value holder for a user: a numeric id and a name, both exposed
 * through getters and setters.
 *
 * Created by mayhew2 on 5/31/16.
 */
public class User {

    private String name;
    private long id;

    /** @return the id; 0 until setId has been called. */
    public long getId() {
        return this.id;
    }

    /** @return the name; null until setName has been called. */
    public String getName() {
        return this.name;
    }

    /** @param id the new id */
    public void setId(long id) {
        this.id = id;
    }

    /** @param name the new name */
    public void setName(String name) {
        this.name = name;
    }

}
org.springframework.security.core.userdetails.UserDetails; 13 | import org.springframework.security.core.userdetails.User; 14 | import org.springframework.security.core.userdetails.UserDetailsService; 15 | import org.springframework.security.provisioning.InMemoryUserDetailsManager; 16 | 17 | import java.io.FileNotFoundException; 18 | import java.util.ArrayList; 19 | 20 | @Configuration 21 | @EnableWebSecurity 22 | public class WebSecurityConfig extends WebSecurityConfigurerAdapter { 23 | @Override 24 | protected void configure(HttpSecurity http) throws Exception { 25 | http.authorizeRequests() 26 | .antMatchers("/css/**", "/js/**", "/img/**", "/resources/**").permitAll() 27 | .anyRequest().authenticated() 28 | .and() 29 | .formLogin() 30 | .loginPage("/login") 31 | .defaultSuccessUrl("/", true) 32 | .permitAll() 33 | .and() 34 | .logout() 35 | .permitAll(); 36 | http.csrf().disable(); 37 | } 38 | 39 | @Bean 40 | @Override 41 | public UserDetailsService userDetailsService() { 42 | 43 | InMemoryUserDetailsManager manager = new InMemoryUserDetailsManager(); 44 | 45 | try { 46 | ArrayList users = LineIO.read("config/users.txt"); 47 | 48 | for(String up : users){ 49 | String[] userpass = up.split("\\s+"); 50 | if(userpass.length == 2) { 51 | UserDetails user = User.withDefaultPasswordEncoder() 52 | .username(userpass[0]) 53 | .password(userpass[1]) 54 | .roles("USER") 55 | .build(); 56 | manager.createUser(user); 57 | } 58 | } 59 | 60 | } catch (FileNotFoundException e) { 61 | System.err.println("config/users.txt file not found. 
Using default user/password instead."); 62 | UserDetails user = User.withDefaultPasswordEncoder() 63 | .username("user") 64 | .password("password") 65 | .roles("USER") 66 | .build(); 67 | manager.createUser(user); 68 | } 69 | return manager; 70 | } 71 | } -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/classifier/CandParser.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.classifier; 2 | 3 | import edu.illinois.cs.cogcomp.lbjava.parse.Parser; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Created by mayhew2 on 6/20/17. 9 | */ 10 | public class CandParser implements Parser { 11 | 12 | private final List cands; 13 | private int i; 14 | 15 | 16 | public CandParser(List cands){ 17 | this.cands = cands; 18 | this.i = 0; 19 | } 20 | 21 | @Override 22 | public Object next() { 23 | if(i >= cands.size()) return null; 24 | return cands.get(this.i++); 25 | } 26 | 27 | @Override 28 | public void reset() { 29 | this.i = 0; 30 | } 31 | 32 | @Override 33 | public void close() { 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/classifier/Candidate.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.classifier; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.commons.math3.util.IterationEvent; 5 | 6 | import java.util.*; 7 | 8 | /** 9 | * Created by mayhew2 on 6/15/17. 
10 | */ 11 | public class Candidate { 12 | public static final String punctuation = "!@#$%^&*()_-+=~`:;<>,./?|\\\"\'‹‹››،[]{}"; 13 | public boolean isgood; 14 | public ArrayList tokens; 15 | public HashMap contexts; 16 | public String name; 17 | public double totalcontexts; 18 | 19 | public Candidate(String name, HashMap ctx) { 20 | tokens = new ArrayList<>(); 21 | tokens.addAll(Arrays.asList(name.split(" "))); 22 | 23 | this.name = name; 24 | 25 | contexts = ctx; 26 | totalcontexts = ctx.values().stream().mapToDouble(i -> i).sum(); 27 | } 28 | 29 | @Override 30 | public String toString() { 31 | return "Candidate{" + 32 | "isgood=" + isgood + 33 | ", tokens=" + StringUtils.join(tokens, " ") + 34 | '}'; 35 | } 36 | 37 | public boolean haspunc(){ 38 | for(String t : tokens){ 39 | if(punctuation.contains(t)){ 40 | return true; 41 | } 42 | } 43 | return false; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/controllers/Common.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.controllers; 2 | 3 | import edu.illinois.cs.cogcomp.core.io.LineIO; 4 | import io.github.mayhewsw.ConfigFile; 5 | 6 | import java.io.*; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Properties; 10 | 11 | public class Common { 12 | 13 | public static final String FOLDERTA = "ta"; 14 | public static final String FOLDERTAJSON = "tajson"; 15 | public static final String FOLDERCOLUMN = "column"; 16 | public static final String FOLDERCONLL = "conll"; 17 | 18 | 19 | public static HashMap loadConfig() { 20 | // hardcoded to look in config path 21 | File configfolder = new File("config"); 22 | 23 | File[] configfiles = configfolder.listFiles(); 24 | 25 | HashMap datasets = new HashMap<>(); 26 | 27 | for(File f : configfiles){ 28 | if(f.getName().endsWith("~")) continue; 29 | 30 | if(f.getName().startsWith("doc-") || 
f.getName().startsWith("sent-")) { 31 | 32 | System.out.println(f); 33 | ConfigFile c = new ConfigFile(); 34 | 35 | try { 36 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF8")); 37 | 38 | // load a properties file 39 | c.loadProperties(in); 40 | 41 | datasets.put(f.getName(), c); 42 | 43 | } catch (IOException e) { 44 | 45 | } 46 | } 47 | } 48 | return datasets; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/controllers/DictionaryController.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.controllers; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.Pair; 4 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 5 | import edu.illinois.cs.cogcomp.core.io.LineIO; 6 | import io.github.mayhewsw.Dictionary; 7 | import io.github.mayhewsw.SessionData; 8 | import io.github.mayhewsw.utils.HtmlGenerator; 9 | import org.apache.commons.lang3.StringUtils; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.http.HttpStatus; 13 | import org.springframework.stereotype.Controller; 14 | import org.springframework.ui.Model; 15 | import org.springframework.web.bind.annotation.*; 16 | 17 | import javax.servlet.http.HttpSession; 18 | import java.io.File; 19 | import java.io.IOException; 20 | import java.util.*; 21 | 22 | /** 23 | * Created by mayhew2 on 2/3/17. 
24 | */ 25 | 26 | @Controller 27 | @RequestMapping("/dict") 28 | public class DictionaryController { 29 | 30 | private static Logger logger = LoggerFactory.getLogger(DictionaryController.class); 31 | 32 | @RequestMapping(value="", method=RequestMethod.GET) 33 | public String showdict(Model model, HttpSession hs) { 34 | SessionData sd = new SessionData(hs); 35 | // this is called if we go to /dict 36 | 37 | if(sd.dict == null){ 38 | Dictionary dict = new Dictionary(sd.dataname, sd.username); 39 | hs.setAttribute("dict", dict); 40 | sd.dict = dict; 41 | } 42 | 43 | List> entries = new ArrayList<>(); 44 | int i = 0; 45 | for (String key : sd.dict.keySet()) { 46 | String valstring = StringUtils.join(sd.dict.get(key), ", "); 47 | Pair p = new Pair<>(key, valstring); 48 | entries.add(p); 49 | i++; 50 | 51 | // hard break at i == 1000 52 | if (i > 1000) { 53 | break; 54 | } 55 | } 56 | model.addAttribute("entries", entries); 57 | 58 | return "dict"; 59 | } 60 | 61 | @RequestMapping(value="lookup", method=RequestMethod.GET) 62 | @ResponseBody 63 | public String getdict(@RequestParam(value="word") String word, HttpSession hs, Model model) { 64 | 65 | // pass a dict list to this? 
66 | io.github.mayhewsw.Dictionary dict = (Dictionary) hs.getAttribute("dict"); 67 | 68 | List defs = dict.get(word); 69 | 70 | String ret = "No definition found"; 71 | if(defs != null){ 72 | ret = defs.toString(); 73 | } 74 | 75 | return ret; 76 | } 77 | 78 | 79 | @RequestMapping(value="add", method=RequestMethod.GET) 80 | @ResponseStatus(value = HttpStatus.OK) 81 | @ResponseBody 82 | public String adddef(@RequestParam(value="key") String key, @RequestParam(value="val") String val, HttpSession hs) throws IOException { 83 | 84 | SessionData sd = new SessionData(hs); 85 | 86 | logger.info("Adddef: add " + key + " -> " + val); 87 | 88 | if(key.length() > 0 && val.length() > 0) { 89 | sd.dict.add(key, val); 90 | sd.dict.save(sd.dataname, sd.username); 91 | } 92 | return "Success"; 93 | } 94 | 95 | 96 | 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/controllers/LoginController.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.controllers; 2 | 3 | import edu.illinois.cs.cogcomp.core.io.LineIO; 4 | import io.github.mayhewsw.ConfigFile; 5 | import io.github.mayhewsw.SessionData; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | import org.springframework.security.core.Authentication; 9 | import org.springframework.security.core.context.SecurityContextHolder; 10 | import org.springframework.stereotype.Controller; 11 | import org.springframework.web.bind.annotation.*; 12 | 13 | import javax.servlet.http.HttpSession; 14 | import java.io.IOException; 15 | import java.util.*; 16 | 17 | import org.springframework.ui.Model; 18 | 19 | /** 20 | * Created by stephen on 8/2/17. 
21 | */ 22 | @SuppressWarnings("ALL") 23 | @Controller 24 | public class LoginController { 25 | 26 | private static Logger logger = LoggerFactory.getLogger(LoginController.class); 27 | 28 | @RequestMapping(value = "/", method = RequestMethod.GET) 29 | public String adduser(Model model, HttpSession hs) { 30 | 31 | Authentication auth = SecurityContextHolder.getContext().getAuthentication(); 32 | String name = auth.getName(); //get logged in username 33 | 34 | logger.info("Username is " + name); 35 | hs.setAttribute("username", name); 36 | 37 | // This will also add datasets... 38 | HashMap datasets = Common.loadConfig(); 39 | hs.setAttribute("datasets", datasets); 40 | 41 | // in case you want to add a new one! 42 | model.addAttribute("config", new ConfigFile()); 43 | 44 | return "index"; 45 | } 46 | 47 | @RequestMapping(value = "/login", method = RequestMethod.GET) 48 | public String login(HttpSession hs) { 49 | 50 | return "login"; 51 | } 52 | 53 | @PostMapping(value = "/config") 54 | public String config(@ModelAttribute ConfigFile c, HttpSession hs) throws IOException { 55 | 56 | System.out.println("Writing to: config/" + c.getFname()); 57 | 58 | 59 | LineIO.write("config/" + c.getFname(), Collections.singletonList(c.toString())); 60 | 61 | return "redirect:/"; 62 | 63 | } 64 | 65 | @PostMapping(value = "/loadconfig") 66 | @ResponseBody 67 | public String config(@RequestParam(value="config") String configname, Model model, HttpSession hs) throws IOException { 68 | SessionData sd = new SessionData(hs); 69 | ConfigFile c = sd.datasets.get(configname); 70 | model.addAttribute("config", c); 71 | 72 | return "success"; 73 | } 74 | 75 | 76 | @RequestMapping(value = "/logout") 77 | public void logout(HttpSession hs) { 78 | logger.info("Logging out..."); 79 | 80 | // I think this is preferable. 
81 | hs.invalidate(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/controllers/TextStatisticsController.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.controllers; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.Pair; 4 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 5 | import io.github.mayhewsw.SessionData; 6 | import io.github.mayhewsw.utils.Utils; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.stereotype.Controller; 10 | import org.springframework.ui.Model; 11 | import org.springframework.web.bind.annotation.*; 12 | 13 | import javax.servlet.http.HttpSession; 14 | import java.util.*; 15 | import java.util.stream.Collectors; 16 | 17 | /** 18 | * Created by mayhew2 on 2/3/17. 19 | */ 20 | 21 | @Controller 22 | @RequestMapping("/stats") 23 | public class TextStatisticsController { 24 | 25 | private static Logger logger = LoggerFactory.getLogger(TextStatisticsController.class); 26 | private static HashMap counts = new HashMap<>(); 27 | private static int numdocs = 0; 28 | private static HashMap term2numdocs = new HashMap<>(); 29 | 30 | @RequestMapping(value="gettopstats", method=RequestMethod.POST) 31 | @ResponseBody 32 | public String gettopstats(@RequestParam(value="docid") String docid, HttpSession hs, Model model) { 33 | 34 | SessionData sd = new SessionData(hs); 35 | 36 | TextAnnotation ta = sd.tas.get(docid); 37 | 38 | // need to keep track of the number of times this word appears in the document. 
39 | HashMap doccounts = new HashMap<>(); 40 | String[] romantext = Utils.getRomanTaToks(ta); 41 | HashMap orig2roman = new HashMap<>(); 42 | 43 | for(int i = 0; i < ta.size(); i++){ 44 | String word = ta.getToken(i); 45 | int c = doccounts.getOrDefault(word, 0); 46 | doccounts.put(word, c+1); 47 | orig2roman.put(word, romantext[i]); 48 | } 49 | 50 | String ret = ""; 51 | ret += ""; 52 | ret += ""; 53 | ret += ""; 54 | ret += ""; 55 | ret += ""; 56 | 57 | HashSet> toptfidf = new HashSet(); 58 | 59 | for (String word : ta.getTokens()){ 60 | toptfidf.add(new Pair<>(word, tfidf(word, doccounts.get(word)))); 61 | } 62 | 63 | int k = 10; 64 | List> toptfidflist = toptfidf.stream() 65 | .sorted((a,b) -> b.getSecond().compareTo(a.getSecond())) 66 | .limit(k) 67 | .collect(Collectors.toList()); 68 | 69 | for(Pair p : toptfidflist){ 70 | String word = p.getFirst(); 71 | Double tfidf = p.getSecond(); 72 | 73 | String def = word; 74 | if (sd.showroman && orig2roman.containsKey(word)){ 75 | def = orig2roman.get(word); 76 | } else if (sd.showdefs && sd.dict != null && sd.dict.containsKey(word)) { 77 | def = "" + sd.dict.get(word).get(0) + ""; 78 | } 79 | 80 | String row = ""; 81 | row += ""; 82 | row += ""; 83 | row += String.format("",term2numdocs.getOrDefault(word, 0)/(float)numdocs); 84 | row += String.format("", tfidf); 85 | row += ""; 86 | ret += row; 87 | } 88 | 89 | 90 | ret += "
WordCnt%docsTfidf
"+def+""+counts.getOrDefault(word, 0)+"%.2f%.2f
"; 91 | 92 | return ret; 93 | } 94 | 95 | 96 | /** 97 | * This is meant to be a method that gets stats for individual words (e.g. if they are out of the top 10 words that are displayed 98 | * by default. But I think it is not that important and adds complexity. 99 | */ 100 | // @RequestMapping(value="getstats", method=RequestMethod.POST) 101 | // @ResponseBody 102 | // public String getstats(@RequestParam(value="text") String text, @RequestParam(value="alltext[]") String[] alltext, HttpSession hs, Model model) { 103 | // 104 | // SessionData sd = new SessionData(hs); 105 | // 106 | // // need to keep track of the number of times this word appears in the document. 107 | // HashMap doccounts = new HashMap<>(); 108 | // for(String word : alltext){ 109 | // int c = doccounts.getOrDefault(word, 0); 110 | // doccounts.put(word, c+1); 111 | // } 112 | // 113 | // String ret = ""; 114 | // ret += ""; 115 | // ret += ""; 116 | // ret += ""; 117 | // ret += ""; 118 | // ret += ""; 119 | // 120 | // String[] words = text.split(" "); 121 | // for (String word : words){ 122 | // 123 | // String def = word; 124 | // if (sd.showdefs && sd.dict != null && sd.dict.containsKey(word)) { 125 | // def = "" + sd.dict.get(word).get(0) + ""; 126 | // } 127 | // 128 | // String row = ""; 129 | // row += ""; 130 | // row += ""; 131 | // row += String.format("",term2numdocs.getOrDefault(word, 0)/(float)numdocs); 132 | // row += String.format("", tfidf(word, doccounts.get(word))); 133 | // row += ""; 134 | // 135 | // ret += row; 136 | // } 137 | // 138 | // ret += "
WordCnt%docsTfidf
"+def+""+counts.getOrDefault(word, 0)+"%.2f%.2f
"; 139 | // 140 | // return ret; 141 | // } 142 | 143 | public static void resetstats() { 144 | counts = new HashMap<>(); 145 | numdocs = 0; 146 | term2numdocs = new HashMap<>(); 147 | } 148 | 149 | /** 150 | * Assume that the text is tokenized with spaces. Assume this is 151 | * called once per document! 152 | * @param words 153 | */ 154 | public static void updateCounts(String[] words){ 155 | HashSet uniqwords = new HashSet<>(); 156 | for (String word : words){ 157 | int c = counts.getOrDefault(word, 0); 158 | counts.put(word, c +1); 159 | uniqwords.add(word); 160 | } 161 | 162 | for(String word : uniqwords) { 163 | int doccounts = term2numdocs.getOrDefault(word, 0); 164 | term2numdocs.put(word, doccounts + 1); 165 | } 166 | 167 | numdocs++; 168 | } 169 | 170 | public static double tfidf(String w, int doccounts){ 171 | if(term2numdocs.containsKey(w)) { 172 | return doccounts * Math.log(numdocs / term2numdocs.get(w)); 173 | }else{ 174 | return 0; 175 | } 176 | } 177 | 178 | // This came from loadData in document controller. Assumes a word frequency file that 179 | // has word freq on each line. Worth doing?? 
180 | // String wordfreqsfile = prop.getProperty("wordfreqsfile"); 181 | // HashMap wordfreqs = new HashMap<>(); 182 | // if(wordfreqsfile != null){ 183 | // ArrayList lines = LineIO.read(wordfreqsfile); 184 | // for(String line : lines){ 185 | // String[] sline = line.split(" "); 186 | // String word = sline[0]; 187 | // int freq = Integer.parseInt(sline[1]); 188 | // wordfreqs.put(word, (float)freq); 189 | // } 190 | // }else{ 191 | // } 192 | // hs.setAttribute("wordfreqs", wordfreqs); 193 | 194 | 195 | 196 | 197 | 198 | 199 | } 200 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/utils/FinalSaver.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.utils; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; 4 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; 5 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 6 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; 7 | import edu.illinois.cs.cogcomp.nlp.corpusreaders.CoNLLNerReader; 8 | import org.apache.commons.io.FileUtils; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | import java.nio.file.Paths; 13 | import java.util.*; 14 | 15 | import static edu.illinois.cs.cogcomp.nlp.corpusreaders.CoNLLNerReader.conllline; 16 | import static io.github.mayhewsw.controllers.SentenceController.getSentId; 17 | 18 | /** 19 | * This class is necessary because of the fact that most data we work with is non-Latin script, and we 20 | * work only with sentences. This takes a folder of Romanized annotation documents, an analogous folder 21 | * with original script documents, and a list of sentences that must be printed. 22 | * 23 | * This writes sentences out to file with the original script. This is a matter of matching up annotations 24 | * with sentences. Nothing too fancy. 
25 | * 26 | * Created by stephen on 5/29/17. 27 | */ 28 | public class FinalSaver { 29 | 30 | public static void save(String origfolder, String romanfolder, String outfolder) throws IOException { 31 | 32 | 33 | if((new File(outfolder)).exists()) { 34 | CoNLLNerReader cnl = new CoNLLNerReader(romanfolder); 35 | 36 | while (cnl.hasNext()) { 37 | TextAnnotation ta = cnl.next(); 38 | View ner = ta.getView(ViewNames.NER_CONLL); 39 | 40 | TextAnnotation taorig = new CoNLLNerReader(origfolder + "/" + ta.getId()).next(); 41 | // this should overwrite the (empty) NER_CONLL view. 42 | taorig.addView(ViewNames.NER_CONLL, ner); 43 | View nerorig = taorig.getView(ViewNames.NER_CONLL); 44 | View sents = taorig.getView(ViewNames.SENTENCE); 45 | 46 | for (Constituent sent : sents.getConstituents()) { 47 | String sentid = getSentId(sent); 48 | 49 | if(nerorig.getConstituentsCovering(sent).size() == 0) continue; 50 | 51 | // how to rewrite this sentence as a textannotation. 52 | SentToConll(sent, outfolder); 53 | } 54 | } 55 | }else{ 56 | System.out.println(outfolder + " does not exist. Not doing anything."); 57 | } 58 | 59 | } 60 | 61 | 62 | /** 63 | * Write this sentence out to conll format. Each individual sentence becomes a file. 
64 | * @param sent 65 | * @param outpath 66 | * @throws IOException 67 | */ 68 | public static void SentToConll(Constituent sent, String outpath) throws IOException { 69 | 70 | TextAnnotation ta = sent.getTextAnnotation(); 71 | ArrayList talines = new ArrayList(); 72 | View nerview = ta.getView("NER_CONLL"); 73 | 74 | for(int i = sent.getStartSpan(); i < sent.getEndSpan(); ++i) { 75 | String label = "O"; 76 | List constituents = nerview.getConstituentsCoveringToken(i); 77 | if(constituents.size() > 0) { 78 | Constituent sents = (Constituent)constituents.get(0); 79 | if(sents.getStartSpan() == i) { 80 | label = "B-" + sents.getLabel(); 81 | } else { 82 | label = "I-" + sents.getLabel(); 83 | } 84 | } 85 | 86 | talines.add(conllline(label, i, ta.getToken(i))); 87 | } 88 | 89 | FileUtils.writeLines(Paths.get(outpath, new String[]{getSentId(sent)}).toFile(), talines); 90 | } 91 | 92 | public static void main(String[] args) throws IOException { 93 | String username = "steve"; 94 | 95 | String dir = "/shared/corpora/corporaWeb/lorelei/evaluation-20170804/LDC2017E29_LORELEI_IL6_Incident_Language_Pack_for_Year_2_Eval_V1.1/"; 96 | 97 | String origfolder = dir + "conll-set0-sentanno-" + username; 98 | String romanfolder = dir + "conll-set0-sentanno-" + username; 99 | String outpath = dir + "final-" + username + "/"; 100 | 101 | 102 | //String dir = "/shared/corpora/corporaWeb/lorelei/data/LDC2016E86_LORELEI_Amharic_Representative_Language_Pack_Monolingual_Text_V1.1/data/monolingual_text/zipped/"; 103 | //String origfolder = dir + "conll/"; 104 | //String romanfolder = dir + "conll-pyrom-sentanno-"+username +"/"; 105 | //String outpath = dir + "final-"+username +"/"; 106 | 107 | save(origfolder, romanfolder, outpath); 108 | 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/utils/IO.java: -------------------------------------------------------------------------------- 1 | package 
io.github.mayhewsw.utils; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 4 | import edu.illinois.cs.cogcomp.core.io.IOUtils; 5 | import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper; 6 | import edu.illinois.cs.cogcomp.nlp.corpusreaders.CoNLLNerReader; 7 | import io.github.mayhewsw.controllers.Common; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.io.File; 12 | 13 | import java.io.FileNotFoundException; 14 | import java.io.IOException; 15 | import java.util.Collections; 16 | import java.util.Map; 17 | 18 | public class IO { 19 | private static Logger logger = LoggerFactory.getLogger(IO.class); 20 | 21 | /** 22 | * This loads a group of textannotations from 23 | * @param foldertype 24 | * @param folder 25 | * @param ret 26 | * @throws Exception 27 | */ 28 | public static void read(String foldertype, String folder, Map ret) throws Exception { 29 | 30 | if (foldertype.equals(Common.FOLDERTA)) { 31 | File outf = new File(folder); 32 | 33 | if(!outf.exists()){ 34 | throw new FileNotFoundException("Folder " + folder + " does not exist."); 35 | } 36 | 37 | File[] files = outf.listFiles(); 38 | int limit = Math.min(files.length, 500); 39 | 40 | for (int i = 0; i < limit; i++) { 41 | File file = files[i]; 42 | TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(file.getAbsolutePath()); 43 | // NOTE: ideally file.getName() == ta.getID(), but this is not always the case. 
44 | //ret.put(file.getName(), ta); 45 | ret.put(ta.getId(), ta); 46 | } 47 | }else if(foldertype.equals(Common.FOLDERTAJSON)){ 48 | 49 | File outf = new File(folder); 50 | 51 | if(!outf.exists()){ 52 | throw new FileNotFoundException("Folder " + folder + " does not exist."); 53 | } 54 | 55 | File[] files = outf.listFiles(); 56 | 57 | int limit = Math.min(files.length, 500); 58 | 59 | for (int i = 0; i < limit; i++) { 60 | File file = files[i]; 61 | TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(file.getAbsolutePath(), true); 62 | ret.put(file.getName(), ta); 63 | } 64 | } else if (foldertype.equals(Common.FOLDERCONLL)) { 65 | CoNLLNerReader cnl = new CoNLLNerReader(folder); 66 | while (cnl.hasNext()) { 67 | TextAnnotation ta = cnl.next(); 68 | logger.info("Loading: " + ta.getId()); 69 | 70 | ret.put(ta.getId(), ta); 71 | } 72 | }else if (foldertype.equals(Common.FOLDERCOLUMN)) { 73 | ColumnReader cnl = new ColumnReader(folder); 74 | while (cnl.hasNext()) { 75 | TextAnnotation ta = cnl.next(); 76 | logger.info("Loading: " + ta.getId()); 77 | 78 | ret.put(ta.getId(), ta); 79 | } 80 | } 81 | 82 | } 83 | 84 | /** 85 | * This saves an individual TextAnnotation to the desired output folder. 
86 | * @param foldertype 87 | * @param path 88 | * @param ta 89 | * @throws IOException 90 | */ 91 | public static void save(String foldertype, String path, TextAnnotation ta) throws IOException { 92 | if(!IOUtils.exists(path)) { 93 | IOUtils.mkdir(path); 94 | } 95 | 96 | if(foldertype.equals(Common.FOLDERTA)) { 97 | SerializationHelper.serializeTextAnnotationToFile(ta, path + "/" + ta.getId(), true); 98 | }else if(foldertype.equals(Common.FOLDERTAJSON)) { 99 | SerializationHelper.serializeTextAnnotationToFile(ta, path + "/" + ta.getId(), true, true); 100 | }else if(foldertype.equals(Common.FOLDERCONLL)) { 101 | CoNLLNerReader.TaToConll(Collections.singletonList(ta), path); 102 | }else if(foldertype.equals(Common.FOLDERCOLUMN)) { 103 | ColumnReader.TaToColumn(Collections.singletonList(ta), path); 104 | } 105 | 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/utils/Propagator.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.utils; 2 | 3 | import edu.illinois.cs.cogcomp.core.datastructures.IntPair; 4 | import edu.illinois.cs.cogcomp.core.datastructures.Pair; 5 | import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; 6 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; 7 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 8 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; 9 | import edu.illinois.cs.cogcomp.core.io.LineIO; 10 | import edu.illinois.cs.cogcomp.nlp.corpusreaders.CoNLLNerReader; 11 | import org.codehaus.groovy.runtime.callsite.ConstructorSite; 12 | import org.mapdb.Atomic; 13 | 14 | import java.io.IOException; 15 | import java.lang.reflect.Array; 16 | import java.util.ArrayList; 17 | import java.util.Collections; 18 | import java.util.HashMap; 19 | import java.util.List; 20 | import 
java.util.concurrent.atomic.AtomicInteger; 21 | 22 | /** 23 | * Created by mayhew2 on 3/13/17. 24 | */ 25 | public class Propagator { 26 | 27 | /** 28 | * This will save the rules out to file. 29 | * @param fname 30 | * @param rules 31 | * @throws IOException 32 | */ 33 | public static void saveRules(String fname, HashMap> rules) throws IOException { 34 | ArrayList rulelines = new ArrayList<>(); 35 | 36 | ArrayList sortedkeys = new ArrayList<>(rules.keySet()); 37 | Collections.sort(sortedkeys); 38 | 39 | for(String surface : sortedkeys){ 40 | HashMap labelcounts = rules.get(surface); 41 | String outs = surface + "\t"; 42 | for(String label : labelcounts.keySet()){ 43 | outs += label + ":" + labelcounts.get(label) + "\t"; 44 | } 45 | rulelines.add(outs.trim()); 46 | } 47 | 48 | LineIO.write(fname, rulelines); 49 | } 50 | 51 | /** 52 | * This induces a set of rules over a corpus. 53 | * @param infolder 54 | * @return 55 | */ 56 | public static HashMap> getRules(String infolder){ 57 | CoNLLNerReader cnr = new CoNLLNerReader(infolder); 58 | 59 | // maps from: surface text to {label:count, label:count} 60 | HashMap> rules = new HashMap<>(); 61 | 62 | while(cnr.hasNext()){ 63 | TextAnnotation ta = cnr.next(); 64 | 65 | View ner = ta.getView(ViewNames.NER_CONLL); 66 | 67 | 68 | for(Constituent c : ner.getConstituents()){ 69 | String surface = c.getSurfaceForm(); 70 | String label = c.getLabel(); 71 | 72 | HashMap counts = rules.getOrDefault(surface, new HashMap<>()); 73 | 74 | // increment label count. 75 | int labelcount = counts.getOrDefault(label, 0); 76 | counts.put(label, labelcount + 1); 77 | 78 | rules.put(surface, counts); 79 | } 80 | 81 | } 82 | System.out.println(rules); 83 | return rules; 84 | } 85 | 86 | /** 87 | * This takes a set of rules and applies them to a corpus. 
88 | * @param outfolder 89 | * @param rules 90 | * @throws IOException 91 | */ 92 | public static void applyRules(String infolder, String outfolder, HashMap> rules) throws IOException { 93 | List outtas = new ArrayList<>(); 94 | 95 | int added = 0; 96 | 97 | // we want to count the consistency of this corpus 98 | // so this maps rule to (already applied, needs application). 99 | HashMap> rulecounts = new HashMap<>(); 100 | for(String surface : rules.keySet()){ 101 | rulecounts.put(surface, new Pair<>(new AtomicInteger(0), new AtomicInteger(0))); 102 | } 103 | 104 | CoNLLNerReader cnr = new CoNLLNerReader(infolder); 105 | while(cnr.hasNext()){ 106 | TextAnnotation ta = cnr.next(); 107 | View ner = ta.getView(ViewNames.NER_CONLL); 108 | 109 | int i = 1; 110 | for(String surface : rules.keySet()){ 111 | 112 | 113 | 114 | List spans = ta.getSpansMatching(surface); 115 | 116 | String maxlabel = Collections.max(rules.get(surface).entrySet(), (entry1, entry2) -> entry1.getValue() - entry2.getValue()).getKey(); 117 | 118 | // skip rules that show up less than 3 times. 
119 | // if(rules.get(surface).get(maxlabel) < 3) continue; 120 | 121 | for(IntPair span : spans){ 122 | 123 | List others = ner.getConstituentsCoveringSpan(span.getFirst(), span.getSecond()); 124 | if(others.size() > 1){ 125 | System.err.println("Should only be one constituent on this string..."); 126 | for(Constituent other : others){ 127 | ner.removeConstituent(other); 128 | others = new ArrayList<>(); 129 | } 130 | } 131 | 132 | boolean add = false; 133 | 134 | // two cases 135 | if(others.size() == 0){ 136 | // apply label, increment unlabeld counter 137 | rulecounts.get(surface).getSecond().incrementAndGet(); 138 | add = true; 139 | }else{ 140 | Constituent other = others.get(0); 141 | if(other.getLabel().equals(maxlabel) && other.getSpan().equals(span)){ 142 | // if exact match to rule, increment labeled counter 143 | rulecounts.get(surface).getFirst().incrementAndGet(); 144 | }else { 145 | // remove constituent, apply label, increment unlabeled counter. 146 | // if other is wholly contained in span, then remove it and add current. 147 | // if current is wholly contained in other, then do nothing. 148 | int a = span.getFirst(); 149 | int b = span.getSecond(); 150 | int c = other.getStartSpan(); 151 | int d = other.getEndSpan(); 152 | 153 | if (a <= c && b >= d) { 154 | // remove other, add current. 
155 | ner.removeConstituent(other); 156 | add = true; 157 | rulecounts.get(surface).getSecond().incrementAndGet(); 158 | } 159 | } 160 | 161 | } 162 | 163 | if(add) { 164 | Constituent newc = new Constituent(maxlabel, ViewNames.NER_CONLL, ta, span.getFirst(), span.getSecond()); 165 | ner.addConstituent(newc); 166 | added++; 167 | System.out.println("Adding: " + surface + " -> " + maxlabel); 168 | } 169 | } 170 | } 171 | outtas.add(ta); 172 | } 173 | System.out.println(rulecounts); 174 | 175 | int totalspans = 0; 176 | int unlabeledspans = 0; 177 | for(String surface : rulecounts.keySet()){ 178 | Pair p = rulecounts.get(surface); 179 | totalspans += p.getFirst().get() + p.getSecond().get(); 180 | unlabeledspans += p.getSecond().get(); 181 | } 182 | 183 | System.out.println("There are " + rules.keySet().size() + " rules."); 184 | System.out.println("out of " + totalspans + " applicable spans, " + unlabeledspans + " were unlabeled."); 185 | 186 | 187 | System.out.println("Added this many: " + added); 188 | System.out.println("Writing to: " + outfolder); 189 | CoNLLNerReader.TaToConll(outtas, outfolder); 190 | } 191 | 192 | public static void main(String[] args) throws IOException { 193 | 194 | // rules are learned from this folder 195 | String annotator = "bridgel2"; 196 | String lang = "ug"; 197 | 198 | String infolder = "/shared/corpora/ner/human/"+lang+"/conll-anno-" + annotator; 199 | 200 | HashMap> rules = getRules(infolder); 201 | saveRules("/shared/corpora/ner/human/"+lang+"/rules-" + annotator, rules); 202 | 203 | // rules are applied to this folder. 
204 | //String outfolder = "/shared/corpora/ner/human/"+lang+"/conll-anno-"+annotator+"-prop"; 205 | String testfolder = "/shared/corpora/ner/lorelei/ug/"; 206 | applyRules(testfolder + "All-stem-clear", testfolder + "All-stem-"+annotator+"-rules", rules); 207 | } 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/utils/Sandbox.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.utils; 2 | 3 | /** 4 | * Created by mayhew2 on 5/18/17. 5 | */ 6 | public class Sandbox { 7 | 8 | 9 | public static void main(String[] args) throws Exception { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/io/github/mayhewsw/utils/Utils.java: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.utils; 2 | 3 | import cz.jirutka.unidecode.Unidecode; 4 | import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; 5 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; 6 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; 7 | import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; 8 | 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | /** 15 | * Created by mayhew2 on 6/7/17. 16 | */ 17 | public class Utils { 18 | 19 | public static HashMap labelcolors; 20 | static { 21 | labelcolors = new HashMap<>(); 22 | // put some common label colors here. 
23 | labelcolors.put("PER", "yellow"); 24 | labelcolors.put("LOC", "greenyellow"); 25 | labelcolors.put("GPE", "coral"); 26 | labelcolors.put("MISC", "coral"); 27 | labelcolors.put("ORG", "lightblue"); 28 | labelcolors.put("G", "grey"); 29 | } 30 | 31 | /** 32 | * Given a label, this will return a standard color, or a random color. 33 | * @param label 34 | * @return 35 | */ 36 | public static String getColorOrRandom(String label){ 37 | String color; 38 | if(Utils.labelcolors.containsKey(label)){ 39 | color = Utils.labelcolors.get(label); 40 | }else{ 41 | Random random = new Random(); 42 | int nextInt = random.nextInt(256*256*256); 43 | color = String.format("#%06x", nextInt); 44 | } 45 | return color; 46 | } 47 | 48 | /** 49 | * Given a TextAnnotation, this will return the tokens in a cloned String[] array. If the ROMANIZATION 50 | * view is present, the tokens will come from there. Otherwise, this uses unidecode to get a base romanization. 51 | * 52 | * We recommend using the excellent Uroman library: https://www.isi.edu/~ulf/uroman.html 53 | * 54 | * @param ta TextAnnotation 55 | * @return an array of words, romanized if available. 
56 | */ 57 | public static String[] getRomanTaToks(TextAnnotation ta){ 58 | String[] text; 59 | if(ta.hasView("ROMANIZATION")){ 60 | View translit = ta.getView("ROMANIZATION"); 61 | StringBuilder sb = new StringBuilder(); 62 | for(Constituent c : translit.getConstituents()){ 63 | String romantext = c.getLabel().replace(" ", "_"); 64 | if (romantext.length() == 0){ 65 | romantext = "_"; 66 | } 67 | sb.append(romantext +" "); 68 | } 69 | text = sb.toString().trim().split(" "); 70 | }else { 71 | 72 | Unidecode unidecode = Unidecode.toAscii(); 73 | 74 | text = ta.getTokens().clone(); 75 | for(int t = 0; t < text.length; t++){ 76 | text[t] = unidecode.decode(text[t]); 77 | } 78 | } 79 | 80 | return text; 81 | } 82 | 83 | public static String[] getGoogleTaToks(TextAnnotation ta){ 84 | String[] text; 85 | if(ta.hasView("GOOGLE")){ 86 | View google = ta.getView("GOOGLE"); 87 | StringBuilder sb = new StringBuilder(); 88 | 89 | int currIndex = 0; 90 | 91 | for(Constituent c : google.getConstituents()){ 92 | String googletext = c.getLabel().replace(" ", "_"); 93 | if (googletext.length() == 0){ 94 | googletext = "_"; 95 | } 96 | 97 | int start = c.getStartSpan(); 98 | int end = c.getEndSpan(); 99 | 100 | String[] tokensBefore = ta.getTokensInSpan(currIndex, start); 101 | for (int i = 0; i < tokensBefore.length; i++) { 102 | sb.append(tokensBefore[i] + " "); 103 | } 104 | 105 | currIndex = end; 106 | 107 | sb.append(googletext + " "); 108 | } 109 | 110 | int lastTAIndex = ta.getTokens().length; 111 | 112 | if (currIndex != lastTAIndex) { 113 | String[] tokensBefore = ta.getTokensInSpan(currIndex, lastTAIndex); 114 | for (int i = 0; i < tokensBefore.length; i++) { 115 | sb.append(tokensBefore[i] + " "); 116 | } 117 | } 118 | 119 | text = sb.toString().trim().split(" "); 120 | } else { 121 | System.out.println("GOOGLE_RELEVANT view not found"); 122 | text = ta.getTokens().clone(); 123 | } 124 | return text; 125 | } 126 | 127 | 128 | /** 129 | * This removes all stems from a 
word, even if they are stacked. 130 | * @param word 131 | * @param suffixes 132 | * @return 133 | */ 134 | public static String stem(String word, List suffixes){ 135 | boolean stemmed = false; 136 | while(!stemmed) { 137 | stemmed = true; 138 | for (String suff : suffixes) { 139 | if (word.endsWith(suff)) { 140 | word = word.substring(0, word.length() - suff.length()); 141 | stemmed = false; 142 | } 143 | } 144 | } 145 | return word; 146 | } 147 | 148 | public static void main(String[] args) { 149 | String w = "jungoliqlarning"; 150 | List suf = new ArrayList<>(); 151 | suf.add("liq"); 152 | suf.add("lar"); 153 | suf.add("ning"); 154 | System.out.println(stem(w, suf)); 155 | } 156 | 157 | } 158 | -------------------------------------------------------------------------------- /src/main/lbjava/CandClassifier.lbj: -------------------------------------------------------------------------------- 1 | package io.github.mayhewsw.classifier.lbjava; 2 | 3 | import io.github.mayhewsw.classifier.Candidate; 4 | import java.util.*; 5 | 6 | 7 | discrete% Words(Candidate c) <- 8 | { 9 | // sense "first" : c.tokens.get(0); 10 | // sense "last" : c.tokens.get(c.tokens.size()-1); 11 | 12 | //for(int i = 0; i < c.tokens.size(); i++){ 13 | // sense c.tokens.get(i); 14 | //} 15 | } 16 | 17 | discrete HasPunc(Candidate c) <- 18 | { 19 | return c.haspunc(); 20 | } 21 | 22 | discrete% Contexts(Candidate c) <- 23 | { 24 | Iterator iter = c.contexts.keySet().iterator(); 25 | while(iter.hasNext()){ 26 | String ctx = (String) iter.next(); 27 | double frequency = c.contexts.get(ctx); 28 | sense ctx ; 29 | } 30 | } 31 | 32 | discrete NumTokens(Candidate c) <- 33 | { 34 | return c.tokens.size(); 35 | } 36 | 37 | discrete GoodCandidate(Candidate c) <- { return c.isgood; } 38 | 39 | 40 | discrete CandClassifier(Candidate c) <- 41 | learn GoodCandidate 42 | using Contexts,NumTokens,HasPunc 43 | 44 | with new SparseNetworkLearner(new SparseAveragedPerceptron(.1, 0, 20)) 45 | 46 | end 47 | 48 | 
-------------------------------------------------------------------------------- /src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | #logging.config=src/main/resources/log4j.properties 2 | #logging.file=./log-spring.out 3 | logging.level.org.thymeleaf.TemplateEngine=INFO 4 | logging.level.org.springframework=INFO 5 | logging.level.io.github.mayhewsw=DEBUG 6 | server.port=8009 7 | # try 3283 8 | # Per-logger levels; in this file Spring Boot only reads the logging.level.<logger> form. 9 | logging.level.edu.illinois.cs.cogcomp.ner.InferenceMethods = WARN 10 | logging.level.edu.illinois.cs.cogcomp.ner.ExpressiveFeatures = DEBUG 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # suppress inspection "UnusedProperty" for whole file 2 | # Root logger option 3 | log4j.rootLogger=DEBUG, stdout 4 | 5 | # Direct log messages to stdout 6 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 7 | log4j.appender.stdout.Target=System.out 8 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1}:%L - %m%n 10 | 11 | log4j.logger.edu.illinois.cs.cogcomp.ner.InferenceMethods = WARN 12 | log4j.logger.edu.illinois.cs.cogcomp.ner.ExpressiveFeatures = DEBUG 13 | log4j.logger.edu.illinois.cs.cogcomp.curator.CuratorClient = WARN 14 | log4j.logger.edu.illinois.cs.cogcomp.annotation.handler.NERAnnotator = OFF -------------------------------------------------------------------------------- /src/main/resources/static/css/style.css: -------------------------------------------------------------------------------- 1 | .pointer { 2 | cursor: pointer; 3 | /*margin-right: 3px;*/ 4 | display: inline-block; 5 | } 6 | 7 | .spacer { 8 | height: 15px; 9 | } 10 | 11 | #spinner { 12 | /*width: 100px; 13 | height: 50px;*/ 14 | position: absolute; 15 | left: 50%; 16 | top:
50%; 17 | margin-top: -150px; 18 | margin-left: -250px; 19 | } 20 | 21 | .cons { 22 | /*padding-top: 2px;*/ 23 | /*padding-bottom: 2px;*/ 24 | border-radius: 5px; 25 | } 26 | 27 | .token { 28 | padding: 1px; 29 | /*padding-left: 1px;*/ 30 | /*padding-right: 1px;*/ 31 | border: 1px solid transparent; 32 | } 33 | 34 | .bestlabel { 35 | border: black 1px solid; 36 | color: black; 37 | } 38 | 39 | .ignorelabel { 40 | color: darkgray; 41 | background-color: white; 42 | } 43 | 44 | .suggestion { 45 | border-bottom: 5px solid DarkSlateBlue; 46 | } 47 | 48 | .tooltip-inner { 49 | max-width: 200px; 50 | padding: 3px 8px; 51 | color: black; 52 | text-align: center; 53 | background-color: white; 54 | border-radius: 4px; 55 | border: 1px solid lightgray; 56 | } 57 | 58 | .suffix { 59 | color: lightgray; 60 | } 61 | 62 | .clickable-row { 63 | cursor: pointer; 64 | } 65 | 66 | #legendcontainer { 67 | padding: 7px; 68 | } 69 | 70 | .legend { 71 | width: 10px; 72 | height: 10px; 73 | padding: 3px; 74 | border-radius: 3px; 75 | } 76 | 77 | .text { 78 | font-size: 12pt; 79 | text-align: initial; 80 | } 81 | 82 | .nocopy{ 83 | -webkit-user-select: none; /* Chrome all / Safari all */ 84 | -moz-user-select: none; /* Firefox all */ 85 | -ms-user-select: none; /* IE 10+ */ 86 | } 87 | 88 | #text { 89 | font-size: 12pt; 90 | } 91 | 92 | .popover{ 93 | max-width: 600px; /* Max Width of the popover (depending on the container!) 
*/ 94 | } 95 | 96 | #savebutton { 97 | width: 100px; 98 | } 99 | 100 | #tagdescriptions { 101 | margin: 10px; 102 | } 103 | 104 | #tagdescriptions span { 105 | padding: 2px; 106 | border-radius: 2px; 107 | } 108 | 109 | .def { 110 | font-style: italic; 111 | } 112 | 113 | body { 114 | margin-top: 70px; 115 | } 116 | 117 | .emph { 118 | font-weight: bold; 119 | } 120 | 121 | .highlightsingle { 122 | border: lightgray solid 1px; 123 | border-radius: 3px; 124 | } 125 | 126 | .highlightstart { 127 | border: 1px solid lightgray; 128 | border-right-color: transparent; 129 | border-radius: 3px 0 0 3px; 130 | } 131 | 132 | .highlighted { 133 | border: 1px solid lightgray; 134 | border-right-color: transparent; 135 | border-left-color: transparent; 136 | } 137 | 138 | .highlightend { 139 | border: 1px solid lightgray; 140 | border-left-color: transparent; 141 | border-radius: 0 3px 3px 0; 142 | } 143 | 144 | /*.highlighted {*/ 145 | /*background-color: #fff2ac;*/ 146 | /*}*/ 147 | 148 | mark{ 149 | padding: 0; 150 | background: transparent; 151 | color: red; 152 | } -------------------------------------------------------------------------------- /src/main/resources/static/img/disk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/disk.png -------------------------------------------------------------------------------- /src/main/resources/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/favicon.ico -------------------------------------------------------------------------------- /src/main/resources/static/img/favicon.xcf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/favicon.xcf -------------------------------------------------------------------------------- /src/main/resources/static/img/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/loading.gif -------------------------------------------------------------------------------- /src/main/resources/static/img/logo-black-trans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo-black-trans.png -------------------------------------------------------------------------------- /src/main/resources/static/img/logo-black-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo-black-white.png -------------------------------------------------------------------------------- /src/main/resources/static/img/logo-grey-trans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo-grey-trans.png -------------------------------------------------------------------------------- /src/main/resources/static/img/logo-white-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo-white-black.png -------------------------------------------------------------------------------- 
/src/main/resources/static/img/logo-white-trans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo-white-trans.png -------------------------------------------------------------------------------- /src/main/resources/static/img/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/logo.xcf -------------------------------------------------------------------------------- /src/main/resources/static/img/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/screenshot.png -------------------------------------------------------------------------------- /src/main/resources/static/img/selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogComp/talen/31fad4797bef66919820b4d15290418f2ddbc2bc/src/main/resources/static/img/selection.png -------------------------------------------------------------------------------- /src/main/resources/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | Title 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 54 | 55 | 56 |
NO CONTENT HERE
57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/main/resources/templates/dict.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Unified Annotation 5 | 6 | 7 | 8 |
9 | 14 |
15 | 16 |
17 |
18 | 19 |
20 | 21 |
22 | 23 |
24 |
25 | 26 | 30 | 31 | 32 |
33 | 34 |
35 |
36 | 37 |

38 | 39 | 40 |
41 | 42 |
43 | 44 |
45 |
46 | 47 | 48 | 49 |
50 |
51 |

Definition

52 |
53 |
54 | 55 |
56 |
57 |
58 |
59 |
60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 |
KeyDefinition
75 | 76 |
77 |
78 | 79 |
80 | 81 |
82 | 83 | 84 | 85 |
86 | 87 | 88 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /src/main/resources/templates/document/annotation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Document Annotation 5 | 6 | 13 | 14 |
15 | 21 |
22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/resources/templates/document/doc-base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Annotation Home 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/resources/templates/document/getstarted.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Documents 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |
13 | 20 | 21 | 22 |

Let's get started

23 | 24 |
25 | 26 |
27 |
28 | Info 29 |
30 |
31 |

Green rows are documents you have seen and saved. Click on a row to see that document.

32 | 33 |
34 |
35 | 36 |
37 |
38 | Corpus Statistics 39 |
40 |
    41 |
  • 42 |
43 |
44 |
45 | 46 |
47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
IDNum LabelsText annotation
64 | 65 |
66 |
67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /src/main/resources/templates/document/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Choose a dataset 5 | 6 | 7 | 8 |
9 | 10 |
11 | 12 |
13 |
14 |
15 | 16 |
17 |

Welcome!

18 |

Looks like you are new here. This page will display datasets once they are created 19 | and the config file is placed in the config folder.

20 | 21 |

Here is a small example config file:

22 |
23 | name    MyDatasetName
24 | path    /path/to/file
25 | labels  LABEL1 LABEL2 LABEL3
26 |                     
27 | 28 |

Important: the name of this config file must be prefixed with doc- in order to show up here.

29 | 30 | 31 |
32 | 33 |
34 |

Please select an option:

35 |
36 | 37 |
38 |
39 |
40 |
41 |
42 | 43 |
44 | 45 |
46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/main/resources/templates/fragments/nav.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | Title 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/main/resources/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Annotation Home 5 | 6 | 7 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 | 31 |
32 |
33 | 34 |
35 |
36 |

Hello, [[${#httpServletRequest.remoteUser}]]!

37 | 38 |
39 |

Refresh page to reload config files.

40 | 41 | 42 | 43 |
44 | 45 | 46 | 47 | 123 | 124 | 125 |
126 |
127 |
128 | 129 |
130 |
131 | 132 |
133 | 134 | 135 |
136 |
137 | Document options: 138 |
139 | 140 | 149 |
150 | 151 |
152 | 153 |
154 | 155 | 156 |
157 |
158 | Sentence options: 159 |
160 | 161 |
162 | 163 | Add new... 164 |
165 |
166 |
167 | 168 |
169 | 170 |
171 |
172 | 173 |
174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /src/main/resources/templates/instructions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Getting Started 5 |
6 | 7 | 8 | 9 |
10 |
11 |

Instructions

12 | 13 |

Assume the language of interest is Tamil.

14 | 15 |

Click "Load dictionary" in the banner, and load the Tamil dictionary.

16 | 17 |

Then go to "All Annotations" and click on Tamil. This will show you a listing of documents. Click on the first one and start annotating. The order doesn't matter, but it might be simplest to start at the beginning.

18 | 19 |

To annotate: click on a word and choose the tag. To annotate a phrase, highlight the text. There are also rules on the right-hand side. These are not necessarily correct; they are only aids that show you previous annotations. Click on a rule to apply it.

20 | 21 |

Right click to remove tags.

22 | 23 |

Click on Toggle Dictionary to show English translations in place. Feel free to go back and fix previous annotations (in fact, this is highly recommended).

24 | 25 | 26 |
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/main/resources/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | YOUR TITLE HERE 5 | 6 | 7 | 8 |
9 | 10 |
11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/main/resources/templates/login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Login 5 | 6 | 7 | 8 |
9 |
10 |
11 |
12 |
13 |
14 | 15 |
16 |
17 | 18 |
19 | 20 | 21 |
22 |
23 | 24 | 25 |
26 |
27 | 28 | 29 |
30 |
31 |
32 | 33 |
34 | 35 |
36 | 37 |
38 |
39 |
40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/main/resources/templates/sentence/annotation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Sentence Annotation 5 | 6 | 13 | 14 |
15 | 20 |
21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/main/resources/templates/sentence/getstarted.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Documents 5 | 6 | 7 | 8 | 9 |
10 |
11 | 12 |

Let's get started

13 | 14 | 15 | 20 | 21 | 22 |
23 | 24 |
25 | 26 |
27 | 28 |
29 |
30 |
31 |
32 | 33 |
34 | 35 | 36 | 37 | 38 | 43 | 44 | 45 |
46 | 47 |
48 |
49 |
50 |
51 |
52 |

53 |

54 |

55 |

56 | 57 |
58 |
59 |
60 | 61 | 62 |
63 | 64 |
65 | 66 |
67 |
68 | 75 |
76 |
77 | 78 |
79 |

80 |

81 |

82 | 83 |
84 |
85 |
86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 |
107 | 108 |
109 |
110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /src/main/resources/templates/sentence/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Choose a dataset 5 | 6 | 7 | 8 |
9 | 10 |
11 | 12 |
13 | 16 | 17 |
18 |
19 | 20 |

Please select an option:

21 |
22 | 23 |
24 | 25 |
26 | 27 |
28 |
29 |
30 |
31 | 32 | 33 | 50 | 51 | 52 |
53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/main/resources/templates/sentence/sent-base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Sentence Annotation 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/resources/templates/unified-annotation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Unified Annotation 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | Save 21 | 22 | 23 | 24 | 25 |
26 | 27 | 28 | Options: 29 | 30 |
31 |
32 | 33 |
34 | 35 | 36 |
37 |
38 |
39 | 40 |
41 |
42 | 43 |
44 | 45 | 46 |
47 |
48 |
49 | 50 |
51 |
52 | 53 |
54 | 55 | 56 |
57 |
58 |
59 | 60 |
61 |
62 | 63 |
64 | 65 | 66 |
67 |
68 |
69 | 70 |
71 |
72 | 73 | 77 | 78 |
79 |
80 | 81 | 82 | 83 | 84 |
85 | 86 |
87 | Previous 88 | Next 89 |
90 | 91 | 92 |
93 | 94 |
95 | 96 |
97 | 98 | Back to top 99 | 100 |
101 | 102 |
103 |
104 | 105 | 106 | 107 | 111 | 112 | 147 | 148 |
149 |
150 | 151 | 152 | 153 | --------------------------------------------------------------------------------