├── .gitignore ├── LICENSE ├── README.rst ├── data ├── AnEM │ ├── ATTRIBUTION │ ├── CONLL-format │ │ ├── data │ │ │ ├── AnEM.test │ │ │ └── AnEM.train │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ ├── LICENSE │ └── README.rst ├── BBN │ ├── CONLL-format │ │ ├── docs │ │ │ ├── corpusconfig.cfg │ │ │ ├── corrections.rst │ │ │ ├── entity-list.txt │ │ │ ├── tagset-with_DESC.txt │ │ │ └── tagset.txt │ │ └── utils │ │ │ └── bbn2conll.py │ ├── README.rst │ ├── clean.sh │ ├── original_BBN_dataset │ │ └── README.md │ └── rootadd.py ├── BTC │ ├── CONLL-format │ │ ├── data │ │ │ ├── a.conll │ │ │ ├── b.conll │ │ │ ├── e.conll │ │ │ ├── f.conll │ │ │ ├── g.conll │ │ │ └── h.conll │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ ├── LICENSE │ ├── NOTE.rst │ └── README.md ├── CADEC │ ├── CONLL-format │ │ ├── data_orig │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ ├── LICENSE │ └── README.rst ├── GUM │ ├── CONLL-format │ │ ├── data-by-corpustype │ │ │ ├── interview │ │ │ │ ├── GUM_interview_ants.tsv │ │ │ │ ├── GUM_interview_brotherhood.tsv │ │ │ │ ├── GUM_interview_chomsky.tsv │ │ │ │ ├── GUM_interview_cocktail.tsv │ │ │ │ ├── GUM_interview_cyclone.tsv │ │ │ │ ├── GUM_interview_daly.tsv │ │ │ │ ├── GUM_interview_dungeon.tsv │ │ │ │ ├── GUM_interview_gaming.tsv │ │ │ │ ├── GUM_interview_herrick.tsv │ │ │ │ ├── GUM_interview_hill.tsv │ │ │ │ ├── GUM_interview_libertarian.tsv │ │ │ │ ├── GUM_interview_licen.tsv │ │ │ │ ├── GUM_interview_mcguire.tsv │ │ │ │ ├── GUM_interview_mckenzie.tsv │ │ │ │ ├── GUM_interview_messina.tsv │ │ │ │ ├── GUM_interview_onion.tsv │ │ │ │ ├── GUM_interview_peres.tsv │ │ │ │ ├── GUM_interview_shalev.tsv │ │ │ │ └── GUM_interview_stardust.tsv │ │ │ ├── news │ │ │ │ ├── GUM_news_asylum.tsv │ │ │ │ ├── GUM_news_clock.tsv │ │ │ │ ├── GUM_news_crane.tsv │ │ │ │ ├── GUM_news_defector.tsv │ │ │ │ ├── GUM_news_election.tsv │ │ │ │ ├── GUM_news_expo.tsv │ │ │ │ ├── GUM_news_flag.tsv │ 
│ │ │ ├── GUM_news_hackers.tsv │ │ │ │ ├── GUM_news_homeopathic.tsv │ │ │ │ ├── GUM_news_ie9.tsv │ │ │ │ ├── GUM_news_imprisoned.tsv │ │ │ │ ├── GUM_news_iodine.tsv │ │ │ │ ├── GUM_news_korea.tsv │ │ │ │ ├── GUM_news_lanterns.tsv │ │ │ │ ├── GUM_news_nasa.tsv │ │ │ │ ├── GUM_news_sensitive.tsv │ │ │ │ ├── GUM_news_stampede.tsv │ │ │ │ ├── GUM_news_taxes.tsv │ │ │ │ ├── GUM_news_warhol.tsv │ │ │ │ ├── GUM_news_warming.tsv │ │ │ │ └── GUM_news_worship.tsv │ │ │ ├── voyage │ │ │ │ ├── GUM_voyage_athens.tsv │ │ │ │ ├── GUM_voyage_chatham.tsv │ │ │ │ ├── GUM_voyage_cleveland.tsv │ │ │ │ ├── GUM_voyage_coron.tsv │ │ │ │ ├── GUM_voyage_cuba.tsv │ │ │ │ ├── GUM_voyage_fortlee.tsv │ │ │ │ ├── GUM_voyage_guadeloupe.tsv │ │ │ │ ├── GUM_voyage_isfahan.tsv │ │ │ │ ├── GUM_voyage_lodz.tsv │ │ │ │ ├── GUM_voyage_merida.tsv │ │ │ │ ├── GUM_voyage_oakland.tsv │ │ │ │ ├── GUM_voyage_phoenix.tsv │ │ │ │ ├── GUM_voyage_socotra.tsv │ │ │ │ ├── GUM_voyage_thailand.tsv │ │ │ │ ├── GUM_voyage_tulsa.tsv │ │ │ │ ├── GUM_voyage_vavau.tsv │ │ │ │ └── GUM_voyage_york.tsv │ │ │ └── whow │ │ │ │ ├── GUM_whow_arrogant.tsv │ │ │ │ ├── GUM_whow_ballet.tsv │ │ │ │ ├── GUM_whow_basil.tsv │ │ │ │ ├── GUM_whow_cactus.tsv │ │ │ │ ├── GUM_whow_chicken.tsv │ │ │ │ ├── GUM_whow_cupcakes.tsv │ │ │ │ ├── GUM_whow_elevator.tsv │ │ │ │ ├── GUM_whow_flirt.tsv │ │ │ │ ├── GUM_whow_glowstick.tsv │ │ │ │ ├── GUM_whow_joke.tsv │ │ │ │ ├── GUM_whow_languages.tsv │ │ │ │ ├── GUM_whow_mice.tsv │ │ │ │ ├── GUM_whow_overalls.tsv │ │ │ │ ├── GUM_whow_packing.tsv │ │ │ │ ├── GUM_whow_parachute.tsv │ │ │ │ ├── GUM_whow_procrastinating.tsv │ │ │ │ ├── GUM_whow_quidditch.tsv │ │ │ │ ├── GUM_whow_quinoa.tsv │ │ │ │ └── GUM_whow_skittles.tsv │ │ ├── data │ │ │ ├── test │ │ │ │ └── gum-test.conll │ │ │ └── train │ │ │ │ └── gum-train.conll │ │ ├── docs-train-test │ │ │ └── corpusconfig.cfg │ │ ├── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ │ └── utils │ │ │ └── webAnnotsv_to_conll.py │ ├── LICENSE.txt │ └── 
README.rst ├── MITMovieCorpus │ ├── CONLL-format │ │ └── trivia10k13 │ │ │ ├── data │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── MITRestaurantCorpus │ ├── CONLL-format │ │ ├── data │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── MUC-6 │ ├── CONLL-format │ │ ├── data │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── MalwareTextDB │ ├── CONLL-format │ │ ├── data │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── NIST_IEER │ ├── CONLL-format │ │ ├── data │ │ │ └── README │ │ ├── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ │ └── utils │ │ │ ├── makeconll.py │ │ │ └── quick_comma_fix.py │ └── README.rst ├── Ritter │ ├── CONLL-format │ │ ├── data │ │ │ ├── dev │ │ │ │ └── README │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── SEC-filings │ ├── CONLL-format │ │ ├── data │ │ │ ├── test │ │ │ │ └── FIN3.txt │ │ │ └── train │ │ │ │ └── FIN5.txt │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ ├── NOTE.txt │ └── README.txt ├── WNUT17 │ ├── CITATION │ ├── CONLL-format │ │ ├── data │ │ │ ├── dev │ │ │ │ └── emerging.dev.conll │ │ │ ├── test │ │ │ │ └── emerging.test.annotated │ │ │ └── train │ │ │ │ └── wnut17train.conll │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ ├── LICENSE │ └── README.rst ├── conll2003 │ ├── CONLL-format │ │ ├── data │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── gmb │ └── gmb-1.0.0 │ │ ├── CONLL-format │ │ ├── data_original │ │ │ └── README │ │ └── docs │ │ │ ├── 
corpusconfig.cfg │ │ │ └── entity-list.txt │ │ └── README.rst ├── i2b2_2006 │ ├── CONLL-format │ │ ├── data │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ ├── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ │ └── utils │ │ │ └── i2b2toconll.py │ └── README.rst ├── i2b2_2014 │ ├── CONLL-format │ │ ├── data │ │ │ ├── dev │ │ │ │ └── README │ │ │ ├── test │ │ │ │ └── README │ │ │ └── train │ │ │ │ └── README │ │ └── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ └── README.rst ├── re3d │ ├── CONLL-format │ │ ├── data-by-source │ │ │ └── conll │ │ │ │ ├── Australian_Department_of_Foreign_Affairs │ │ │ │ ├── 2FA0B45CA064AF09334944B6D68B5987.conll │ │ │ │ ├── 58A3656BDCA7B89C38CE5A1985834740.conll │ │ │ │ ├── D863D678C3A4BD8EDFCE2B7D27881974.conll │ │ │ │ └── DEC7A9B44536B3E01C85E64BAF2E07A6.conll │ │ │ │ ├── BBC_Online │ │ │ │ ├── 3095FB15AFF7B552702E9EF46C083FDD.conll │ │ │ │ ├── 55AE94FC6D11B6BC1A89A144F6BA49FF.conll │ │ │ │ ├── 62A11A900C2139D329C5F0E29B0B2AC4.conll │ │ │ │ ├── A0302EA7B1BDE005835BC09ECBF2930A.conll │ │ │ │ ├── A575C2B05115236A223DBE644F73BE9A.conll │ │ │ │ ├── AEBFCD2500E5B57C7C416020C270EA5A.conll │ │ │ │ ├── DC40EB7B4BDB1254DAF8A430A8E5383B.conll │ │ │ │ └── DDC23537DF2DA27754B9327683DCA114.conll │ │ │ │ ├── CENTCOM │ │ │ │ ├── 001C9C3F3DFE16B4921B1E906F66E161.conll │ │ │ │ ├── 1B05F2376ED66CA9FFEB00BE752C16DC.conll │ │ │ │ ├── 26CC32861EF7D38FBAE9CABAFC7487E1.conll │ │ │ │ ├── 28829CF80D445874061C5948E6A3751D.conll │ │ │ │ ├── 482C10613016D2F430C464B2D13A8F41.conll │ │ │ │ ├── 5FBCC6E1D5B4B2A699196670789C6B0B.conll │ │ │ │ ├── 6646F63F6AC6B483D15DA258CB182147.conll │ │ │ │ ├── 6874F49C56340E2BD65BF958916C97AE.conll │ │ │ │ ├── 68A04D0D3111DBD88DE961B5727ADFA4.conll │ │ │ │ ├── 6B183780227A18218A12E917A5BC8654.conll │ │ │ │ ├── 76957901E4278B498901454FA209CFFA.conll │ │ │ │ ├── 8BC6B4D334BC5607D97A1BA18854B513.conll │ │ │ │ ├── B2E5F55F5D7E31B291D9F76E8E4AC75D.conll │ │ │ │ ├── 
C044623AD872323AF717F26F21E9352B.conll │ │ │ │ ├── D21D5CC3C9BEF5BF1D44976154E580A5.conll │ │ │ │ ├── D52939C747F4DB8D1ECF6D415559ADA0.conll │ │ │ │ ├── E19357552CAE0E45EE364E60A317AC59.conll │ │ │ │ ├── E50B8563BB8395102D95333699FF73A3.conll │ │ │ │ ├── E9035FF1D0DA74A11674C82688C05C51.conll │ │ │ │ ├── F5B5E86320A4E95E92A7EA2FB8E8B484.conll │ │ │ │ ├── F7893F2F4E38C8E00C7D22E22E100428.conll │ │ │ │ ├── F7AC57DBBAD0284C8C0DE6566B9B5C29.conll │ │ │ │ └── FEB7BC61C9765C9859F46B9B634F039F.conll │ │ │ │ ├── Delegation_of_the_European_Union_to_Syria │ │ │ │ ├── 26E0189E3D774CB2B8F078A082E5088C.conll │ │ │ │ └── 8BF4B60AA80B0601BE12F0164E2E7001.conll │ │ │ │ ├── UK_Government │ │ │ │ ├── 09FF200CBB76594D688F8EB565717543.conll │ │ │ │ ├── 1101048768E50FA1F8DA3F7B7CD105AB.conll │ │ │ │ ├── 1563238E6CF89746FDD44909F11EE319.conll │ │ │ │ ├── 1718A3656CA87E3826A28BC3D041F024.conll │ │ │ │ ├── 189862CF0AC0BBFA012D27D85CE90BE7.conll │ │ │ │ ├── 208C0A8E536C75AB9D151D7030276BE9.conll │ │ │ │ ├── 32AACD4CCB88ACB2C0FAB156E7BDCC2F.conll │ │ │ │ ├── 33A216AD3A006BFF03390675E6C56594.conll │ │ │ │ ├── 3A7EC63500380D5B35C211D4FF5E20EA.conll │ │ │ │ ├── 6CB38F21A7E71AD4FD0044E4FCE78F52.conll │ │ │ │ ├── 7064D852A8BF7FB8126978D3AC40E4FC.conll │ │ │ │ ├── 7148F3D092DA3E560332528D7167DF0F.conll │ │ │ │ ├── 74E7DCD5876B4EF1E95448F0E0C073CD.conll │ │ │ │ ├── 78C6FA20C3559F23A36974F2B22BC73A.conll │ │ │ │ ├── 821F5BBD9F50DD7301C12ECCC926B00C.conll │ │ │ │ ├── 85F0CF36CAEE839799563350B30B96B1.conll │ │ │ │ ├── 8A716D1B7D81B723DFD8361BAEA100DC.conll │ │ │ │ ├── 8B3BA274503F6D30F7E80C41E7568744.conll │ │ │ │ ├── AB37B1171B0A38B24A6B74B18903E8D5.conll │ │ │ │ ├── C35F45FE7CBA87AE07646066842C91E0.conll │ │ │ │ ├── C3BB2D0EE548765402C961463415E4CA.conll │ │ │ │ ├── C66A9E77E1142C3F0909657DDDD475A5.conll │ │ │ │ ├── CBD73D24324D3C3A3A76AC430E05A069.conll │ │ │ │ ├── D23AD6D80730BEF27D23CDD2287AC200.conll │ │ │ │ ├── E967805EB658EB937216A53EE9733656.conll │ │ │ │ └── 
ED9C03D12FCA518C78DD6C3909CA5E42.conll │ │ │ │ ├── US_State_Department │ │ │ │ ├── 108A8CFC05CC3139A6E65B7A4239F5C6.conll │ │ │ │ ├── 1E4938BF59D806C4E3AC9DDF2CC3E87F.conll │ │ │ │ ├── 1F9A1D68D16594512C9CBAD02D53E4C1.conll │ │ │ │ ├── 204532F7EDD22A40E15407ADA16AC058.conll │ │ │ │ ├── 2C271BB862754AECF35665280FE7B93C.conll │ │ │ │ ├── 2C6CB502FAD51B6B4CA765E57E221BEA.conll │ │ │ │ ├── 2E857EFC55A390B6382FDE5C87F06D67.conll │ │ │ │ ├── 31E63458C6BF8F6F1D50D7041E5B89C9.conll │ │ │ │ ├── 340238F0CEF0581CBBA9106FFB2EA97C.conll │ │ │ │ ├── 4174EF50EF8D9024BCC261D602202738.conll │ │ │ │ ├── 487C0B93A959D64A34CA2F40EE82C18B.conll │ │ │ │ ├── 6413ED35E27EFB504C689047F2F17B0F.conll │ │ │ │ ├── 6E62DC65EFDFB1B357175EB255F25C36.conll │ │ │ │ ├── 8EC0382F2D0E4360EC7A85FA8FEA02D8.conll │ │ │ │ ├── 9104BDCC8E9A1CD2090757B487421DC6.conll │ │ │ │ ├── 9DFBD0B607245D1B56B90FD64E9AB952.conll │ │ │ │ ├── B5E6AB37F9BF98EC2A4EFDDACE2768C9.conll │ │ │ │ ├── C02FDFE461425ED8B64B982D05BD2D62.conll │ │ │ │ ├── CFEF9B4E83786F435E8483A03DB20D8E.conll │ │ │ │ ├── E1526BBD89FAB8E7914B3CF5867666CC.conll │ │ │ │ ├── F669C19A4C3DB410BBEC57ECC5DC62B1.conll │ │ │ │ └── FAB032C0995F606F54B1DD3EA8D61930.conll │ │ │ │ └── Wikipedia │ │ │ │ ├── 0388FEF6427DE0223688EB04A8DA71E1.conll │ │ │ │ ├── 051A4176702090F39056F34A9C29D574.conll │ │ │ │ ├── 22BCFA25729AF03473435DAAAE67F5B8.conll │ │ │ │ ├── 2B0DB9AA5ACC73214D2C69C20AA734A8.conll │ │ │ │ ├── 2ED27E1CBF9EFDEF369E721DC948D2F4.conll │ │ │ │ ├── 3B6F5E49DF05198FD982806C6CA1591C.conll │ │ │ │ ├── 76F75D985AD7642B43A8AA4DBC146621.conll │ │ │ │ ├── 774684266A20284937043C2BC997EC3B.conll │ │ │ │ ├── 7EA8D40D4CBE1191B91C9B3503A07338.conll │ │ │ │ ├── 99036C6E6DB96D33FA0830D3095E7B71.conll │ │ │ │ ├── B7CAC5946BE12615BA4815FFE0FE4C54.conll │ │ │ │ ├── BC4D46232E91E65C8C69F4F6F2D9C618.conll │ │ │ │ └── FD93899C448B33796DBBB7BBFEEFA3A6.conll │ │ ├── data │ │ │ ├── README.md │ │ │ ├── test │ │ │ │ └── re3d-test.conll │ │ │ └── train │ │ │ │ └── re3d-train.conll 
│ │ ├── docs │ │ │ ├── corpusconfig.cfg │ │ │ └── entity-list.txt │ │ └── utils │ │ │ └── re3d_to_bratann.py │ ├── LICENSES │ │ ├── Australian_Department_of_Foreign_Affairs │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ ├── BBC_Online │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ ├── CENTCOM │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ ├── Delegation_of_the_European_Union_to_Syria │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ ├── README │ │ ├── UK_Government │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ ├── US_State_Department │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ │ └── Wikipedia │ │ │ ├── ATTRIBUTION │ │ │ └── LICENSE │ └── README.rst └── wikigold │ ├── CONLL-format │ ├── data │ │ └── wikigold.conll.txt │ └── docs │ │ ├── corpusconfig.cfg │ │ └── entity-list.txt │ ├── LICENSE │ └── README.rst ├── requirements.txt └── src ├── file_locations.cfg ├── sentence_utils.py ├── stratified_split.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Applied Research Laboratories, The University of Texas at Austin (ARL:UT). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /data/AnEM/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Files AnEM.train and AnEM.test from AnEM-1.0.4/test are included here 2 | (in directory CONLL-format/data) and have not been modified. 3 | 4 | The corpus is presented in the following manuscript: 5 | 6 | * Tomoko Ohta, Sampo Pyysalo, Jun'ichi Tsujii and Sophia Ananiadou (2012). 7 | Open-domain Anatomical Entity Mention Detection. 8 | In Proceedings of ACL 2012 Workshop on Detecting Structure in 9 | Scholarly Discourse (DSSD). 
10 | 11 | The corpus home page is located at http://www.nactem.ac.uk/anatomy/ 12 | 13 | -------------------------------------------------------------------------------- /data/AnEM/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = Anatomical_system 4 | Cell 5 | Cellular_component 6 | Developing_anatomical_structure 7 | Immaterial_anatomical_entity 8 | Multi-tissue_structure 9 | Organ 10 | Organism_subdivision 11 | Organism_substance 12 | Pathological_formation 13 | Tissue 14 | 15 | [IOB-format] 16 | 17 | IOB = IOB2 18 | 19 | [file-settings] 20 | 21 | sep = tab 22 | iob_pos = 3 23 | word_pos = 0 24 | pos_pos = none 25 | -------------------------------------------------------------------------------- /data/AnEM/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | Anatomical_system 4 | Cell 5 | Cellular_component 6 | Developing_anatomical_structure 7 | Immaterial_anatomical_entity 8 | Multi-tissue_structure 9 | Organ 10 | Organism_subdivision 11 | Organism_substance 12 | Pathological_formation 13 | Tissue 14 | 15 | [format] 16 | 17 | IOB2 18 | 19 | [notes] 20 | 21 | [content] 22 | 23 | [stats] 24 | 25 | tokens sentences 26 | -------------------------- 27 | train 71697 2815 28 | test 45939 1882 29 | -------------------------- 30 | all 117636 4697 31 | 32 | Entities: 33 | 34 | -------------------------------------- 35 | Anatomical_system 51 36 | Cell 776 37 | Cellular_component 199 38 | Developing_anatomical_structure 39 39 | Immaterial_anatomical_entity 60 40 | Multi-tissue_structure 639 41 | Organ 381 42 | Organism_subdivision 162 43 | Organism_substance 291 44 | Pathological_formation 368 45 | Tissue 169 46 | 47 | -------------------------------------------------------------------------------- /data/AnEM/LICENSE: 
-------------------------------------------------------------------------------- 1 | LICENSE 2 | 3 | 1. Annotations 4 | 5 | The AnEM corpus annotations licenced under the Creative Commons BY-SA 6 | 3.0 license. See http://creativecommons.org/licenses/by-sa/3.0/ for 7 | the license terms. 8 | 9 | Please attribute the corpus by citing the DSSD'12 paper referenced 10 | below in publications and linking to http://www.nactem.ac.uk/anatomy/ 11 | in online resources. 12 | 13 | 2. Texts 14 | 15 | The abstracts contained in the corpus are from PubMed, a database of 16 | the U.S. National Library of Medicine (NLM). Please see the NLM page 17 | on copyright information regarding the copyright of the abstracts: 18 | http://www.nlm.nih.gov/databases/download.html 19 | 20 | The full text extracts contained in the corpus are from articles in 21 | the Open Access Subset of the PubMed Central (PMC) database of the 22 | NLM. These articles are made available under a Creative Commons or 23 | similar license. Please see the REFERENCES file in this distribution 24 | for references to the articles and the PMC version of each article for 25 | the specific license terms. 26 | 27 | 28 | References 29 | 30 | Please attribute the corpus by citing the following paper: 31 | 32 | * Tomoko Ohta, Sampo Pyysalo, Jun'ichi Tsujii and Sophia Ananiadou (2012). 33 | Open-domain Anatomical Entity Mention Detection. 34 | In Proceedings of ACL 2012 Workshop on Detecting Structure in 35 | Scholarly Discourse (DSSD). 36 | -------------------------------------------------------------------------------- /data/AnEM/README.rst: -------------------------------------------------------------------------------- 1 | The Anatomical Entity Mention (AnEM) corpus 2 | =========================================== 3 | 4 | This dataset consists of abstracts and full-text biomedical papers. 
5 | It was downloaded in November 2017 from: 6 | 7 | http://www.nactem.ac.uk/anatomy/ 8 | 9 | Specifically: 10 | 11 | http://www.nactem.ac.uk/anatomy/data/AnEM-1.0.4.tar.gz 12 | 13 | Files AnEM.test and AnEM.train from the AnEM-1.0.4/test directory were 14 | moved to the directory CONLL-format/data. No changes were made. 15 | 16 | See also 17 | -------- 18 | 19 | http://brat.nlplab.org/examples.html#corpus-examples-brat 20 | 21 | Attribution 22 | ----------- 23 | 24 | Tomoko Ohta, Sampo Pyysalo, Jun'ichi Tsujii and Sophia Ananiadou (2012). 25 | Open-domain Anatomical Entity Mention Detection. In Proceedings of ACL 2012 26 | Workshop on Detecting Structure in Scholarly Discourse (DSSD), pp. 27-36. 27 | 28 | For more information, see: 29 | 30 | http://www.nactem.ac.uk/anatomy/ 31 | -------------------------------------------------------------------------------- /data/BBN/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = TODO 4 | 5 | [IOB-format] 6 | 7 | IOB = IOB2 8 | 9 | [file-settings] 10 | 11 | sep = tab 12 | iob_pos = 1 13 | word_pos = 0 14 | pos_pos = none 15 | -------------------------------------------------------------------------------- /data/BBN/CONLL-format/docs/corrections.rst: -------------------------------------------------------------------------------- 1 | # THESE ARE INCORRECT: 2 | ##################################################################### 3 | FAC_DESC:STREET_HIGHWAY -> FAC_DESC:HIGHWAY_STREET 4 | PRODCUT:OTHER -> PRODUCT:OTHER 5 | FAC:HOTEL -> ORGANIZATION:HOTEL 6 | DATE -> DATE:DATE [wsj06c.qa] 7 | LOCATION -> LOCATION:OTHER # 5 mentions (districts) 8 | 9 | ###################################################################### 10 | # 13 more types removed. 
11 | 12 | # There were too few (< 20) of the following entity types, 13 | # so they were changed as follows: 14 | 15 | CONTACT_INFO:OTHER -> O # 3 mentions 16 | SUBSTANCE:NUCLEAR -> O # 3 mentions 17 | CONTACT_INFO:ADDRESS -> O # 4 mentions 18 | 19 | LOCATION:BORDER -> LOCATION:OTHER # 1 mention 20 | LOCATION:CITY -> GPE:CITY # 1 mention 21 | 22 | ORGANIZATION:STATE_PROVINCE -> GPE:CITY # 1 mention 23 | ORGANIZATION:CITY -> GPE:CITY # 2 mentions 24 | 25 | QUANTITY:TEMPERATURE -> QUANTITY:OTHER # 1 mention 26 | QUANTITY:SPEED -> QUANTITY:OTHER # 14 mentions 27 | QUANTITY:ENERGY -> QUANTITY:OTHER # 20 mentions, but 3 of them are incorrect 28 | # (kilobytes and megabytes). 29 | 30 | PRODUCT:FOOD -> PRODUCT:OTHER # 1 mention 31 | PRODUCT:DRUG -> SUBSTANCE:DRUG # 2 mentions 32 | 33 | WORK_OF_ART:PAINTING -> WORK_OF_ART:OTHER # 13 mentions 34 | 35 | 36 | -------------------------------------------------------------------------------- /data/BBN/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | ANIMAL 4 | CARDINAL 5 | CONTACT_INFO 6 | DATE 7 | DISEASE 8 | EVENT 9 | FAC 10 | GAME 11 | GPE 12 | LANGUAGE 13 | LAW 14 | LOCATION 15 | MONEY 16 | NORP 17 | ORDINAL 18 | ORGANIZATION 19 | PERCENT 20 | PERSON 21 | PLANT 22 | PRODUCT 23 | QUANTITY 24 | SUBSTANCE 25 | TIME 26 | WORK_OF_ART 27 | 28 | [format] 29 | 30 | none 31 | 32 | [notes] 33 | 34 | [content] 35 | 36 | [stats] 37 | 38 | sentences tokens 39 | --------------------------- 40 | train 38911 935514 41 | test 10000 240637 42 | --------------------------- 43 | all 48911 1176151 44 | 45 | 46 | -------------------------------------------------------------------------------- /data/BBN/CONLL-format/docs/tagset-with_DESC.txt: -------------------------------------------------------------------------------- 1 | Entity Label Counts 2 | ---------------------------------------- 3 | LANGUAGE 84 4 | LAW 382 5 | DISEASE 317 6 | ANIMAL 396 7 | 
CARDINAL 10321 8 | ORDINAL 1098 9 | GAME 90 10 | MONEY 11097 11 | PERCENT 5976 12 | PERSON 13751 13 | PER_DESC 26353 14 | PLANT 194 15 | TIME 1069 16 | CONTACT_INFO:PHONE 33 17 | FAC:AIRPORT 34 18 | FAC:ATTRACTION 13 19 | FAC:BRIDGE 41 20 | FAC:BUILDING 154 21 | FAC:HIGHWAY_STREET 116 22 | FAC:OTHER 87 23 | FAC_DESC:AIRPORT 53 24 | FAC_DESC:ATTRACTION 33 25 | FAC_DESC:BRIDGE 40 26 | FAC_DESC:BUILDING 1808 27 | FAC_DESC:HIGHWAY_STREET 206 28 | FAC_DESC:OTHER 430 29 | GPE:CITY 5606 30 | GPE:COUNTRY 5079 31 | GPE:STATE_PROVINCE 2694 32 | GPE:OTHER 192 33 | GPE_DESC:CITY 377 34 | GPE_DESC:COUNTRY 992 35 | GPE_DESC:STATE_PROVINCE 397 36 | GPE_DESC:OTHER 69 37 | ORGANIZATION:CORPORATION 23441 38 | ORGANIZATION:EDUCATIONAL 366 39 | ORGANIZATION:GOVERNMENT 4630 40 | ORGANIZATION:HOSPITAL 23 41 | ORGANIZATION:HOTEL 61 42 | ORGANIZATION:MUSEUM 14 43 | ORGANIZATION:POLITICAL 413 44 | ORGANIZATION:RELIGIOUS 44 45 | ORGANIZATION:OTHER 1255 46 | ORG_DESC:CORPORATION 15186 47 | ORG_DESC:EDUCATIONAL 238 48 | ORG_DESC:GOVERNMENT 2502 49 | ORG_DESC:HOSPITAL 55 50 | ORG_DESC:HOTEL 56 51 | ORG_DESC:MUSEUM 20 52 | ORG_DESC:POLITICAL 151 53 | ORG_DESC:RELIGIOUS 51 54 | ORG_DESC:OTHER 1191 55 | PRODUCT:VEHICLE 382 56 | PRODUCT:WEAPON 21 57 | PRODUCT:OTHER 521 58 | PRODUCT_DESC:VEHICLE 1223 59 | PRODUCT_DESC:WEAPON 132 60 | PRODUCT_DESC:OTHER 26 61 | DATE:AGE 620 62 | DATE:DATE 16124 63 | DATE:DURATION 3145 64 | DATE:OTHER 787 65 | EVENT:HURRICANE 104 66 | EVENT:WAR 47 67 | EVENT:OTHER 220 68 | LOCATION:CONTINENT 256 69 | LOCATION:LAKE_SEA_OCEAN 80 70 | LOCATION:REGION 526 71 | LOCATION:RIVER 39 72 | LOCATION:OTHER 187 73 | NORP:NATIONALITY 3238 74 | NORP:POLITICAL 677 75 | NORP:RELIGION 88 76 | NORP:OTHER 91 77 | QUANTITY:1D 188 78 | QUANTITY:2D 74 79 | QUANTITY:3D 105 80 | QUANTITY:WEIGHT 192 81 | QUANTITY:OTHER 69 82 | SUBSTANCE:CHEMICAL 531 83 | SUBSTANCE:DRUG 441 84 | SUBSTANCE:FOOD 888 85 | SUBSTANCE:OTHER 850 86 | WORK_OF_ART:BOOK 123 87 | WORK_OF_ART:PLAY 31 88 | WORK_OF_ART:SONG 
39 89 | WORK_OF_ART:OTHER 515 90 | -------------------------------------------------------------------------------- /data/BBN/CONLL-format/docs/tagset.txt: -------------------------------------------------------------------------------- 1 | Entity label Counts 2 | --------------------------------------- 3 | LANGUAGE 84 4 | LAW 382 5 | DISEASE 317 6 | ANIMAL 396 7 | CARDINAL 10321 8 | ORDINAL 1098 9 | GAME 90 10 | MONEY 11097 11 | PERCENT 5976 12 | PERSON 40104 13 | PLANT 194 14 | TIME 1069 15 | CONTACT_INFO:PHONE 33 16 | 17 | ################################# 18 | 19 | FAC:AIRPORT 87 20 | FAC:ATTRACTION 46 21 | FAC:BRIDGE 81 22 | FAC:BUILDING 1962 23 | FAC:HIGHWAY_STREET 322 24 | FAC:OTHER 517 25 | 26 | GPE:CITY 5983 27 | GPE:COUNTRY 6071 28 | GPE:STATE_PROVINCE 3091 29 | GPE:OTHER 261 30 | 31 | ORGANIZATION:CORPORATION 38627 32 | ORGANIZATION:EDUCATIONAL 604 33 | ORGANIZATION:GOVERNMENT 7132 34 | ORGANIZATION:HOSPITAL 78 35 | ORGANIZATION:HOTEL 117 36 | ORGANIZATION:MUSEUM 34 37 | ORGANIZATION:POLITICAL 564 38 | ORGANIZATION:RELIGIOUS 95 39 | ORGANIZATION:OTHER 2446 40 | 41 | PRODUCT:VEHICLE 1605 42 | PRODUCT:WEAPON 153 43 | PRODUCT:OTHER 547 44 | 45 | ################################## 46 | 47 | DATE:AGE 620 48 | DATE:DATE 16124 49 | DATE:DURATION 3145 50 | DATE:OTHER 787 51 | 52 | EVENT:HURRICANE 104 53 | EVENT:WAR 47 54 | EVENT:OTHER 220 55 | 56 | LOCATION:CONTINENT 256 57 | LOCATION:LAKE_SEA_OCEAN 80 58 | LOCATION:REGION 526 59 | LOCATION:RIVER 39 60 | LOCATION:OTHER 187 61 | 62 | NORP:NATIONALITY 3238 63 | NORP:POLITICAL 677 64 | NORP:RELIGION 88 65 | NORP:OTHER 91 66 | 67 | QUANTITY:1D 188 68 | QUANTITY:2D 74 69 | QUANTITY:3D 105 70 | QUANTITY:WEIGHT 192 71 | QUANTITY:OTHER 69 72 | 73 | SUBSTANCE:CHEMICAL 531 74 | SUBSTANCE:DRUG 441 75 | SUBSTANCE:FOOD 888 76 | SUBSTANCE:OTHER 850 77 | 78 | WORK_OF_ART:BOOK 123 79 | WORK_OF_ART:PLAY 31 80 | WORK_OF_ART:SONG 39 81 | WORK_OF_ART:OTHER 515 82 | 
-------------------------------------------------------------------------------- /data/BBN/README.rst: -------------------------------------------------------------------------------- 1 | The BBN Pronoun Coreference and Entity Type Corpus 2 | ================================================== 3 | 4 | The BBN corpus cannot be distributed due to licensing restrictions. It can 5 | be obtained from the LDC (catalog number LDC2005T33). 6 | 7 | However, we included a script (bbn2conll.py) that will convert the dataset 8 | into CONLL 2003 format: 9 | 10 | 1. Put the contents of the BBN dataset (available from LDC, catalog 11 | number LDC2005T33), in directory original_BBN_dataset. 12 | 13 | 2. Create directory 'data' within the CONLL-format directory. 14 | 15 | 3. Some minor cleaning must be done (some XML errors). These can be fixed 16 | by running: 17 | 18 | . clean.sh 19 | 20 | See below for details of what this script does. 21 | 22 | 4. Then, in directory utils, in Python 2: 23 | 24 | import bbn2conll 25 | bbn2conll.write_all_to_conll() 26 | 27 | Cleaning 28 | -------- 29 | 30 | This describes the cleaning that is done by script clean.sh. 31 | 32 | Label mismatches: 33 | 34 | * wsj00c.qa, line 194: mismatch: more than 4,000 hours . 35 | NUMEX should be TIMEX. 36 | 37 | Ampersands: turn all & to &amp; for the following files: 38 | 39 | wsj15c.qa 40 | wsj20b.qa 41 | ...and many others... 
42 | 43 | The following were missing the opening root tag (at start) and the closing root tag at the end; added 44 | them back in (wsj and .qa are omitted for convenience): 45 | 46 | 15c 47 | 20b 48 | 20c 49 | 17a 50 | 20a 51 | 03b 52 | 13c 53 | 16c 54 | 17d 55 | 03d 56 | 14a 57 | 16d 58 | 14b 59 | 13b 60 | 04b 61 | 15b 62 | 06b 63 | 13a 64 | 12d 65 | 18b 66 | 17b 67 | 18c 68 | 17c 69 | 18a 70 | 13d 71 | 16a 72 | 19c 73 | 20d 74 | 03c 75 | 04d 76 | 14d 77 | 14c 78 | 04c 79 | 03a 80 | 16b 81 | 15a 82 | 19d 83 | 04a 84 | 15d 85 | 18d 86 | 87 | -------------------------------------------------------------------------------- /data/BBN/clean.sh: -------------------------------------------------------------------------------- 1 | # Fix label mismatch: NUMEX that should be TIMEX: 2 | sed -i '194s/NUMEX/TIMEX/' original_BBN_dataset/data/WSJtypes-subtypes/wsj00c.qa 3 | 4 | # Fix ampersand issue: & -> &amp; 5 | sed -i 's/&/TTEMP/g' original_BBN_dataset/data/WSJtypes-subtypes/* 6 | sed -i 's/&/&/g' original_BBN_dataset/data/WSJtypes-subtypes/* 7 | sed -i 's/TTEMP/\&/g' original_BBN_dataset/data/WSJtypes-subtypes/* 8 | 9 | # Fix broken or missing opening and closing root tags: 10 | python rootadd.py 11 | 12 | -------------------------------------------------------------------------------- /data/BBN/original_BBN_dataset/README.md: -------------------------------------------------------------------------------- 1 | Due to licensing restrictions, the BBN Pronoun Coreference and Entity Type 2 | Corpus cannot be distributed. It can be obtained from the LDC. 3 | 4 | However, we included a script that will convert the dataset into CONLL 2003 5 | format. 6 | 7 | 1. Put the contents of the BBN dataset (available from LDC, catalog 8 | number LDC2005T33), in this directory. 9 | 10 | 2. Create directory `data` within CONLL-format. 11 | 12 | 3. Some minor cleaning must be done (some XML errors). See BBN/README.rst 13 | for the details. 14 | 15 | 4. 
Then, in directory utils, in Python 2: 16 | 17 | from bbn2conll import * 18 | write_all_to_conll() 19 | 20 | This will write the BBN corpus into CONLL-formatted files in the directory 21 | CONLL-format/data/ 22 | 23 | -------------------------------------------------------------------------------- /data/BBN/rootadd.py: -------------------------------------------------------------------------------- 1 | import os 2 | DIR = 'original_BBN_dataset/data/WSJtypes-subtypes/' 3 | 4 | filenames = os.listdir(DIR) 5 | 6 | for filename in filenames: 7 | with open(DIR + filename, 'r') as f: 8 | d = f.read() 9 | 10 | if d[0:6] != '': 11 | d = '' + d 12 | 13 | if d[-10:] == '\r\n\r\n' 15 | 16 | if d[-10:] == '\r\n\r\n': 17 | d = d + '\r\n' 18 | 19 | with open(DIR + filename, 'w') as f: 20 | f.write(d) 21 | -------------------------------------------------------------------------------- /data/BTC/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = LOC 4 | ORG 5 | PER 6 | 7 | [IOB-format] 8 | 9 | IOB = IOB2 10 | 11 | [file-settings] 12 | 13 | sep = tab 14 | iob_pos = 1 15 | word_pos = 0 16 | pos_pos = none 17 | -------------------------------------------------------------------------------- /data/BTC/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/498078d512c3acdfd1bec5ca11bd74ecd517b38c/data/BTC/CONLL-format/docs/entity-list.txt -------------------------------------------------------------------------------- /data/BTC/LICENSE: -------------------------------------------------------------------------------- 1 | License: https://creativecommons.org/licenses/by/4.0/ 2 | 3 | Attribution 4.0 International (CC BY 4.0) 4 | 5 | This is a human-readable summary of (and not a substitute for) the license. Disclaimer. 
6 | You are free to: 7 | Share — copy and redistribute the material in any medium or format 8 | Adapt — remix, transform, and build upon the material 9 | for any purpose, even commercially. 10 | 11 | This license is acceptable for Free Cultural Works. 12 | The licensor cannot revoke these freedoms as long as you follow the license terms. 13 | Under the following terms: 14 | Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. 15 | 16 | No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits. 17 | 18 | Notices: 19 | You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation. 20 | No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material. 21 | 22 | See https://creativecommons.org/licenses/by/4.0/ 23 | -------------------------------------------------------------------------------- /data/BTC/NOTE.rst: -------------------------------------------------------------------------------- 1 | The CONLL-formatted files from the BTC corpus were downloaded from 2 | https://github.com/GateNLP/broad_twitter_corpus in August, 2018. 3 | 4 | For more information, see the README.md, and the following paper: 5 | 6 | Broad Twitter Corpus: A Diverse Named Entity Recognition Resource. 7 | Leon Derczynski, Kalina Bontcheva, and Ian Roberts. Proceedings of COLING, 8 | pages 1169-1179 2016. 
9 | 10 | -------------------------------------------------------------------------------- /data/BTC/README.md: -------------------------------------------------------------------------------- 1 | # Broad Twitter Corpus 2 | 3 | This is the Broad Twitter corpus, a dataset of tweets collected over stratified times, places and social uses. The goal is to represent a broad range of activities, giving a dataset more representative of the language used in this hardest of social media formats to process. Further, the BTC is annotated for named entities. The entities and the crowd annotations are all provided with the corpus, as well as (where possible) the raw twitter JSON. 4 | 5 | You can find the full description of the corpus at http://www.aclweb.org/anthology/C16-1111 6 | 7 | ## Use 8 | 9 | The BTC is released as CC-BY 4.0. If you use this data, you should cite the accompanying paper: 10 | 11 | Broad Twitter Corpus: A Diverse Named Entity Recognition Resource. Leon Derczynski, Kalina Bontcheva, and Ian Roberts. Proceedings of COLING, pages 1169-1179 2016. 12 | 13 | The paper's full open access, and can be found easily; here's one link: http://www.aclweb.org/anthology/C16-1111 14 | 15 | ## Sections 16 | 17 | Section|Region|Collection period|Description|Annotators|Tweet count 18 | ---|---|---|---|---|--- 19 | A | UK| 2012.01| General collection |Expert| 1000 20 | B |UK |2012.01-02 |Non-directed tweets |Expert |2000 21 | E |Global| 2014.07| Related to MH17 disaster| Crowd & expert |200 22 | F |Stratified |2009-2014| Twitterati |Crowd & expert |2000 23 | G |Stratified| 2011-2014| Mainstream news| Crowd & expert| 2351 24 | H |Non-UK| 2014 |General collection |Crowd & expert |2000 25 | 26 | ## Format 27 | 28 | The data is provided in up to three formats: CoNLL, JSON, and GATE XML. JSON is the richest of these. For the JSON, we generally provide the raw tweet JSON from twitter, augmented with fields describing token offsets, token texts, and token labels. 
In some cases (sections A and B), some tweets have been deleted, and so only the rehydrateable tweets are provided. In these cases, the GATE XML annotations are canonical. The ".no-labels.json" files are the raw result from the Twitter API with no additional information. Not all sections are available in all formats with annotations, but this will improve as we re-export the corpus into new formats and update data. At a minimum, CONLL format data is given for all sections. 29 | 30 | Plain annotation data from CrowdFlower is also provided, in the .annotations/ directories. This is to assist future users working on crowdsourcing, e.g. on predicting the utility of a worker judgment. 31 | 32 | As well as CONLL-format annotations, for each document in each section, there's a .json file. This file has one record per line, each record corresponding to a document. At a minimum, every record will have a "tokens" and a "labels" list, which are the tokenized text and matching BIO labels for person, organization and location entities. There may be other data in the case that the whole tweet from the Twitter API was available at distribution time. Additionally, there may be an "annotation offsets" parameter that aligns the token annotations to the plain text in the tweet (often in the top-level "text" object, if the tweet's there). 33 | -------------------------------------------------------------------------------- /data/CADEC/CONLL-format/data_orig/README: -------------------------------------------------------------------------------- 1 | This directory is for the CONLL-formatted data files. 2 | 3 | Once the data files are here, remove this file. 
4 | -------------------------------------------------------------------------------- /data/CADEC/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = ADR 4 | Disease 5 | Drug 6 | Finding 7 | Symptom 8 | 9 | [IOB-format] 10 | 11 | IOB = IOB2 12 | 13 | [file-settings] 14 | 15 | sep = tab 16 | iob_pos = 1 17 | word_pos = 0 18 | pos_pos = none 19 | -------------------------------------------------------------------------------- /data/CADEC/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | ADR 4 | Disease 5 | Drug 6 | Finding 7 | Symptom 8 | 9 | [format] 10 | 11 | IOB2 12 | 13 | [notes] 14 | 15 | ADR: adverse drug reaction 16 | Finding: clinical finding 17 | 18 | There wasn't a standard test/train split so we created one using the Python 19 | script stratified_split.py. 20 | 21 | [content] 22 | 23 | [stats] 24 | 25 | tokens sentences documents 26 | -------------------------------------- 27 | train 106451 6610 - 28 | test 16129 1000 - 29 | -------------------------------------- 30 | all 122580 7610 1250 31 | 32 | 33 | Entities: 34 | train test all 35 | --------------------------------- 36 | ADR 4995 761 5756 37 | Disease 247 35 282 38 | Drug 1563 236 1799 39 | Finding 362 52 414 40 | Symptom 234 34 268 41 | 42 | -------------------------------------------------------------------------------- /data/CADEC/LICENSE: -------------------------------------------------------------------------------- 1 | Content license: CSIRO Data License. 
2 | -------------------------------------------------------------------------------- /data/CADEC/README.rst: -------------------------------------------------------------------------------- 1 | CADEC dataset 2 | ============= 3 | 4 | The CADEC v.2 dataset was accessed on November 2017 from: 5 | 6 | https://data.csiro.au/dap/ 7 | 8 | NOTE: the original corpus includes nested entities. 9 | 10 | For more information on the dataset, see Karimi et al., Cadec: A corpus of 11 | adverse drug event annotations. Journal of biomedical informatics, 55:73-81. 12 | 13 | Due to licensing restrictions, we cannot redistribute this dataset. Some notes 14 | on how we processed this dataset are given below. 15 | 16 | Files 17 | ----- 18 | 19 | For NER, the important data directories are cadec/original and 20 | cadec/text: 21 | 22 | - cadec/text: contains the original raw text 23 | - cadec/original: annotations of the entities in row/column format 24 | (tab-separated), with some comments starting with # interspersed. The columns 25 | are, in order: 26 | - T1, T2 etc (we ignore this) 27 | - entity type (ADR, Drug, Disease, Symptom, Finding) 28 | - first character position 29 | - 1+last character position 30 | - the entity text 31 | 32 | Processing to CONLL 33 | ------------------- 34 | 35 | First we fixed the following mistake: 36 | - In file DICLOFENAC-SODIUM.7.ann: 37 | On line 8, 421 was changed to 411. 38 | 39 | The data is in brat standoff format. To convert it to CONLL format, we used 40 | the standoff2conll script from: 41 | 42 | https://github.com/spyysalo/standoff2conll 43 | 44 | This converts from brat standoff annotation format to CONLL format. 45 | The corresponding .ann and .txt files must be in the same directory with 46 | nothing else in that directory. 47 | 48 | Note: In the event of overlapping entities, standoff2conll chooses the 49 | entities with maximum span. 
50 | 51 | Cleaning 52 | -------- 53 | 54 | - We removed the following files (the original files 55 | were empty and there is nothing there.) 56 | 57 | - VOLTAREN-XR.9.conll 58 | - LIPITOR.40.conll 59 | 60 | Remarks 61 | ------- 62 | 63 | - The standoff2conll.py resolves the following: 64 | - Discontinuous spans are replaced by full spans if possible 65 | (sometimes this is not possible as there is no full span 66 | replacement, and the tags are ignored). 67 | - When spans are nested, the largest span is chosen. 68 | 69 | Test/Train split 70 | ---------------- 71 | 72 | There wasn't a standard test/train split so we created one using the Python 73 | script stratified_split.py, via: 74 | 75 | TRAIN, TEST = write_new_split('CADEC', 1000, filedir, 'cadec', max_count = 2) 76 | -------------------------------------------------------------------------------- /data/GUM/CONLL-format/data-by-corpustype/news/GUM_news_stampede.tsv: -------------------------------------------------------------------------------- 1 | 2 | Hundreds person 3 | dead O 4 | in O 5 | Hajj event 6 | stampede event 7 | 8 | Thursday time 9 | , O 10 | January time 11 | 12 time 12 | , time 13 | 2006 time 14 | 15 | The place 16 | Plains place 17 | of place 18 | Arafat place 19 | on O 20 | the time 21 | day time 22 | of time 23 | Hajj time 24 | 25 | According O 26 | to O 27 | the organization 28 | Saudi organization 29 | Interior organization 30 | Ministry organization 31 | over person 32 | 345 person 33 | Muslim person 34 | pilgrims person 35 | have O 36 | been O 37 | killed O 38 | in O 39 | a event 40 | stampede event 41 | during O 42 | the event 43 | annual event 44 | Hajj event 45 | pilgrimage event 46 | near O 47 | Mecca place 48 | , O 49 | Saudi place 50 | Arabia place 51 | . 
O 52 | 53 | The event 54 | stampede event 55 | at event 56 | Islam event 57 | 's event 58 | most event 59 | holy event 60 | site event 61 | happened O 62 | at O 63 | Jamarat place 64 | Bridge place 65 | , O 66 | during O 67 | an event 68 | event event 69 | where O 70 | pebbles object 71 | are O 72 | thrown O 73 | at O 74 | a object 75 | pillar object 76 | to O 77 | represent O 78 | the event 79 | stoning event 80 | of event 81 | Satan event 82 | as O 83 | part O 84 | of O 85 | the event 86 | final event 87 | rites event 88 | of event 89 | the event 90 | Hajj event 91 | . O 92 | 93 | The event 94 | stampede event 95 | began O 96 | when O 97 | luggage object 98 | from object 99 | a object 100 | bus object 101 | tripped O 102 | pilgrims person 103 | at person 104 | the person 105 | eastern person 106 | end person 107 | of person 108 | the person 109 | wall person 110 | , O 111 | causing O 112 | a object 113 | bottleneck object 114 | . O 115 | 116 | Those O 117 | who O 118 | were O 119 | tripped O 120 | were O 121 | then O 122 | crushed O 123 | by O 124 | the person 125 | wave person 126 | of person 127 | people person 128 | behind person 129 | them person 130 | . O 131 | 132 | Muslims person 133 | are O 134 | required O 135 | to O 136 | make O 137 | a O 138 | pilgrimage event 139 | to event 140 | Mecca event 141 | during O 142 | their abstract 143 | lifetimes abstract 144 | , O 145 | if O 146 | physically O 147 | or O 148 | financially O 149 | possible O 150 | , O 151 | as O 152 | it event 153 | is O 154 | one abstract 155 | of abstract 156 | the abstract 157 | five abstract 158 | pillars abstract 159 | of abstract 160 | the abstract 161 | Islamic abstract 162 | faith abstract 163 | . 
O 164 | 165 | Saudi person 166 | officials person 167 | had O 168 | already O 169 | provided O 170 | safeguards object 171 | by O 172 | installing O 173 | an object 174 | oval object 175 | wall object 176 | with object 177 | padded object 178 | edges object 179 | to O 180 | protect O 181 | pilgrams person 182 | from O 183 | a event 184 | crush event 185 | , O 186 | installed O 187 | security object 188 | cameras object 189 | and O 190 | placed O 191 | over person 192 | 60,000 person 193 | security person 194 | personnel person 195 | in O 196 | the place 197 | area place 198 | . O 199 | 200 | This event 201 | is O 202 | not O 203 | the O 204 | first O 205 | time O 206 | deadly event 207 | stampedes event 208 | have O 209 | taken O 210 | place O 211 | during O 212 | the event 213 | Hajj event 214 | , O 215 | with O 216 | the event 217 | deadliest event 218 | stampede event 219 | during O 220 | the event 221 | 1991 event 222 | Hajj event 223 | , O 224 | in O 225 | which O 226 | 1,426 person 227 | pilgrims person 228 | were O 229 | trampled O 230 | . O 231 | -------------------------------------------------------------------------------- /data/GUM/CONLL-format/data-by-corpustype/news/GUM_news_worship.tsv: -------------------------------------------------------------------------------- 1 | 2 | Greek event 3 | court event 4 | rules event 5 | worship event 6 | of event 7 | ancient event 8 | Greek event 9 | deities event 10 | is event 11 | legal event 12 | 13 | Monday time 14 | , time 15 | March time 16 | 27 time 17 | , time 18 | 2006 time 19 | 20 | Greek event 21 | court event 22 | has event 23 | ruled event 24 | that event 25 | worshippers event 26 | of event 27 | the event 28 | ancient event 29 | Greek event 30 | religion event 31 | may event 32 | now event 33 | formally event 34 | associate event 35 | and event 36 | worship event 37 | at event 38 | archeological event 39 | sites event 40 | . 
O 41 | 42 | Prior O 43 | to O 44 | the event 45 | ruling event 46 | , O 47 | the event 48 | religion event 49 | was event 50 | banned event 51 | from event 52 | conducting event 53 | public event 54 | worship event 55 | at event 56 | archeological event 57 | sites event 58 | by O 59 | the organization 60 | Greek organization 61 | Ministry organization 62 | of organization 63 | Culture organization 64 | . O 65 | 66 | Due O 67 | to O 68 | that event 69 | , O 70 | the organization 71 | religion organization 72 | was O 73 | relatively O 74 | secretive O 75 | . O 76 | 77 | The organization 78 | Greek organization 79 | Orthodox organization 80 | Church organization 81 | , O 82 | a organization 83 | Christian organization 84 | denomination organization 85 | , O 86 | is O 87 | extremely O 88 | critical O 89 | of O 90 | worshippers person 91 | of person 92 | the person 93 | ancient person 94 | deities person 95 | . O 96 | 97 | Today time 98 | , O 99 | about person 100 | 100,000 person 101 | Greeks person 102 | worship O 103 | the abstract 104 | ancient abstract 105 | gods abstract 106 | , O 107 | such O 108 | as O 109 | Zeus person 110 | , O 111 | Hera person 112 | , O 113 | Poseidon person 114 | , O 115 | Aphrodite person 116 | , O 117 | and O 118 | Athena person 119 | . O 120 | 121 | The organization 122 | Greek organization 123 | Orthodox organization 124 | Church organization 125 | estimates O 126 | that quantity 127 | number quantity 128 | is O 129 | closer O 130 | to O 131 | 40,000 O 132 | . 
O 133 | -------------------------------------------------------------------------------- /data/GUM/CONLL-format/docs-train-test/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = person 4 | place 5 | organization 6 | quantity 7 | time 8 | event 9 | abstract 10 | substance 11 | object 12 | animal 13 | plant 14 | 15 | [IOB-format] 16 | 17 | IOB = IOB2 18 | 19 | [file-settings] 20 | 21 | sep = tab 22 | iob_pos = 1 23 | word_pos = 0 24 | pos_pos = none 25 | -------------------------------------------------------------------------------- /data/GUM/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = person 4 | place 5 | organization 6 | quantity 7 | time 8 | event 9 | abstract 10 | substance 11 | object 12 | animal 13 | plant 14 | 15 | [IOB-format] 16 | 17 | IOB = none 18 | 19 | [file-settings] 20 | 21 | sep = tab 22 | iob_pos = 1 23 | word_pos = 0 24 | pos_pos = none 25 | -------------------------------------------------------------------------------- /data/GUM/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | person 4 | place 5 | organization 6 | quantity 7 | time 8 | event 9 | abstract 10 | substance 11 | object 12 | animal 13 | plant 14 | 15 | [format] 16 | 17 | none 18 | 19 | [notes] 20 | 21 | - There wasn't a standard test/train split so we created one using the Python 22 | script stratified_split.py. 23 | - There was one mention of a "newspaper" entity; this was removed. 24 | - The corpus was hand-annotated by students. 
25 | - The entity types were inspired by Ontonotes but diverge a bit; the entities 26 | which differ are related according to the following: 27 | 28 | GUM ONTONOTES 29 | ------------------------- 30 | place GPE, LOC, FAC 31 | organization NORP, ORG 32 | object PRODUCT, WORK_OF_ART, and any other concrete objects 33 | time DAT, TIM 34 | abstract LANGUAGE, LAW, and all other abstractions 35 | quantity PCT, MON, and all other quantities 36 | 37 | ORDINAL and CARDINAL were not annotated at all. 38 | 39 | - For more information, see p. 595 of Zeldes, Amir (2017) "The GUM Corpus: 40 | Creating Multilayer Resources in the Classroom". Language Resources and 41 | Evaluation 51(3), 581–612. 42 | 43 | [content] 44 | 45 | whow: How-tos (wikihow) 46 | voyage: Travel guides (wikivoyage) 47 | news: news (wikinews) 48 | interview: interviews from wikinews 49 | 50 | See: https://corpling.uis.georgetown.edu/gum/ 51 | 52 | [stats] 53 | 54 | 55 | tokens sentences 56 | -------------------------- 57 | train 44111 2495 58 | test 18236 1000 59 | -------------------------- 60 | all 62347 3495 61 | -------------------------- 62 | 63 | 64 | tokens sentences documents 65 | ------------------------------------------ 66 | whow: 16562 1076 19 67 | voyage: 14480 755 17 68 | news: 13610 621 21 69 | interview: 17695 1043 19 70 | ------------------------------------------ 71 | total: 62347 3495 76 72 | 73 | Entities 74 | train test all 75 | ----------------------------------- 76 | 77 | person 1920 823 2743 78 | place 1150 469 1619 79 | organization 397 192 589 80 | quantity 97 44 141 81 | time 401 179 580 82 | event 738 315 1053 83 | abstract 2002 798 2800 84 | substance 278 95 373 85 | object 1017 420 1437 86 | animal 141 42 183 87 | plant 144 62 206 88 | 89 | 90 | whow voyage news interview all 91 | -------------------------------------------------------- 92 | person 628 321 713 1081 2743 93 | place 150 1040 249 180 1619 94 | organization 22 76 247 244 589 95 | quantity 45 48 29 19 141 96 | time
# NOTE: the comment lines directly below are repository-dump text that shares
# these source lines with the script (the end of the GUM entity-list.txt stats
# table and the file separator). They are kept verbatim so no content is lost.
# 72 215 178 115 580 97 | event 289 165 272 327 1053 98 | abstract 838 445 478 1039 2800 99 | substance 235 13 40 85 373 100 | object 576 289 285 287 1437 101 | animal 94 16 0 73 183 102 | plant 169 21 14 2 206 103 | 104 | 105 |
# --------------------------------------------------------------------------------
# /data/GUM/CONLL-format/utils/webAnnotsv_to_conll.py:
# --------------------------------------------------------------------------------
"""
This script can be used to convert Webanno .tsv files to CONLL.
Written for Python 2; every print call takes a single argument, so the
module also runs unchanged on Python 3.

"""

import os

filename = 'GUM_whow_skittles.tsv'

# column indices that we need to know
word_ind = 2
iob_ind = 3

CONLLDIR_BASE = '../data_CONLL-format/'
rootDir_BASE = '../data_orig/gum/coref/tsv/'


def recursively():
    """ Convert every file found under rootDir_BASE to CONLL format.

    The directory tree below rootDir_BASE is mirrored under CONLLDIR_BASE
    (missing directories are created) and each file is converted with
    file_to_conllfile and written under the same file name.  No file-name
    filter is applied, so every file found is treated as a tsv file.

    """
    CONLLDIR = CONLLDIR_BASE
    rootDir = rootDir_BASE

    # CONLLDIR is the directory where the new files should be written.
    for dirName, subdirList, fileList in os.walk(rootDir):
        # Same relative path as the source directory, rooted at CONLLDIR.
        newdir = CONLLDIR + dirName[len(rootDir):]
        if not os.path.exists(newdir):
            os.makedirs(newdir)
            print('Made directory:  ' + newdir)
        for fname in fileList:
            writefile = os.path.join(newdir, fname)
            readfile = os.path.join(dirName, fname)

            # Parse the file and write to writefile
            file_to_conllfile(readfile, writefile)


def file_to_conllfile(filename, writefile):
    """ Read in a tsv file, and write it to a CONLL-style file.

    Every sentence is written as an empty line followed by one
    "word<TAB>tag" line per token.

    NOTE(review): writefile is opened in append mode, so converting the
    same input twice duplicates its sentences in the output.  Remove old
    output before re-running, or confirm whether truncating was intended.

    """
    sentences = file_to_sent_list(filename)
    new_sentences = fix_sentences(sentences)
    with open(writefile, 'a+') as fd:
        for sent in new_sentences:
            fd.write('\n')
            for tok in sent:
                fd.write(tok[0] + '\t' + tok[1] + '\n')
    print('Done with file ' + filename)


def file_to_sent_list(filename):
    """ Return the sentences of a tsv file as a list of lists.

    Each sublist holds one (word, tag) tuple per token, where word and tag
    are the whitespace-separated columns word_ind and iob_ind of the token
    line.  The tag is returned raw (e.g. 'place[3]' or '_'); use
    fix_sentences to normalise it.

    Lines starting with '#' are ignored.  A blank or whitespace-only line
    ends the current sentence, and a sentence that is still open at the end
    of the file is returned as well.

    """
    with open(filename, 'r') as fd:
        lines = fd.readlines()

    sentences = []
    sent = []
    for line in lines:
        # The document is split into pieces, so '#' lines also occur in
        # between the token lines; skip them.
        if line.startswith('#'):
            continue
        if not line.strip():
            # Sentence boundary (an exact '\n' test would crash below on
            # separator lines that contain stray whitespace).
            sentences.append(sent)
            sent = []
            continue
        fields = line.split()
        sent.append((fields[word_ind], fields[iob_ind]))

    # Do not lose the last sentence when the file lacks a trailing blank line.
    if sent:
        sentences.append(sent)
    return sentences


def fix_iob(iob):
    """ Normalise one raw tag from the tsv file.

    - 'type[N]' becomes 'type'; for 'a[1]|b[2]' only the part before the
      first '|' is kept.
    - '_' becomes 'O'.
    - Anything else is returned unchanged.

    Raises ValueError if a bracketed tag does not carry a purely numeric id.

    """
    if iob.endswith(']') and iob.find('[') >= 1:
        # might have |
        if '|' in iob:
            iob = iob[:iob.find('|')]

        bracket = iob.find('[')
        if not iob[bracket + 1:-1].isdigit():
            raise ValueError(
                "unexpected entity tag (no numeric [id] suffix): %r" % (iob,))
        iob = iob[:bracket]

    if iob == '_':
        iob = 'O'

    return iob


def fix_sentences(sentences):
    """ Apply fix_iob to every token tag and drop empty sentences. """
    fixed = [[(w, fix_iob(iob)) for (w, iob) in s] for s in sentences]
    return [s for s in fixed if s]  # eliminate any extra empty sentences

############################################################################

# --------------------------------------------------------------------------------
# /data/GUM/README.rst:
-------------------------------------------------------------------------------- 1 | The GUM Corpus was downloaded on November 2017 from 2 | 3 | https://github.com/amir-zeldes/gum 4 | 5 | In particular, we use the data from: 6 | 7 | https://github.com/amir-zeldes/gum/tree/master/coref/tsv 8 | 9 | This was converted to CONLL format. 10 | 11 | Attribution 12 | ----------- 13 | 14 | The GUM corpus was collected and annotated at Georgetown University. For more 15 | information, see the LICENSE, https://corpling.uis.georgetown.edu/gum, 16 | and the following publication: 17 | 18 | Zeldes, Amir (2016) "The GUM Corpus: Creating Multilayer Resources in the 19 | Classroom". Language Resources and Evaluation. 20 | 21 | Notes 22 | ----- 23 | - The NER data is in the coref subdirectory. 24 | 25 | Converting to CONLL 26 | ------------------- 27 | 28 | As the GUM datasets have CC licenses (see the LICENSE file) that permit 29 | sharing and modification, the CONLL-formatted versions are contained in 30 | CONLL-format/data (with a test/train split) and in 31 | CONLL-format/data-by-corpustype (split into categories whow, voyage, news, 32 | interview). 33 | 34 | Here are some notes on how we processed the data. 35 | 36 | 1.) To convert to 'CONLL' format (not worrying about IOB2 here) 37 | we used the webAnnotsv_to_conll.py in the utils directory: 38 | 39 | import webAnnotsv_to_conll 40 | webAnnotsv_to_conll.recursively() 41 | 42 | 2.) Moved the files into the corresponding directories (news, voyage, 43 | interview, whow). 44 | 45 | 3.) Data cleaning: 46 | 47 | - In file GUM_interview_licen.tsv, replaced the two 48 | occurrences of * with O. 49 | 50 | - There was only one entity ("The Independent") tagged as "newspaper" 51 | entity; so we replaced these with "organization". 
52 | 53 | Test/Train split 54 | ---------------- 55 | 56 | There wasn't a standard test/train split so we created one using the Python 57 | script stratified_split.py, via: 58 | 59 | TRAIN, TEST = write_new_split('GUM', 1000, filedir, 'gum', max_count = 2) 60 | -------------------------------------------------------------------------------- /data/MITMovieCorpus/CONLL-format/trivia10k13/data/test/README: -------------------------------------------------------------------------------- 1 | Download the data from https://groups.csail.mit.edu/sls/downloads/movie/ 2 | 3 | Place file trivia10k13test.bio in this directory, and then remove this file. 4 | 5 | 6 | -------------------------------------------------------------------------------- /data/MITMovieCorpus/CONLL-format/trivia10k13/data/train/README: -------------------------------------------------------------------------------- 1 | Download the data from https://groups.csail.mit.edu/sls/downloads/movie/ 2 | 3 | Place file trivia10k13train.bio in this directory, and then remove this file. 
4 | 5 | 6 | -------------------------------------------------------------------------------- /data/MITMovieCorpus/CONLL-format/trivia10k13/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = Actor 4 | Character_Name 5 | Director 6 | Genre 7 | Plot 8 | Year 9 | Soundtrack 10 | Opinion 11 | Award 12 | Origin 13 | Quote 14 | Relationship 15 | 16 | [IOB-format] 17 | 18 | IOB = IOB2 19 | 20 | [file-settings] 21 | 22 | sep = tab 23 | iob_pos = 0 24 | word_pos = 1 25 | pos_pos = none 26 | -------------------------------------------------------------------------------- /data/MITMovieCorpus/CONLL-format/trivia10k13/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | Actor 4 | Character_Name 5 | Director 6 | Genre 7 | Plot 8 | Year 9 | Soundtrack 10 | Opinion 11 | Award 12 | Origin 13 | Quote 14 | Relationship 15 | 16 | [format] 17 | 18 | IOB2 19 | 20 | [notes] 21 | 22 | [content] 23 | 24 | [stats] 25 | 26 | tokens sentences 27 | -------------------------- 28 | train 158823 7816 29 | test 39035 1953 30 | -------------------------- 31 | all 197858 9769 32 | 33 | 34 | Entities 35 | train test all 36 | ------------------------------------- 37 | Actor 5010 1274 6284 38 | Character_Name 1025 283 1308 39 | Director 1787 425 2212 40 | Genre 3384 789 4173 41 | Plot 6468 1577 8045 42 | Year 2702 661 3363 43 | Soundtrack 50 8 58 44 | Opinion 810 195 1005 45 | Award 309 66 375 46 | Origin 779 190 969 47 | Quote 126 47 173 48 | Relationship 580 171 751 49 | 50 | 51 | -------------------------------------------------------------------------------- /data/MITMovieCorpus/README.rst: -------------------------------------------------------------------------------- 1 | Downloaded from 2 | --------------- 3 | 4 | https://groups.csail.mit.edu/sls/downloads/movie/ 5 | 6 | More information: 7 | https://groups.csail.mit.edu/sls/downloads/ 8 
| 9 | According to their website, the 'eng' files are simple queries, while the 10 | trivia10k13 files are made up of more complex queries. 11 | 12 | We used the trivia10k13 portion. 13 | 14 | Remarks 15 | ------- 16 | 17 | I'm not sure why they use a different tag set for 18 | eng and for trivia10k13. Some of the entities seem to 19 | overlap: 20 | 21 | eng trivia10k13 22 | ---------------------- 23 | ACTOR Actor 24 | CHARACTER Character_Name 25 | DIRECTOR Director 26 | GENRE Genre 27 | PLOT Plot 28 | YEAR Year 29 | SONG Soundtrack 30 | -------------------------- 31 | REVIEW Opinion (?) 32 | 33 | Only in eng: 34 | - RATINGS_AVERAGE 35 | - RATING 36 | - TITLE 37 | - TRAILER 38 | 39 | Only in trivia10k13: 40 | -Award 41 | -Origin 42 | -Quote 43 | -Relationship 44 | 45 | Note -- RATING and Opinion are somewhat different 46 | -- Opinion and REVIEW seem similar, but check again (?) 47 | -------------------------------------------------------------------------------- /data/MITRestaurantCorpus/CONLL-format/data/test/README: -------------------------------------------------------------------------------- 1 | Download the data from: 2 | 3 | https://groups.csail.mit.edu/sls/downloads/restaurant/ 4 | 5 | and put file restauranttest.bio in this directory. 6 | 7 | Then delete this file. 8 | -------------------------------------------------------------------------------- /data/MITRestaurantCorpus/CONLL-format/data/train/README: -------------------------------------------------------------------------------- 1 | Download the data from: 2 | 3 | https://groups.csail.mit.edu/sls/downloads/restaurant/ 4 | 5 | and put file restauranttrain.bio in this directory. 6 | 7 | Then delete this file.
8 | -------------------------------------------------------------------------------- /data/MITRestaurantCorpus/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = Amenity 4 | Cuisine 5 | Dish 6 | Hours 7 | Location 8 | Price 9 | Rating 10 | Restaurant_Name 11 | 12 | [IOB-format] 13 | 14 | IOB = IOB2 15 | 16 | [file-settings] 17 | 18 | sep = tab 19 | iob_pos = 0 20 | word_pos = 1 21 | pos_pos = none 22 | -------------------------------------------------------------------------------- /data/MITRestaurantCorpus/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | Amenity 4 | Cuisine 5 | Dish 6 | Hours 7 | Location 8 | Price 9 | Rating 10 | Restaurant_Name 11 | 12 | [format] 13 | 14 | IOB2 15 | 16 | [notes] 17 | 18 | [content] 19 | 20 | [stats] 21 | 22 | tokens sentences 23 | ------------------------- 24 | train 70525 7660 25 | test 14256 1521 26 | all 84781 9181 27 | 28 | Entities: 29 | train test all 30 | ------------------------------------ 31 | Amenity 2541 533 3074 32 | Cuisine 2839 532 3371 33 | Dish 1475 288 1763 34 | Hours 990 212 1202 35 | Location 3817 812 4629 36 | Price 730 171 901 37 | Rating 1070 201 1271 38 | Restaurant_Name 1901 402 2303 39 | 40 | -------------------------------------------------------------------------------- /data/MITRestaurantCorpus/README.rst: -------------------------------------------------------------------------------- 1 | MIT Restaurant dataset 2 | ====================== 3 | 4 | This dataset was downloaded on January 2018 from 5 | 6 | https://groups.csail.mit.edu/sls/downloads/restaurant/ 7 | 8 | See also: 9 | https://groups.csail.mit.edu/sls/downloads/ 10 | -------------------------------------------------------------------------------- /data/MUC-6/CONLL-format/data/test/README: 
-------------------------------------------------------------------------------- 1 | Put the CONLL-formatted test data here; then remove this file. 2 | -------------------------------------------------------------------------------- /data/MUC-6/CONLL-format/data/train/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted train data here; then remove this file. 2 | -------------------------------------------------------------------------------- /data/MUC-6/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | DAT 9 | 10 | [IOB-format] 11 | 12 | IOB = IOB2 13 | 14 | [file-settings] 15 | 16 | sep = tab 17 | iob_pos = 1 18 | word_pos = 0 19 | pos_pos = none 20 | -------------------------------------------------------------------------------- /data/MUC-6/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | DAT 9 | 10 | [format] 11 | 12 | IOB2 13 | 14 | [content] 15 | 16 | [stats] 17 | 18 | 151840 tokens; 6576 sentences 19 | 20 | ---------- 21 | PER: 1879 22 | LOC: 2241 23 | ORG: 3484 24 | PCT: 747 25 | MON: 1329 26 | DAT: 1173 27 | 28 | [notes] 29 | 30 | I have only converted the training data; it seems the test and dev data are in 31 | files formal-tst.NE.key.04oct95 and dryrun-test.NE.key.02may95, respectively. 32 | These files are quite small (around 1000 and 2000 lines, respectively), so converting them is probably not worth it.
33 | 34 | -------------------------------------------------------------------------------- /data/MalwareTextDB/CONLL-format/data/README: -------------------------------------------------------------------------------- 1 | The data can be downloaded from 2 | 3 | http://www.statnlp.org/paper/malwaretextdb-a-database-for-annotated-malware-articles.html 4 | 5 | Place the files from the ann+brown directory here, and then remove this file. 6 | 7 | Besides entity tags, POS tags and Brown cluster tags are also included (in the 8 | second and third columns, respectively). 9 | 10 | 11 | 12 | For more information, see: 13 | 14 | Lim, Swee Kiat, et al. "MalwareTextDB: A Database for Annotated Malware Articles." Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vol. 1. 2017 15 | 16 | -------------------------------------------------------------------------------- /data/MalwareTextDB/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = Entity 4 | Preposition 5 | Verb 6 | 7 | [IOB-format] 8 | 9 | IOB = IOB2 10 | 11 | [file-settings] 12 | 13 | sep = ' ' 14 | iob_pos = 3 15 | word_pos = 0 16 | pos_pos = 1 17 | -------------------------------------------------------------------------------- /data/MalwareTextDB/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | Entity 4 | Preposition 5 | Verb 6 | 7 | [format] 8 | 9 | IOB2 10 | 11 | [notes] 12 | 13 | [content] 14 | 15 | [stats] 16 | 17 | 168138 tokens; 6966 sentences; 39 documents 18 | 19 | Entity stats: 20 | --------------------- 21 | Entity 6094 22 | Preposition 1786 23 | Verb 2838 24 | -------------------------------------------------------------------------------- /data/MalwareTextDB/README.rst: -------------------------------------------------------------------------------- 1 | The 
MalwareTextDB Dataset 2 | ========================= 3 | 4 | This dataset consists of texts about malware. It was developed by researchers 5 | at the Singapore University of Technology and Design and 6 | DSO National Laboratories, and can be downloaded from: 7 | 8 | http://www.statnlp.org/paper/malwaretextdb-a-database-for-annotated-malware-articles.html 9 | 10 | The files in directory data/ann+brown contain the following annotations: 11 | 12 | - Entity tags 13 | - POS tags (obtained from Stanford's POSTagger) 14 | - the Brown cluster 15 | 16 | Remark 17 | ------ 18 | 19 | This corpus is interesting in that entities are labeled (in IOB2 format) with 20 | "Entity" (no entity types are indicated). There are also labels for Verb and 21 | Preposition. 22 | 23 | NOTE: looking at entity "Regin", in Regis_The_Intercept.txtcbn, it seems that 24 | the annotation is inconsistent. 25 | 26 | Cite as 27 | ------- 28 | 29 | If using this dataset, please cite as: 30 | 31 | Lim, Swee Kiat, et al. "MalwareTextDB: A Database for Annotated Malware Articles." Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vol. 1. 2017 32 | 33 | -------------------------------------------------------------------------------- /data/NIST_IEER/CONLL-format/data/README: -------------------------------------------------------------------------------- 1 | This directory is for the dataset in CONLL format. 
2 | -------------------------------------------------------------------------------- /data/NIST_IEER/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | TIM 9 | DAT 10 | DUR 11 | CAR 12 | MEA 13 | 14 | [IOB-format] 15 | 16 | IOB = IOB2 17 | 18 | [file-settings] 19 | 20 | sep = tab 21 | iob_pos = 1 22 | word_pos = 0 23 | pos_pos = none 24 | -------------------------------------------------------------------------------- /data/NIST_IEER/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | TIM 9 | DAT 10 | DUR 11 | CAR 12 | MEA 13 | 14 | [format] 15 | 16 | IOB2 17 | 18 | [notes] 19 | 20 | CAR=CARDINALITY 21 | DUR=DURATION 22 | MEA=MEASURE 23 | 24 | [content] 25 | 26 | NYT and APW news articles from 1998 (March and April) 27 | 28 | [stats] 29 | 30 | 60362 tokens; 2695 sentences 31 | 32 | PER: 1504 33 | LOC: 878 34 | ORG: 939 35 | PCT: 76 36 | MON: 121 37 | TIM: 12 38 | DAT: 534 39 | DUR: 242 40 | CAR: 461 41 | MEA: 186 42 | 43 | -------------------------------------------------------------------------------- /data/NIST_IEER/CONLL-format/utils/quick_comma_fix.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script can fix comma tokenization errors. 3 | 4 | NOTE: only use this if have already eliminated the commas 5 | corresponding to entity names. 
filename = '../data/APW_19980314'

newfile = 'APW_19980314.conll'


def _split_columns(annotated_token):
    """Return the columns of one CONLL line.

    Columns are expected to be tab-separated.  When no tab is present the
    line is assumed to use spaces instead, and the last whitespace-delimited
    field is taken as the tag.

    Raises:
        ValueError: if the line cannot be split into a word and a tag.
    """
    columns = annotated_token.split('\t')
    if len(columns) == 1:
        # Assume this is always caused by having spaces instead of tabs.
        columns = annotated_token.rsplit(None, 1)
    if len(columns) < 2:
        raise ValueError(
            'cannot split %r into word and tag' % (annotated_token,))
    return columns


def clean(path=None):
    """Split commas that are glued to the end of a token into their own token.

    NOTE: only use this if have already eliminated the commas corresponding
    to entity names.

    The file is read as UTF-8; sentences are separated by a blank line and
    each token line holds ``word<TAB>tag``.  A token longer than one
    character that ends in ``,`` and does not contain ``.,`` is replaced by
    two tokens: the word without the comma, and the comma.  Blank lines are
    skipped, so an empty file yields two empty lists.

    Args:
        path: file to read; defaults to the module-level ``filename``.

    Returns:
        A ``(badlist, newsentences)`` tuple.  ``badlist`` holds the
        ``(word, tag)`` pairs that had a comma split off; ``newsentences``
        is a list of sentences, each a list of ``(word, tag)`` pairs.

    Raises:
        ValueError: if a non-blank line has no tag column.
    """
    if path is None:
        path = filename

    newsentences = []
    badlist = []

    with open(path, 'rb') as fd:
        file_content = fd.read().decode('utf-8').strip()

    for annotated_sentence in file_content.split('\n\n'):
        newsentence = []
        for annotated_token in annotated_sentence.split('\n'):
            if not annotated_token.strip():
                # Extra blank lines between sentences (or an empty file).
                continue
            columns = _split_columns(annotated_token)
            word, tag = columns[0], columns[1]

            # '.,' is excluded so that abbreviations such as "Inc.," are
            # left untouched.
            if len(word) > 1 and word.endswith(',') and '.,' not in word:
                newsentence.append((word[:-1], tag))
                # NOTE(review): the comma inherits the word's tag; this
                # presumably relies on entity commas having been removed
                # beforehand (see the NOTE above) -- confirm.
                newsentence.append((u',', tag))
                badlist.append((word[:-1], tag))
                print(columns)
            else:
                newsentence.append((word, tag))

        if newsentence:
            newsentences.append(newsentence)

    return badlist, newsentences


def savefile(newsentences, path=None):
    """Append sentences to a CONLL file as UTF-8 ``word<TAB>tag`` lines.

    Each sentence is preceded by an empty line.  The file is opened in
    append mode, so existing content is kept.

    Args:
        newsentences: list of sentences, each a list of ``(word, tag)``
            pairs, as returned by ``clean``.
        path: file to append to; defaults to the module-level ``newfile``.
    """
    import io

    if path is None:
        path = newfile

    # An explicit encoding keeps non-ASCII tokens from failing (Python 2)
    # or depending on the locale (Python 3).
    with io.open(path, 'a', encoding='utf-8') as nfd:
        for sent in newsentences:
            nfd.write(u'\n')
            for iobword in sent:
                nfd.write(iobword[0] + u'\t' + iobword[1] + u'\n')
NEWSWIRE development test data, originally in subdirectory 6 | /ie_er_99/english/devtest/newswire/*.ref.nwt 7 | 8 | 9 | The files can be found in the list of nltk corpora. The files are: 10 | 11 | 'APW_19980314': 'Associated Press Weekly, 14 March 1998', 12 | 'APW_19980424': 'Associated Press Weekly, 24 April 1998', 13 | 'APW_19980429': 'Associated Press Weekly, 29 April 1998', 14 | 'NYT_19980315': 'New York Times, 15 March 1998', 15 | 'NYT_19980403': 'New York Times, 3 April 1998', 16 | 'NYT_19980407': 'New York Times, 7 April 1998', 17 | 18 | These are located in $NLTK_DATA/corpora/ieer, where NLTK_DATA is the directory 19 | containing the nltk datasets. 20 | 21 | For more information on the dataset, see: 22 | 23 | http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm 24 | 25 | 26 | How the data was converted to CONLL format 27 | ------------------------------------------ 28 | 29 | In case one would like to replicate this: 30 | 31 | 1. There was an error (one nested ORG "Smithsonian", right after: 32 | 'the main book publishing arm' in file ieer/NYT_19980407 33 | This was fixed manually. 34 | Replace the original ieer/NYT_19980407 in the nltk corpora directory with 35 | this one. 36 | 37 | 2. Use the script makeconll.py in the utils directory. From Python 2: 38 | import makeconll 39 | makeconll.write_all_to_conll() 40 | 41 | This should write the 6 files to the data directory. Altogether there are 42 | about 50,000 tokens here. DO NOT RE-RUN (it will overwrite the files). 43 | 44 | 3. I fixed several errors caused by the nltk corpus reader: 45 | 46 | - & -> & 47 | - &UR; and &LR; removed (I checked for QR, QC, HT, MD, but they were not present) 48 | - Incorrect sentence segmentation 49 | - : and ; attached to words 50 | - Removed incorrectly tokenized commas [for entities] 51 | - periods at end of sentence, incorrectly tokenized 52 | - parentheses "(" and ")" incorrectly tokenized 53 | - !, ? 
incorrectly tokenized 54 | - `` and '' 55 | - Proper tokenization for 's 56 | - Also proper tokenization for n't, 'v, 'm, 'll, 'd, 'r etc 57 | 58 | 4. Used script quick_comma_fix.py to fix comma errors. 59 | 60 | Removal of entities 61 | ------------------- 62 | 63 | There were under 20 instances of TIME, so these were removed. 64 | 65 | Remarks 66 | ------- 67 | 68 | The annotation scheme seems very similar to MUC. In addition to the 7 MUC 69 | entity types, there are 3 new entity types here: 70 | 71 | DUR (DURATION), MEA (MEASURE), and CAR (CARDINALITY). 72 | 73 | Note: this corpus does not overlap in any way with MUC. This appeared after 74 | MUC 7 and uses news articles from a different year. 75 | 76 | 77 | -------------------------------------------------------------------------------- /data/Ritter/CONLL-format/data/dev/README: -------------------------------------------------------------------------------- 1 | Download the data from 2 | 3 | http://kimi.ml.cmu.edu/transfer/data.tar.gz 4 | 5 | and put file ner.test.dev in this directory. 6 | 7 | Then delete this file. 8 | -------------------------------------------------------------------------------- /data/Ritter/CONLL-format/data/test/README: -------------------------------------------------------------------------------- 1 | Download the data from 2 | 3 | http://kimi.ml.cmu.edu/transfer/data.tar.gz 4 | 5 | and put file ner.test.txt in this directory. 6 | 7 | Then delete this file. 8 | -------------------------------------------------------------------------------- /data/Ritter/CONLL-format/data/train/README: -------------------------------------------------------------------------------- 1 | Download the data from 2 | 3 | http://kimi.ml.cmu.edu/transfer/data.tar.gz 4 | 5 | and put file ner.train.txt in this directory. 6 | 7 | Then delete this file. 
8 | -------------------------------------------------------------------------------- /data/Ritter/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = person 4 | geo-loc 5 | facility 6 | company 7 | sportsteam 8 | musicartist 9 | product 10 | tvshow 11 | movie 12 | other 13 | 14 | [IOB-format] 15 | 16 | IOB = IOB2 17 | 18 | [file-settings] 19 | 20 | sep = multispace 21 | iob_pos = 1 22 | word_pos = 0 23 | pos_pos = none 24 | -------------------------------------------------------------------------------- /data/Ritter/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | person 4 | geo-loc 5 | facility 6 | company 7 | sportsteam 8 | musicartist 9 | product 10 | tvshow 11 | movie 12 | other 13 | 14 | [format] 15 | 16 | IOB2 17 | 18 | [notes] 19 | 20 | The Twitter Ritter corpus is the same as the training portion of WNUT16. 21 | The training/test/dev split 80%/10%/10%. We are using exactly 22 | the same data (and split) as the paper of Yang et al, "Transfer Learning for 23 | Sequence Tagging with Hierarchical Recurrent Networks", downloaded from 24 | http://kimi.ml.cmu.edu/transfer/data.tar.gz 25 | 26 | [content] 27 | 28 | Twitter text. 
29 | 30 | [stats] 31 | 32 | 33 | tokens sentences 34 | ------------------------- 35 | train 36936 1900 36 | test 4921 254 37 | dev 4612 240 38 | ------------------------- 39 | all 46469 2394 40 | 41 | 42 | Entities: 43 | train test dev all 44 | ----------------------------------------- 45 | person 352 42 55 449 46 | geo-loc 218 33 25 276 47 | facility 84 10 10 104 48 | company 149 14 8 171 49 | sportsteam 40 4 7 51 50 | musicartist 40 3 12 55 51 | product 77 5 15 97 52 | tvshow 24 5 5 34 53 | movie 28 4 2 34 54 | other 188 17 20 225 55 | -------------------------------------------------------------------------------- /data/Ritter/README.rst: -------------------------------------------------------------------------------- 1 | Ritter Twitter Dataset 2 | ====================== 3 | 4 | Ritter's Twitter corpus is the same as the training portion of WNUT16 (though 5 | with sentences in a different order). 6 | 7 | The training/test/dev split here (80%/10%/10%) was used in the paper of 8 | Yang et al, "Transfer Learning for Sequence Tagging with Hierarchical 9 | Recurrent Networks". 
10 | 11 | This data was downloaded in January 2018 from: 12 | 13 | http://kimi.ml.cmu.edu/transfer/data.tar.gz 14 | -------------------------------------------------------------------------------- /data/SEC-filings/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [IOB-format] 9 | 10 | IOB = IO 11 | 12 | [file-settings] 13 | 14 | sep = ' ' 15 | iob_pos = 3 16 | word_pos = 0 17 | pos_pos = 1 18 | -------------------------------------------------------------------------------- /data/SEC-filings/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [format] 9 | 10 | IO 11 | 12 | [content] 13 | 14 | [stats] 15 | 16 | tokens sentences 17 | -------------------------- 18 | train 41010 1164 19 | test 13246 303 20 | -------------------------- 21 | all 54256 1467 22 | 23 | [notes] 24 | 25 | train: FIN5 26 | test: FIN3 27 | -------------------------------------------------------------------------------- /data/SEC-filings/NOTE.txt: -------------------------------------------------------------------------------- 1 | This dataset was downloaded from: 2 | 3 | https://people.eng.unimelb.edu.au/tbaldwin/resources/finance-sec/ 4 | 5 | The License is contained in the README.txt. 6 | 7 | For more information on the dataset, see 8 | 9 | Julio Cesar Salinas Alvarado, Karin Verspoor and Timothy Baldwin (2015) 10 | Domain Adaption of Named Entity Recognition to Support Credit Risk Assessment, 11 | In Proceedings of 13th annual workshop of The Australasian Language Technology 12 | Association, Sydney, Australia. 
13 | 14 | -------------------------------------------------------------------------------- /data/SEC-filings/README.txt: -------------------------------------------------------------------------------- 1 | The dataset is generated using CoNll2003 data and financial documents obtained 2 | from U.S. Security and Exchange Commission (SEC) filings. 3 | 4 | LICENSE: 5 | 6 | This dataset is made available under the terms of the Creative Commons 7 | Attribution 3.0 Unported licence 8 | (http://creativecommons.org/licenses/by/3.0/), with attribution via citation 9 | of the following paper, which describes the dataset in full detail: 10 | 11 | Julio Cesar Salinas Alvarado, Karin Verspoor and Timothy Baldwin (2015) Domain 12 | Adaption of Named Entity Recognition to Support Credit Risk Assessment, In 13 | Proceedings of 13th annual workshop of The Australasian Language Technology 14 | Association, Sydney, Australia. 15 | 16 | CONTACTS: 17 | 18 | Julio Salinas (jsalinas@student.unimelb.edu.au) 19 | Karin Verspoor (karin.verspoor@unimelb.edu.au) 20 | Tim Baldwin (tb@ldwin.net) 21 | -------------------------------------------------------------------------------- /data/WNUT17/CITATION: -------------------------------------------------------------------------------- 1 | If you use this data, please cite the task paper: 2 | 3 | Leon Derczynski, Eric Nichols, Marieke van Erp, Nut Limsopatham (2017) "Results of the WNUT2017 Shared Task on Novel and Emerging Entity Recognition", in Proceedings of the 3rd Workshop on Noisy, User-generated Text. 
4 | 5 | -------------------------------------------------------------------------------- /data/WNUT17/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = person 4 | location 5 | group 6 | corporation 7 | product 8 | creative-work 9 | 10 | [IOB-format] 11 | 12 | IOB = IOB2 13 | 14 | [file-settings] 15 | 16 | sep = tab 17 | iob_pos = 1 18 | word_pos = 0 19 | pos_pos = none 20 | -------------------------------------------------------------------------------- /data/WNUT17/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | person 4 | location 5 | group 6 | corporation 7 | product 8 | creative-work 9 | 10 | [format] 11 | 12 | IOB2 13 | 14 | [notes] 15 | 16 | [content] 17 | 18 | Twitter 19 | 20 | [stats] 21 | 22 | tokens sentences 23 | ------------------------- 24 | train 62730 3394 25 | test 23394 1287 26 | dev 15733 1009 27 | all 101857 5690 28 | 29 | Entities: 30 | train test dev all 31 | -------------------------------------------- 32 | person 660 429 470 1559 33 | location 548 150 74 772 34 | group 264 165 39 468 35 | corporation 221 66 34 321 36 | product 142 127 114 383 37 | creative-work 140 142 104 386 38 | 39 | -------------------------------------------------------------------------------- /data/WNUT17/LICENSE: -------------------------------------------------------------------------------- 1 | License 2 | ------- 3 | 4 | This dataset is distributed under the [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license. 5 | 6 | If you use this data, please cite the relevant paper: 7 | 8 | Leon Derczynski, Eric Nichols, Marieke van Erp, Nut Limsopatham; 2017. 9 | Results of the WNUT2017 Shared Task on Novel and Emerging Entity Recognition. 10 | In Proceedings of the Workshop on Noisy, User-generated Text, at EMNLP. 
11 | (http://noisy-text.github.io/2017/pdf/WNUT18.pdf) 12 | 13 | 14 | Modifications 15 | ------------- 16 | 17 | A couple minor modifications were made; these are specified in file README.rst. 18 | 19 | -------------------------------------------------------------------------------- /data/WNUT17/README.rst: -------------------------------------------------------------------------------- 1 | WNUT 17 Emerging Entities Dataset 2 | ================================= 3 | 4 | This is the dataset for the WNUT 17 Emerging Entities task. It contains text 5 | from Twitter, Stack Overflow responses, YouTube comments, and Reddit comments. 6 | 7 | This was downloaded from: 8 | 9 | http://noisy-text.github.io/2017/emerging-rare-entities.html 10 | 11 | Specifically, the following (train, dev, test) files: 12 | 13 | http://noisy-text.github.io/2017/files/wnut17train.conll 14 | http://noisy-text.github.io/2017/files/emerging.dev.conll 15 | http://noisy-text.github.io/2017/files/emerging.test.annotated 16 | 17 | The same data can also be downloaded from: 18 | 19 | https://github.com/leondz/emerging_entities_17 20 | 21 | (The files were identical.) For more information about this dataset, see: 22 | 23 | http://noisy-text.github.io/2017/files/README.md 24 | 25 | and the following paper: 26 | 27 | Leon Derczynski, Eric Nichols, Marieke van Erp, Nut Limsopatham (2017) 28 | "Results of the WNUT2017 Shared Task on Novel and Emerging Entity Recognition", 29 | in Proceedings of the 3rd Workshop on Noisy, User-generated Text 30 | 31 | Annotation guidelines 32 | --------------------- 33 | 34 | See http://noisy-text.github.io/2017/files/ 35 | 36 | See also 37 | -------- 38 | 39 | The main site has links to papers for the conference. 40 | Also, see the related site for WNUT 2016. 41 | 42 | Data Cleaning 43 | ============== 44 | 45 | The following minor changes were made: 46 | 47 | 1. Removed the trailing whitespace from wnut17train.conll 48 | 2. 
Replaced opening curly quotes “ (\u201c) with " (in both dev and test files) 49 | 3. Replaced closing curly quotes ” (\u201d) with " (dev and test files) 50 | 4. The dev file could not be read because of Byte Order Mark (BOM) 51 | characters (\uFEFF). They were found with /[\uFEFF] and removed. 52 | 5. Fixed the following in file emerging.dev.conll: 53 | I-creative-work -> B-creative-work on line 1674 ("Turn") 54 | 55 | The files emerging.dev.conll, emerging.test.annotated and wnut17train.conll 56 | are in CONLL-format/data 57 | 58 | 59 | License 60 | ------- 61 | 62 | This data is distributed under the [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license. 63 | 64 | If you use this data, please cite the relevant paper: 65 | 66 | Leon Derczynski, Eric Nichols, Marieke van Erp, Nut Limsopatham; 2017. 67 | Results of the WNUT2017 Shared Task on Novel and Emerging Entity Recognition. 68 | In Proceedings of the Workshop on Noisy, User-generated Text, at EMNLP. 69 | (http://noisy-text.github.io/2017/pdf/WNUT18.pdf) 70 | 71 | 72 | -------------------------------------------------------------------------------- /data/conll2003/CONLL-format/data/README: -------------------------------------------------------------------------------- 1 | Due to licensing restrictions, we cannot share the CONLL 2003 dataset. 2 | 3 | It can be built from the Reuters corpus following the directions here: 4 | 5 | https://www.clips.uantwerpen.be/conll2003/ner/ 6 | 7 | In addition, several github repositories also provide the CONLL 2003 dataset. 8 | 9 | 10 | Place files eng.train, eng.testa (dev data), eng.testb (test data) here, 11 | and then delete this file. 
12 | 13 | 14 | -------------------------------------------------------------------------------- /data/conll2003/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [IOB-format] 9 | 10 | IOB = IOB1 11 | 12 | [file-settings] 13 | 14 | sep = ' ' 15 | iob_pos = 3 16 | word_pos = 0 17 | pos_pos = 1 18 | -------------------------------------------------------------------------------- /data/conll2003/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [format] 9 | 10 | IOB1 11 | 12 | [content] 13 | 14 | [stats] 15 | 16 | tokens sentences 17 | ----------------------------------- 18 | dev: 51361 3250 19 | test: 46434 3453 20 | train: 203612 14035 21 | ----------------------------------- 22 | all: 301407 20738 23 | 24 | 25 | dev test train all 26 | -------------------------------- 27 | PER: 1842 1617 6600 10059 28 | LOC: 1837 1668 7140 10645 29 | ORG: 1341 1661 6321 9323 30 | MISC: 922 702 3438 5062 31 | -------------------------------- 32 | ALL: 5942 5648 23499 35089 33 | 34 | [notes] 35 | 36 | dev set = test a 37 | test set = test b 38 | -------------------------------------------------------------------------------- /data/conll2003/README.rst: -------------------------------------------------------------------------------- 1 | NOTE 2 | ---- 3 | 4 | This is the original CONLL dataset, except for a couple 5 | corrections listed below: 6 | 7 | * Fixed tokenization of the following form: 8 | [X][.] 
-> [X.], and associated sentence tokenization errors, for: 9 | 10 | - Rep, Sen 11 | - U.S 12 | -------------------------------------------------------------------------------- /data/gmb/gmb-1.0.0/CONLL-format/data_original/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted data here. Then delete this file. 2 | -------------------------------------------------------------------------------- /data/gmb/gmb-1.0.0/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | TIM 9 | DAT 10 | 11 | [IOB-format] 12 | 13 | IOB = IOB2 14 | 15 | [file-settings] 16 | 17 | sep = tab 18 | iob_pos = 1 19 | word_pos = 0 20 | pos_pos = none 21 | -------------------------------------------------------------------------------- /data/gmb/gmb-1.0.0/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | PCT 7 | MON 8 | TIM 9 | DAT 10 | 11 | [format] 12 | 13 | IO 14 | 15 | [content] 16 | 17 | [stats] 18 | 19 | tokens sentences documents 20 | ----------------------------------------- 21 | VOA: 75407 3877 898 22 | CIA: 7039 344 98 23 | MASC: 282 18 4 24 | total: 82728 4239 1000 25 | 26 | VOA CIA MASC all 27 | ----------------------------------------- 28 | PER: 1091 34 7 1132 29 | LOC: 2909 248 6 3163 30 | ORG: 1355 191 9 1555 31 | PCT: 63 11 0 74 32 | MON: 124 6 8 138 33 | TIM: 58 2 1 61 34 | DAT: 1486 265 4 1755 35 | 36 | [notes] 37 | 38 | - There were 5 instances of TTL, and 1 of ART; these should be removed. 39 | - This is not a 'gold' corpus; it was not fully hand-annotated. 
40 | -------------------------------------------------------------------------------- /data/gmb/gmb-1.0.0/README.rst: -------------------------------------------------------------------------------- 1 | THE GMB dataset 2 | =============== 3 | 4 | The GMB dataset can be downloaded from: 5 | 6 | http://gmb.let.rug.nl/data.php 7 | 8 | Specifically: 9 | 10 | http://gmb.let.rug.nl/releases/gmb-1.0.0.zip 11 | 12 | Changes 13 | ------- 14 | 15 | Be aware of the following modifications: 16 | 17 | 1. There was only one instance of label "ART", in: 18 | "The Who is currently touring in support of [Endless Wire]_ART, its 19 | first album since 1982." 20 | 21 | This label was removed [located in p01/d0488] 22 | 23 | 2. To Split data into the sub-domains CIA, VOA and MASC, 24 | use the following: 25 | 26 | grep -lrnw . -e "Voice of America" | cut -d'/' -f-3 | cut -c 3- | xargs -I '{}' mkdir -p VOA/'{}'; mv ./'{}' VOA/'{}' 27 | 28 | grep -lrnw . -e "Voice of America" | cut -d'/' -f-3 | cut -c 3- | xargs -I '{}' mv ./'{}' VOA/'{}' 29 | 30 | 3. There are only five instances of "TTL", so these were removed. They were in: 31 | - IDB [President] Luis Alberto Moreno -> changed TTL to PER [p16/d0527] 32 | - [President] Bush signed a new anti-genetic [p73/d0102/] -> TTL to PER 33 | - [Professor] Comerio talks about the scope [p75/d0108/] -> TTL to PER 34 | - Mounting calls for [Senator] Hillary Clinton [p60/d0292] -> TTL to PER 35 | - [Rep.] Tony Hall, D-Ohio, urges the United (MASC) [p03/d0686] -> TTL to PER 36 | 37 | Note - all except for p03/d0686 were in VOA. 38 | 39 | -------------------------------------------------------------------------------- /data/i2b2_2006/CONLL-format/data/test/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted test data here; then remove this file. 
2 | -------------------------------------------------------------------------------- /data/i2b2_2006/CONLL-format/data/train/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted train data here; then remove this file. 2 | -------------------------------------------------------------------------------- /data/i2b2_2006/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = DATE 4 | DOCTOR 5 | HOSPITAL 6 | ID 7 | LOCATION 8 | PATIENT 9 | PHONE 10 | 11 | [IOB-format] 12 | 13 | IOB = none 14 | 15 | [file-settings] 16 | 17 | sep = tab 18 | iob_pos = 1 19 | word_pos = 0 20 | pos_pos = none 21 | -------------------------------------------------------------------------------- /data/i2b2_2006/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | DATE 4 | DOCTOR 5 | HOSPITAL 6 | ID 7 | LOCATION 8 | PATIENT 9 | PHONE 10 | 11 | [format] 12 | 13 | none 14 | 15 | [notes] 16 | 17 | [content] 18 | 19 | [stats] 20 | 21 | sentences tokens 22 | ------------------------------- 23 | train 49941 395272 24 | test 18095 166312 25 | ------------------------------- 26 | all 68036 561584 27 | 28 | Entities: 29 | train test all 30 | --------------------------------- 31 | DATE 5167 1931 7098 32 | DOCTOR 2681 1069 3750 33 | HOSPITAL 1723 675 2398 34 | ID 3666 1143 4809 35 | LOCATION 144 119 263 36 | PATIENT 684 245 929 37 | PHONE 174 58 232 38 | 39 | -------------------------------------------------------------------------------- /data/i2b2_2006/README.rst: -------------------------------------------------------------------------------- 1 | The i2b2 2006 dataset 2 | --------------------- 3 | 4 | The i2b2 2006 Deidentification and Smoking Challenge dataset 5 | (NLP Data Set #1B) can be obtained at: 6 | 7 | https://www.i2b2.org/NLP/DataSets/Main.php 8 | 9 | 
A data use agreement needs to be signed, so the dataset could not be 10 | included here. 11 | 12 | The dataset includes 889 de-identified discharge summaries with 13 | de-identification challenge annotations. 14 | 15 | 16 | Downloading the data 17 | -------------------- 18 | 19 | Download and unzip the following three files: 20 | 21 | deid_surrogate_test_all_groundtruth_version2.xml 22 | deid_surrogate_test_all_version2.xml 23 | deid_surrogate_train_all_version2.xml 24 | 25 | Move them to the data directory. 26 | 27 | Cleaning the data 28 | ----------------- 29 | 30 | 1. The original train xml file is not correct -- we needed to fix the 31 | mismatched tag on line 25722. 32 | 33 | 2. There were fewer than 20 mentions of 'AGE', so these were replaced by 'O'. 34 | 35 | Script to convert to CONLL 36 | -------------------------- 37 | 38 | There is a script, i2b2toconll.py (in the utils directory) that can be used 39 | to convert the data to CONLL format. 40 | 41 | To use it: 42 | 43 | import i2b2toconll 44 | i2b2toconll.write_all_to_conll() 45 | 46 | Note: one of the three files contains test data which is unlabelled, so the 47 | CONLL output from this can be removed. 48 | 49 | Cite as 50 | ------- 51 | 52 | If using the i2b2 2006 dataset, please cite as: 53 | 54 | Uzuner O, Luo Y, Szolovits P, "Evaluating the state-of-the-art in automatic 55 | de-identification". J Am Med Inform Assoc. 2007, 14(5):550-63 56 | 57 | -------------------------------------------------------------------------------- /data/i2b2_2014/CONLL-format/data/dev/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted dev (validation) data here; then remove this file. 
2 | -------------------------------------------------------------------------------- /data/i2b2_2014/CONLL-format/data/test/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted test data here; then remove this file. 2 | -------------------------------------------------------------------------------- /data/i2b2_2014/CONLL-format/data/train/README: -------------------------------------------------------------------------------- 1 | Put the CONLL-formatted train data here; then remove this file. 2 | -------------------------------------------------------------------------------- /data/i2b2_2014/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = USERNAME 4 | CITY 5 | PATIENT 6 | ZIP 7 | DOCTOR 8 | COUNTRY 9 | HOSPITAL 10 | PROFESSION 11 | PHONE 12 | STATE 13 | STREET 14 | MEDICALRECORD 15 | DATE 16 | ORGANIZATION 17 | AGE 18 | IDNUM 19 | 20 | [IOB-format] 21 | 22 | IOB = IOB2 23 | 24 | [file-settings] 25 | 26 | sep = ' ' 27 | iob_pos = 4 28 | word_pos = 0 29 | pos_pos = none 30 | -------------------------------------------------------------------------------- /data/i2b2_2014/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | USERNAME 4 | CITY 5 | PATIENT 6 | ZIP 7 | DOCTOR 8 | COUNTRY 9 | HOSPITAL 10 | PROFESSION 11 | PHONE 12 | STATE 13 | STREET 14 | MEDICALRECORD 15 | DATE 16 | ORGANIZATION 17 | AGE 18 | IDNUM 19 | 20 | [format] 21 | 22 | none 23 | 24 | [notes] 25 | 26 | [content] 27 | 28 | [stats] 29 | 30 | sentences tokens 31 | ---------------------------- 32 | train 22348 443503 33 | test 22335 414072 34 | dev 11306 198558 35 | ---------------------------- 36 | all 55989 1056133 37 | 38 | Entities all 39 | -------------------------- 40 | AGE 1997 41 | USERNAME 356 42 | IDNUM 441 43 | PATIENT 2196 44 | DOCTOR 4798 45 
| PROFESSION 413 46 | HOSPITAL 2273 47 | STREET 352 48 | STATE 504 49 | ZIP 352 50 | CITY 653 51 | COUNTRY 183 52 | ORGANIZATION 206 53 | DATE 12343 54 | MEDICALRECORD 1028 55 | PHONE 521 56 | 57 | -------------------------------------------------------------------------------- /data/i2b2_2014/README.rst: -------------------------------------------------------------------------------- 1 | Corpus 2 | ------ 3 | 4 | The 2014 De-identification dataset can be obtained at: 5 | 6 | https://www.i2b2.org/NLP/DataSets/ 7 | 8 | Specifically, download the "2014 De-identification and Heart Disease Risk 9 | Factors Challenge. A data use agreement needs to be signed, so the dataset 10 | could not be included here. 11 | 12 | Converting to CONLL format 13 | -------------------------- 14 | 15 | To obtain the i2b2 2014 deidentification corpus in CONLL format, we used 16 | the tools bundled with NeuroNER, available at: 17 | 18 | https://github.com/Franck-Dernoncourt/NeuroNER 19 | 20 | 1. First follow the instructions in NeuroNER/data/i2b2_2014_deid/readme.md 21 | Run the python script xml_to_brat.py (this requires Python 3). 22 | 23 | 2. Use the script brat_to_conll.py located in NeuroNER/src 24 | 25 | Specifically: 26 | 27 | We used 'spacy' (not 'stanford') for tokenizing. We 28 | downloaded the english language model en_core_web_sm-1.2.0.tar.gz from here: 29 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz 30 | 31 | Then: 32 | 33 | python3 -m spacy link en_core_web_sm en_default 34 | 35 | Then run the following in Python 3: 36 | 37 | import brat_to_conll 38 | brat_to_conll.brat_to_conll(input_folder, output_filepath, 'spacy', 'en_default') 39 | 40 | Run it three times (for the training, testing, and dev data) using the 41 | appropriate input_folder and output_filepath names. 
42 | 43 | Note on train/dev split 44 | ----------------------- 45 | 46 | Note: according to the xml_to_brat.py script: 47 | 48 | training-PHI-Gold-Set1 = training set 49 | training-PHI-Gold-Set2 = dev/validation 50 | 51 | It appears this is what was used in the paper Lee et al. (2017), "Transfer 52 | Learning for Named Entity Recognition with Neural Networks". They mention 53 | "60% [of train set] corresponds to the full official train set". The only 54 | way this makes sense is as the fraction train/(train+dev), which is closer to 55 | 66% (sentence level). 56 | 57 | We could not find this training/dev split mentioned in any other documentation 58 | or papers related to the corpus. 59 | 60 | Cleaning the data 61 | ----------------- 62 | 63 | The last few lines of file 180-03.xml are not formatted correctly in the 64 | final output; these were corrected manually. 65 | 66 | Several entity types had too few (<20) instances and were removed or merged. These were 67 | changed as follows: 68 | 69 | HEALTHPLAN -> O # 1 mention 70 | URL -> O # 2 mentions 71 | FAX -> O # 10 mentions 72 | EMAIL -> O # 5 mentions 73 | DEVICE -> O # 7 mentions 74 | LOCATION_OTHER -> O # 17 mentions 75 | BIOID -> IDNUM # 1 mention 76 | 77 | Remarks 78 | ------- 79 | 80 | There are still some sentence segmentation errors in the CONLL-formatted files. 81 | 82 | Cite as 83 | ------- 84 | 85 | If using the i2b2 2014 dataset, please cite as: 86 | 87 | - Stubbs A, Uzuner O. (2015). Annotating longitudinal clinical narratives for 88 | de-identification: The 2014 i2b2/UTHealth corpus 89 | (http://www.ncbi.nlm.nih.gov/pubmed/26319540). 90 | J Biomed Inform. 2015 Aug 28. PII: S1532-0464(15)00182-3. DOI: 10.1016/j.jbi.2015.07.020. 91 | 92 | - Stubbs A, Kotfila C, Uzuner O. (2015). Automated systems for the 93 | de-identification of longitudinal clinical narratives: Overview of 2014 94 | i2b2/UTHealth shared task Track 1 95 | (http://www.ncbi.nlm.nih.gov/pubmed/26225918). 96 | J Biomed Inform. 2015 Jul 28. 
PII: S1532-0464(15)00117-3. 97 | DOI: 10.1016/j.jbi.2015.06.007. 98 | 99 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Australian_Department_of_Foreign_Affairs/2FA0B45CA064AF09334944B6D68B5987.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | Members I-Organisation 3 | of I-Organisation 4 | the I-Organisation 5 | Security I-Organisation 6 | Council I-Organisation 7 | are O 8 | encouraged O 9 | by O 10 | the O 11 | decision O 12 | of O 13 | Iraqi B-Person 14 | President I-Person 15 | Fuad I-Person 16 | Masum I-Person 17 | to O 18 | nominate O 19 | a O 20 | new O 21 | Prime O 22 | Minister O 23 | - O 24 | designate O 25 | . O 26 | 27 | The O 28 | nomination O 29 | of O 30 | a B-Person 31 | new I-Person 32 | Prime I-Person 33 | Minister I-Person 34 | - I-Person 35 | designate I-Person 36 | is O 37 | an O 38 | important O 39 | step O 40 | toward O 41 | the O 42 | formation O 43 | of O 44 | an B-Organisation 45 | inclusive I-Organisation 46 | Government I-Organisation 47 | that O 48 | represents O 49 | all O 50 | segments O 51 | of O 52 | the B-Organisation 53 | Iraqi I-Organisation 54 | population I-Organisation 55 | and O 56 | that O 57 | contributes O 58 | to O 59 | finding O 60 | a O 61 | viable O 62 | and O 63 | sustainable O 64 | solution O 65 | to O 66 | the O 67 | country O 68 | ’ O 69 | s O 70 | current O 71 | challenges O 72 | . 
O 73 | 74 | The B-Organisation 75 | Members I-Organisation 76 | of I-Organisation 77 | the I-Organisation 78 | Security I-Organisation 79 | Council I-Organisation 80 | urge O 81 | the B-Person 82 | Prime I-Person 83 | Minister I-Person 84 | - I-Person 85 | designate I-Person 86 | , I-Person 87 | Haider I-Person 88 | al I-Person 89 | - I-Person 90 | Abadi I-Person 91 | , O 92 | to O 93 | work O 94 | swiftly O 95 | to O 96 | form O 97 | such O 98 | a O 99 | Government O 100 | as O 101 | quickly O 102 | as O 103 | possible O 104 | and O 105 | within O 106 | the O 107 | constitutional O 108 | time O 109 | - O 110 | frame O 111 | . O 112 | 113 | The B-Organisation 114 | Members I-Organisation 115 | of I-Organisation 116 | the I-Organisation 117 | Security I-Organisation 118 | Council I-Organisation 119 | urge O 120 | all B-Organisation 121 | political I-Organisation 122 | parties I-Organisation 123 | and O 124 | their B-Organisation 125 | supporters I-Organisation 126 | to O 127 | remain O 128 | calm O 129 | and O 130 | respect O 131 | the O 132 | political O 133 | process O 134 | governed O 135 | by O 136 | the B-DocumentReference 137 | Constitution I-DocumentReference 138 | . O 139 | 140 | The B-Organisation 141 | Members I-Organisation 142 | of I-Organisation 143 | the I-Organisation 144 | Security I-Organisation 145 | Council I-Organisation 146 | reiterate O 147 | their O 148 | support O 149 | for O 150 | Iraq B-Organisation 151 | ’ O 152 | s O 153 | democratic O 154 | process O 155 | and O 156 | their O 157 | support O 158 | for O 159 | the B-Organisation 160 | Iraq I-Organisation 161 | people I-Organisation 162 | in O 163 | their O 164 | fight O 165 | against O 166 | terrorism O 167 | , O 168 | in O 169 | particular O 170 | against O 171 | ISIL B-Organisation 172 | . 
O 173 | 174 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Australian_Department_of_Foreign_Affairs/58A3656BDCA7B89C38CE5A1985834740.conll: -------------------------------------------------------------------------------- 1 | Since O 2 | the O 3 | Syrian B-Nationality 4 | conflict O 5 | began O 6 | , O 7 | the B-Organisation 8 | Australian I-Organisation 9 | Government I-Organisation 10 | has O 11 | responded O 12 | with O 13 | over O 14 | $ B-Money 15 | 135 I-Money 16 | million I-Money 17 | for O 18 | humanitarian O 19 | assistance O 20 | . O 21 | 22 | At O 23 | the O 24 | request O 25 | of O 26 | DFAT B-Organisation 27 | ’ I-Organisation 28 | s I-Organisation 29 | Middle I-Organisation 30 | East I-Organisation 31 | Branch I-Organisation 32 | , O 33 | ODE B-Organisation 34 | has O 35 | completed O 36 | an O 37 | evaluation O 38 | examining O 39 | the O 40 | effectiveness O 41 | of O 42 | this O 43 | response O 44 | . O 45 | 46 | This O 47 | evaluation O 48 | considers O 49 | both O 50 | the O 51 | efficacy O 52 | of O 53 | material O 54 | assistance O 55 | provided O 56 | and O 57 | that O 58 | of O 59 | Australia B-Organisation 60 | ’ O 61 | s O 62 | diplomatic O 63 | efforts O 64 | . O 65 | 66 | It O 67 | also O 68 | identifies O 69 | some O 70 | important O 71 | ways O 72 | in O 73 | which O 74 | Australia B-Organisation 75 | ’ O 76 | s O 77 | response O 78 | to O 79 | this O 80 | , O 81 | and O 82 | other O 83 | , O 84 | protracted O 85 | crises O 86 | might O 87 | be O 88 | further O 89 | strengthened O 90 | . 
O 91 | 92 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Australian_Department_of_Foreign_Affairs/D863D678C3A4BD8EDFCE2B7D27881974.conll: -------------------------------------------------------------------------------- 1 | The B-Location 2 | area I-Location 3 | of I-Location 4 | Mosul I-Location 5 | district I-Location 6 | , I-Location 7 | Ninewa I-Location 8 | province I-Location 9 | in O 10 | Iraq B-Location 11 | , O 12 | has O 13 | been O 14 | declared O 15 | a O 16 | designated O 17 | area O 18 | under O 19 | section B-DocumentReference 20 | 119 I-DocumentReference 21 | . I-DocumentReference 22 | 3 I-DocumentReference 23 | of I-DocumentReference 24 | the I-DocumentReference 25 | Criminal I-DocumentReference 26 | Code I-DocumentReference 27 | . O 28 | 29 | This O 30 | means O 31 | it O 32 | will O 33 | be O 34 | an O 35 | offence O 36 | under O 37 | Australian B-Nationality 38 | law O 39 | to O 40 | enter O 41 | , O 42 | or O 43 | remain O 44 | in O 45 | , O 46 | Mosul B-Location 47 | district I-Location 48 | without O 49 | a O 50 | legitimate O 51 | purpose O 52 | . O 53 | 54 | Declaring O 55 | Mosul B-Location 56 | district I-Location 57 | under O 58 | the O 59 | criminal O 60 | code O 61 | follows O 62 | the O 63 | declaration O 64 | of O 65 | Al B-Location 66 | - I-Location 67 | Raqqa I-Location 68 | province I-Location 69 | in O 70 | Syria B-Location 71 | in O 72 | December B-Temporal 73 | 2014 I-Temporal 74 | . O 75 | 76 | The B-Location 77 | city I-Location 78 | of I-Location 79 | Mosul I-Location 80 | is O 81 | the O 82 | largest O 83 | city O 84 | controlled O 85 | by O 86 | Daesh B-Organisation 87 | in O 88 | Iraq B-Location 89 | . 
O 90 | 91 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Australian_Department_of_Foreign_Affairs/DEC7A9B44536B3E01C85E64BAF2E07A6.conll: -------------------------------------------------------------------------------- 1 | Foreign B-Person 2 | Minister I-Person 3 | Julie I-Person 4 | Bishop I-Person 5 | met O 6 | with O 7 | the O 8 | Minister B-Person 9 | of I-Person 10 | Foreign I-Person 11 | Affairs I-Person 12 | of I-Person 13 | Iraq I-Person 14 | , O 15 | HE B-Person 16 | Dr I-Person 17 | Ibrahim I-Person 18 | Al I-Person 19 | - I-Person 20 | Ja I-Person 21 | ’ I-Person 22 | afari I-Person 23 | , O 24 | in O 25 | Canberra B-Location 26 | on O 27 | 12 B-Temporal 28 | February I-Temporal 29 | to O 30 | discuss O 31 | bilateral O 32 | relations O 33 | and O 34 | reaffirm O 35 | our O 36 | shared O 37 | commitment O 38 | to O 39 | combating O 40 | the O 41 | terrorist O 42 | threat O 43 | of O 44 | Daesh B-Organisation 45 | ( O 46 | also O 47 | known O 48 | as O 49 | ISIL B-Organisation 50 | ) O 51 | . O 52 | 53 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/BBC_Online/DC40EB7B4BDB1254DAF8A430A8E5383B.conll: -------------------------------------------------------------------------------- 1 | At O 2 | least O 3 | 43 B-Quantity 4 | people B-Organisation 5 | have O 6 | been O 7 | killed O 8 | in O 9 | a O 10 | car B-Weapon 11 | bomb I-Weapon 12 | blast O 13 | in O 14 | the B-Location 15 | rebel I-Location 16 | - I-Location 17 | held I-Location 18 | Syrian I-Location 19 | town I-Location 20 | of I-Location 21 | Azaz I-Location 22 | , O 23 | near O 24 | the B-Location 25 | Turkish I-Location 26 | border I-Location 27 | . 
O 28 | 29 | The O 30 | explosion O 31 | occurred O 32 | outside O 33 | a B-Location 34 | courthouse I-Location 35 | in O 36 | the B-Location 37 | town I-Location 38 | , O 39 | just O 40 | 7km B-Quantity 41 | ( O 42 | four B-Quantity 43 | miles I-Quantity 44 | ) O 45 | from O 46 | the B-Location 47 | Turkish I-Location 48 | frontier I-Location 49 | . O 50 | 51 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/28829CF80D445874061C5948E6A3751D.conll: -------------------------------------------------------------------------------- 1 | On O 2 | Monday B-Temporal 3 | , I-Temporal 4 | Dec. I-Temporal 5 | 26 I-Temporal 6 | , O 7 | Abu B-Person 8 | Jandal I-Person 9 | al I-Person 10 | - I-Person 11 | Kuwaiti I-Person 12 | , O 13 | an O 14 | ISIL B-Organisation 15 | gang O 16 | leader O 17 | in O 18 | Raqqah B-Location 19 | was O 20 | struck O 21 | and O 22 | killed O 23 | by O 24 | a O 25 | coalition B-Organisation 26 | airstrike B-Weapon 27 | near O 28 | Tabqa B-Location 29 | Dam I-Location 30 | , I-Location 31 | Syria I-Location 32 | . O 33 | 34 | He B-Person 35 | was O 36 | a O 37 | previous O 38 | member O 39 | of O 40 | the B-Organisation 41 | ISIL I-Organisation 42 | ' I-Organisation 43 | s I-Organisation 44 | War I-Organisation 45 | Committee I-Organisation 46 | and O 47 | was O 48 | involved O 49 | in O 50 | their O 51 | retaking O 52 | of O 53 | Palmyra B-Location 54 | , I-Location 55 | Syria I-Location 56 | , O 57 | before O 58 | being O 59 | reassigned O 60 | to O 61 | Tabqa B-Location 62 | to O 63 | try O 64 | to O 65 | improve O 66 | ISIL B-Organisation 67 | ' O 68 | s O 69 | defenses O 70 | against O 71 | the B-Organisation 72 | Syrian I-Organisation 73 | Democratic I-Organisation 74 | Forces I-Organisation 75 | . 
O 76 | 77 | Abu B-Person 78 | Jandal I-Person 79 | was O 80 | involved O 81 | in O 82 | the O 83 | use O 84 | of O 85 | suicide B-MilitaryPlatform 86 | vehicles I-MilitaryPlatform 87 | , O 88 | IEDs B-Weapon 89 | and O 90 | chemical B-Weapon 91 | weapons I-Weapon 92 | against O 93 | the B-Organisation 94 | SDF I-Organisation 95 | . O 96 | 97 | Because O 98 | of O 99 | his B-Person 100 | associations O 101 | with O 102 | ISIL B-Organisation 103 | terror O 104 | attack O 105 | planners O 106 | and O 107 | War B-Organisation 108 | Council I-Organisation 109 | members O 110 | , O 111 | his B-Person 112 | death O 113 | will O 114 | degrade O 115 | ISIL B-Organisation 116 | ' O 117 | s O 118 | ability O 119 | to O 120 | defend O 121 | Raqqah B-Location 122 | and O 123 | launch O 124 | external O 125 | operations O 126 | against O 127 | the B-Organisation 128 | West I-Organisation 129 | . O 130 | 131 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/482C10613016D2F430C464B2D13A8F41.conll: -------------------------------------------------------------------------------- 1 | On O 2 | Dec. B-Temporal 3 | 6th I-Temporal 4 | , O 5 | Iraqi B-Organisation 6 | Security I-Organisation 7 | Forces I-Organisation 8 | attempted O 9 | to O 10 | seize O 11 | the B-Location 12 | Al I-Location 13 | Salem I-Location 14 | hospital I-Location 15 | complex I-Location 16 | from O 17 | ISIL B-Organisation 18 | fighters I-Organisation 19 | in O 20 | the B-Location 21 | heart I-Location 22 | of I-Location 23 | East I-Location 24 | Mosul I-Location 25 | . O 26 | 27 | ISIL B-Organisation 28 | was O 29 | using O 30 | the B-Location 31 | hospital I-Location 32 | as O 33 | a O 34 | base O 35 | of O 36 | operations O 37 | and O 38 | command O 39 | and O 40 | control O 41 | headquarters O 42 | . 
O 43 | 44 | After O 45 | seizing O 46 | the B-Location 47 | area I-Location 48 | , O 49 | the B-Organisation 50 | ISF I-Organisation 51 | fought O 52 | off O 53 | several O 54 | counterattacks O 55 | and O 56 | six B-Quantity 57 | VBIEDs B-Weapon 58 | the B-Temporal 59 | following I-Temporal 60 | day I-Temporal 61 | before O 62 | retrograding O 63 | a O 64 | short O 65 | distance O 66 | , O 67 | under O 68 | heavy O 69 | enemy O 70 | fire O 71 | , O 72 | to O 73 | strengthen O 74 | their B-Location 75 | position I-Location 76 | . O 77 | 78 | On O 79 | Dec. B-Temporal 80 | 7th I-Temporal 81 | , O 82 | after O 83 | Iraqi B-Organisation 84 | forces I-Organisation 85 | continued O 86 | to O 87 | receive O 88 | heavy O 89 | and O 90 | sustained O 91 | machine B-Weapon 92 | gun I-Weapon 93 | and O 94 | rocket B-Weapon 95 | propelled I-Weapon 96 | grenade I-Weapon 97 | fire O 98 | from O 99 | ISIL B-Organisation 100 | fighters I-Organisation 101 | in O 102 | a B-Location 103 | building I-Location 104 | on I-Location 105 | the I-Location 106 | hospital I-Location 107 | complex I-Location 108 | , O 109 | they O 110 | requested O 111 | immediate O 112 | support O 113 | from O 114 | the B-Organisation 115 | Coalition I-Organisation 116 | . O 117 | 118 | In O 119 | support O 120 | of O 121 | the B-Organisation 122 | Iraqi I-Organisation 123 | Security I-Organisation 124 | Forces I-Organisation 125 | , O 126 | Coalition O 127 | aircraft O 128 | conducted O 129 | a O 130 | precision O 131 | strike O 132 | on O 133 | the B-Location 134 | location I-Location 135 | to O 136 | target O 137 | enemy B-Organisation 138 | fighters I-Organisation 139 | firing O 140 | on O 141 | Iraqi B-Organisation 142 | forces I-Organisation 143 | . 
O 144 | 145 | The B-Organisation 146 | Coalition I-Organisation 147 | complies O 148 | with O 149 | the O 150 | Law O 151 | of O 152 | Armed O 153 | Conflict O 154 | and O 155 | takes O 156 | all O 157 | feasible O 158 | precautions O 159 | during O 160 | the O 161 | planning O 162 | and O 163 | execution O 164 | of O 165 | airstrikes O 166 | to O 167 | reduce O 168 | the O 169 | risk O 170 | of O 171 | harm O 172 | to O 173 | non O 174 | - O 175 | combatants O 176 | . O 177 | 178 | We B-Organisation 179 | will O 180 | continue O 181 | to O 182 | strike O 183 | ISIL B-Organisation 184 | military O 185 | targets O 186 | in O 187 | support O 188 | of O 189 | our B-Organisation 190 | partners I-Organisation 191 | in O 192 | order O 193 | to O 194 | defeat O 195 | ISIL B-Organisation 196 | in O 197 | Iraq B-Location 198 | . O 199 | 200 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/5FBCC6E1D5B4B2A699196670789C6B0B.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | Department I-Organisation 3 | of I-Organisation 4 | Defense I-Organisation 5 | announced O 6 | today B-Temporal 7 | the O 8 | death O 9 | of O 10 | a B-Person 11 | soldier I-Person 12 | who O 13 | was O 14 | supporting O 15 | Operation O 16 | Inherent O 17 | Resolve O 18 | . O 19 | 20 | Spc O 21 | . O 22 | 23 | Isiah B-Person 24 | L I-Person 25 | . I-Person 26 | Booker I-Person 27 | , O 28 | of O 29 | Cibolo B-Location 30 | , I-Location 31 | Texas I-Location 32 | , O 33 | died O 34 | Jan. B-Temporal 35 | 7 I-Temporal 36 | , O 37 | in O 38 | Jordan B-Location 39 | , O 40 | in O 41 | a O 42 | non O 43 | - O 44 | combat O 45 | related O 46 | incident O 47 | . O 48 | 49 | The O 50 | incident O 51 | is O 52 | under O 53 | investigation O 54 | . 
O 55 | 56 | Booker B-Person 57 | was O 58 | assigned O 59 | to O 60 | 2nd B-Organisation 61 | Battalion I-Organisation 62 | , O 63 | 5th B-Organisation 64 | Special I-Organisation 65 | Forces I-Organisation 66 | Group I-Organisation 67 | , O 68 | Fort B-Location 69 | Campbell I-Location 70 | , I-Location 71 | Kentucky I-Location 72 | . O 73 | 74 | For O 75 | more O 76 | information O 77 | , O 78 | media B-Organisation 79 | may O 80 | contact O 81 | the O 82 | U.S. B-Organisation 83 | Army I-Organisation 84 | Special I-Organisation 85 | Operations I-Organisation 86 | Command I-Organisation 87 | Public I-Organisation 88 | Affairs I-Organisation 89 | Office I-Organisation 90 | at O 91 | 910 O 92 | - O 93 | 432 O 94 | - O 95 | 3383 O 96 | . O 97 | 98 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/68A04D0D3111DBD88DE961B5727ADFA4.conll: -------------------------------------------------------------------------------- 1 | Gen. B-Person 2 | Joseph I-Person 3 | L. I-Person 4 | Votel I-Person 5 | , O 6 | U.S. B-Person 7 | Central I-Person 8 | Command I-Person 9 | commander I-Person 10 | , O 11 | released O 12 | the O 13 | following O 14 | statement O 15 | today B-Temporal 16 | : O 17 | 18 | " O 19 | We B-Organisation 20 | mourn O 21 | the O 22 | loss O 23 | of O 24 | one O 25 | of O 26 | our O 27 | own O 28 | , O 29 | Navy B-Person 30 | Special I-Person 31 | Warfare I-Person 32 | Operator I-Person 33 | 1st I-Person 34 | Class I-Person 35 | Charles I-Person 36 | H. I-Person 37 | Keating I-Person 38 | IV I-Person 39 | , O 40 | who O 41 | was O 42 | killed O 43 | yesterday B-Temporal 44 | in O 45 | northern B-Location 46 | Iraq I-Location 47 | . 
O 48 | 49 | Petty B-Person 50 | Officer I-Person 51 | Keating I-Person 52 | was O 53 | advising O 54 | and O 55 | assisting O 56 | our B-Organisation 57 | Kurdish I-Organisation 58 | partners I-Organisation 59 | , O 60 | in O 61 | support O 62 | of O 63 | Operation O 64 | Inherent O 65 | Resolve O 66 | . O 67 | " O 68 | 69 | " O 70 | Keating B-Person 71 | ' O 72 | s O 73 | honorable O 74 | and O 75 | selfless O 76 | service O 77 | was O 78 | in O 79 | keeping O 80 | with O 81 | the O 82 | finest O 83 | traditions O 84 | of O 85 | military O 86 | service O 87 | and O 88 | certainly O 89 | will O 90 | not O 91 | be O 92 | forgotten O 93 | as O 94 | we B-Organisation 95 | continue O 96 | to O 97 | put O 98 | pressure O 99 | on O 100 | and O 101 | ultimately O 102 | eliminate O 103 | the B-Organisation 104 | terrorists I-Organisation 105 | associated I-Organisation 106 | with I-Organisation 107 | ISIL I-Organisation 108 | . O 109 | 110 | On O 111 | behalf O 112 | of O 113 | the O 114 | men O 115 | and O 116 | women O 117 | of O 118 | U.S. B-Organisation 119 | Central I-Organisation 120 | Command I-Organisation 121 | , O 122 | I B-Person 123 | extend O 124 | my O 125 | personal O 126 | gratitude O 127 | and O 128 | our O 129 | deepest O 130 | condolences O 131 | to O 132 | his O 133 | family O 134 | , O 135 | friends O 136 | and O 137 | fellow O 138 | service B-Organisation 139 | members I-Organisation 140 | . O 141 | " O 142 | 143 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/C044623AD872323AF717F26F21E9352B.conll: -------------------------------------------------------------------------------- 1 | U.S. B-Organisation 2 | forces I-Organisation 3 | will O 4 | join O 5 | Jordan B-Organisation 6 | in O 7 | Exercise O 8 | Eager O 9 | Lion O 10 | 16 O 11 | , O 12 | one O 13 | of O 14 | U.S. 
B-Organisation 15 | Central I-Organisation 16 | Command I-Organisation 17 | ' O 18 | s O 19 | premiere O 20 | exercises O 21 | , O 22 | in O 23 | the B-Location 24 | Hashemite I-Location 25 | Kingdom I-Location 26 | of I-Location 27 | Jordan I-Location 28 | from O 29 | May B-Temporal 30 | 15 I-Temporal 31 | - I-Temporal 32 | 24 I-Temporal 33 | . O 34 | 35 | Exercise O 36 | Eager O 37 | Lion O 38 | 16 O 39 | will O 40 | consist O 41 | of O 42 | a O 43 | week O 44 | - O 45 | long O 46 | series O 47 | of O 48 | simulated O 49 | scenarios O 50 | to O 51 | facilitate O 52 | a O 53 | coordinated O 54 | partnered O 55 | military O 56 | response O 57 | to O 58 | conventional O 59 | and O 60 | unconventional O 61 | threats O 62 | . O 63 | 64 | The O 65 | scenarios O 66 | developed O 67 | will O 68 | include O 69 | border O 70 | security O 71 | , O 72 | command O 73 | and O 74 | control O 75 | , O 76 | cyber O 77 | defense O 78 | and O 79 | battlespace O 80 | management O 81 | . O 82 | 83 | About O 84 | 6 B-Quantity 85 | , I-Quantity 86 | 000 I-Quantity 87 | military O 88 | personnel O 89 | , O 90 | including O 91 | representatives O 92 | from O 93 | USCENTCOM B-Organisation 94 | headquarters O 95 | and O 96 | its O 97 | air O 98 | , O 99 | land O 100 | and O 101 | maritime O 102 | components O 103 | , O 104 | will O 105 | support O 106 | the O 107 | exercise O 108 | . O 109 | 110 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/E9035FF1D0DA74A11674C82688C05C51.conll: -------------------------------------------------------------------------------- 1 | U.S. 
B-Organisation 2 | Air I-Organisation 3 | Forces I-Organisation 4 | Central I-Organisation 5 | Command I-Organisation 6 | will O 7 | release O 8 | the O 9 | results O 10 | of O 11 | the O 12 | September B-Temporal 13 | 17 I-Temporal 14 | Dayr B-Location 15 | az I-Location 16 | Zawr I-Location 17 | , I-Location 18 | Syria I-Location 19 | airstrike O 20 | investigation O 21 | on O 22 | a O 23 | live O 24 | , O 25 | on O 26 | - O 27 | the O 28 | - O 29 | record O 30 | conference O 31 | call O 32 | open O 33 | to O 34 | journalists B-Organisation 35 | , O 36 | starting O 37 | at O 38 | 10 B-Temporal 39 | a.m. I-Temporal 40 | Nov. B-Temporal 41 | 29 I-Temporal 42 | , I-Temporal 43 | 2016 I-Temporal 44 | . O 45 | 46 | During O 47 | the O 48 | call O 49 | , O 50 | Brig. B-Person 51 | Gen. O 52 | Richard B-Person 53 | ' I-Person 54 | Tex I-Person 55 | ' I-Person 56 | Coe I-Person 57 | , O 58 | the O 59 | lead O 60 | investigating O 61 | officer O 62 | , O 63 | will O 64 | present O 65 | the O 66 | findings O 67 | and O 68 | recommendations O 69 | . O 70 | 71 | Questions O 72 | from O 73 | the B-Organisation 74 | Pentagon I-Organisation 75 | Press I-Organisation 76 | to O 77 | the B-Person 78 | investigating I-Person 79 | officer I-Person 80 | will I-Person 81 | follow I-Person 82 | Brig I-Person 83 | . I-Person 84 | 85 | Gen. B-Person 86 | Coe I-Person 87 | ’ O 88 | s O 89 | extended O 90 | opening O 91 | statement O 92 | , O 93 | and O 94 | Col. B-Person 95 | John I-Person 96 | J. I-Person 97 | Thomas I-Person 98 | , O 99 | Central B-Person 100 | Command I-Person 101 | spokesman I-Person 102 | will O 103 | moderate O 104 | . O 105 | 106 | The O 107 | call O 108 | is O 109 | scheduled O 110 | to O 111 | last O 112 | 45 B-Temporal 113 | minutes I-Temporal 114 | . O 115 | 116 | Those O 117 | able O 118 | to O 119 | participate O 120 | should O 121 | dial O 122 | 888 O 123 | - O 124 | 847 O 125 | - O 126 | 6592 O 127 | . 
O 128 | 129 | When O 130 | prompted O 131 | , O 132 | enter O 133 | Passcode B-DocumentReference 134 | 3080897 I-DocumentReference 135 | . O 136 | 137 | It O 138 | is O 139 | recommended O 140 | to O 141 | dial O 142 | in O 143 | 15 B-Temporal 144 | minutes I-Temporal 145 | prior O 146 | to O 147 | the O 148 | call O 149 | . O 150 | 151 | Each O 152 | individual O 153 | calling O 154 | in O 155 | will O 156 | be O 157 | asked O 158 | to O 159 | identify O 160 | themselves O 161 | and O 162 | media O 163 | affiliation O 164 | into O 165 | a O 166 | recording O 167 | before O 168 | being O 169 | automatically O 170 | connected O 171 | . O 172 | 173 | Pentagon B-Organisation 174 | Press I-Organisation 175 | will O 176 | gather O 177 | around O 178 | the O 179 | call O 180 | in O 181 | the B-Location 182 | Press I-Location 183 | Briefing I-Location 184 | Room I-Location 185 | and O 186 | be O 187 | called O 188 | on O 189 | to O 190 | ask O 191 | their O 192 | questions O 193 | by O 194 | Navy B-Person 195 | Capt. I-Person 196 | Jeff B-Person 197 | Davis I-Person 198 | who O 199 | will O 200 | also O 201 | be O 202 | in O 203 | the B-Location 204 | room I-Location 205 | at O 206 | the B-Location 207 | Pentagon I-Location 208 | . O 209 | 210 | Time O 211 | permitting O 212 | , O 213 | journalists B-Organisation 214 | outside O 215 | the B-Organisation 216 | Pentagon I-Organisation 217 | will O 218 | also O 219 | be O 220 | able O 221 | to O 222 | ask O 223 | questions O 224 | . O 225 | 226 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/CENTCOM/F7893F2F4E38C8E00C7D22E22E100428.conll: -------------------------------------------------------------------------------- 1 | U.S. B-Person 2 | Army I-Person 3 | General I-Person 4 | Joseph I-Person 5 | Votel I-Person 6 | , O 7 | commander O 8 | , O 9 | U.S. 
B-Organisation 10 | Central I-Organisation 11 | Command I-Organisation 12 | recently O 13 | met O 14 | with O 15 | U.S. B-Organisation 16 | Air I-Organisation 17 | Force I-Organisation 18 | civilian I-Organisation 19 | employees I-Organisation 20 | and O 21 | provided O 22 | an O 23 | overview O 24 | of O 25 | the O 26 | CENTCOM B-Organisation 27 | vision O 28 | , O 29 | the O 30 | current O 31 | mission O 32 | , O 33 | and O 34 | strategic O 35 | approach O 36 | . O 37 | 38 | Votel B-Person 39 | explained O 40 | U.S. B-Organisation 41 | Central I-Organisation 42 | Command I-Organisation 43 | ’ O 44 | s O 45 | top O 46 | priorities O 47 | : O 48 | 49 | - O 50 | Ensure O 51 | an O 52 | effective O 53 | posture O 54 | . O 55 | 56 | - O 57 | Strengthen O 58 | relationships O 59 | with O 60 | allies B-Organisation 61 | . O 62 | 63 | - O 64 | Deter O 65 | and O 66 | counter O 67 | state O 68 | aggression O 69 | . O 70 | 71 | - O 72 | Disrupt O 73 | and O 74 | counter O 75 | violent O 76 | extremism O 77 | . O 78 | 79 | Votel B-Person 80 | expressed O 81 | his O 82 | desire O 83 | that O 84 | the O 85 | command O 86 | attracts O 87 | and O 88 | retains O 89 | the O 90 | very O 91 | best O 92 | people O 93 | to O 94 | support O 95 | these O 96 | critical O 97 | priorities O 98 | and O 99 | that O 100 | they O 101 | are O 102 | fully O 103 | enabled O 104 | to O 105 | develop O 106 | and O 107 | contribute O 108 | to O 109 | the O 110 | mission O 111 | . 
O 112 | 113 | He B-Person 114 | further O 115 | stressed O 116 | the O 117 | need O 118 | to O 119 | develop O 120 | our O 121 | future O 122 | leaders O 123 | through O 124 | timely O 125 | feedback O 126 | from O 127 | supervisors O 128 | , O 129 | mentorship O 130 | programs O 131 | , O 132 | and O 133 | opportunities O 134 | and O 135 | resources O 136 | for O 137 | growth O 138 | , O 139 | while O 140 | noting O 141 | the O 142 | importance O 143 | of O 144 | timely O 145 | and O 146 | appropriate O 147 | recognition O 148 | for O 149 | superior O 150 | performance O 151 | throughout O 152 | an O 153 | employee O 154 | ’ O 155 | s O 156 | tenure O 157 | with O 158 | the B-Organisation 159 | command I-Organisation 160 | . O 161 | 162 | The B-Organisation 163 | command I-Organisation 164 | intends O 165 | to O 166 | hold O 167 | three O 168 | town O 169 | halls O 170 | next B-Temporal 171 | year I-Temporal 172 | with O 173 | varying O 174 | agendas O 175 | to O 176 | convey O 177 | useful O 178 | information O 179 | to O 180 | the B-Organisation 181 | civilian I-Organisation 182 | workforce I-Organisation 183 | and O 184 | to O 185 | take O 186 | questions O 187 | and O 188 | recommendations O 189 | . 
O 190 | 191 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/09FF200CBB76594D688F8EB565717543.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | , O 7 | Tobias B-Person 8 | Ellwood I-Person 9 | , O 10 | responds O 11 | to O 12 | the O 13 | attack O 14 | in O 15 | the B-Location 16 | Karrada I-Location 17 | district I-Location 18 | of I-Location 19 | Baghdad I-Location 20 | 21 | Foreign B-Person 22 | Office I-Person 23 | Minister I-Person 24 | , O 25 | Tobias B-Person 26 | Ellwood I-Person 27 | , O 28 | said O 29 | : O 30 | 31 | The O 32 | attack O 33 | was O 34 | a O 35 | horrific O 36 | act O 37 | of O 38 | terror O 39 | against O 40 | innocent O 41 | people O 42 | in O 43 | the O 44 | early B-Temporal 45 | hours I-Temporal 46 | of I-Temporal 47 | Sunday I-Temporal 48 | morning O 49 | in O 50 | Karrada B-Location 51 | , O 52 | Baghdad B-Location 53 | . O 54 | 55 | I B-Person 56 | offer O 57 | my O 58 | sincere O 59 | condolences O 60 | to O 61 | the O 62 | families O 63 | and O 64 | friends O 65 | of O 66 | the O 67 | victims O 68 | , O 69 | and O 70 | hope O 71 | those O 72 | injured O 73 | in O 74 | the O 75 | attack O 76 | recover O 77 | quickly O 78 | . O 79 | 80 | Daesh B-Organisation 81 | targeted O 82 | families B-Organisation 83 | out O 84 | shopping O 85 | for O 86 | Eid O 87 | and O 88 | those O 89 | celebrating O 90 | suhur O 91 | . O 92 | 93 | It O 94 | violated O 95 | the O 96 | peace O 97 | of O 98 | Ramadan B-Temporal 99 | . O 100 | 101 | The B-Organisation 102 | UK I-Organisation 103 | stands O 104 | by O 105 | Iraq B-Organisation 106 | to O 107 | defeat O 108 | Daesh B-Organisation 109 | and O 110 | end O 111 | this O 112 | violence O 113 | . 
O 114 | 115 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/1101048768E50FA1F8DA3F7B7CD105AB.conll: -------------------------------------------------------------------------------- 1 | Ministers B-Organisation 2 | discussed O 3 | the O 4 | Syria B-Location 5 | crisis O 6 | and O 7 | conflict O 8 | in O 9 | Ukraine B-Location 10 | 11 | Boris O 12 | Johnson O 13 | 14 | A O 15 | Foreign O 16 | Office O 17 | spokesman O 18 | said O 19 | : O 20 | “ O 21 | The B-Person 22 | Foreign I-Person 23 | Secretary I-Person 24 | met O 25 | Russian B-Person 26 | Foreign I-Person 27 | Minister I-Person 28 | Sergei I-Person 29 | Lavrov I-Person 30 | earlier O 31 | today O 32 | on O 33 | the O 34 | margins O 35 | of O 36 | the O 37 | UNGA B-Organisation 38 | meeting O 39 | in O 40 | New O 41 | York O 42 | , O 43 | their O 44 | first O 45 | face O 46 | - O 47 | to O 48 | - O 49 | face O 50 | meeting O 51 | since O 52 | Boris B-Person 53 | Johnson I-Person 54 | was O 55 | appointed O 56 | . O 57 | 58 | “ O 59 | The B-Organisation 60 | ministers I-Organisation 61 | discussed O 62 | the O 63 | crisis O 64 | in O 65 | Syria B-Location 66 | and O 67 | the O 68 | increase O 69 | in O 70 | violence O 71 | in O 72 | recent B-Temporal 73 | days I-Temporal 74 | , O 75 | including O 76 | the O 77 | recent O 78 | unacceptable O 79 | attack O 80 | on O 81 | an O 82 | aid O 83 | convoy O 84 | . O 85 | 86 | The B-Person 87 | Foreign I-Person 88 | Secretary I-Person 89 | pressed O 90 | for O 91 | Russia B-Organisation 92 | to O 93 | use O 94 | its O 95 | influence O 96 | constructively O 97 | and O 98 | underlined O 99 | the O 100 | need O 101 | for O 102 | the B-Organisation 103 | international I-Organisation 104 | community I-Organisation 105 | to O 106 | work O 107 | together O 108 | to O 109 | resolve O 110 | the O 111 | conflict O 112 | . 
O 113 | 114 | “ O 115 | Mr B-Person 116 | Johnson I-Person 117 | also O 118 | raised O 119 | the O 120 | situation O 121 | in O 122 | Ukraine B-Location 123 | , O 124 | where O 125 | there O 126 | are O 127 | continued O 128 | violations O 129 | of O 130 | the B-DocumentReference 131 | Minsk I-DocumentReference 132 | agreements I-DocumentReference 133 | , O 134 | and O 135 | stressed O 136 | the O 137 | need O 138 | for O 139 | a O 140 | sustainable O 141 | ceasefire O 142 | in O 143 | Eastern B-Location 144 | Ukraine I-Location 145 | . O 146 | 147 | “ O 148 | The B-Person 149 | Foreign I-Person 150 | Secretary I-Person 151 | was O 152 | clear O 153 | that O 154 | while O 155 | ongoing O 156 | disagreements O 157 | mean O 158 | it O 159 | is O 160 | too O 161 | early O 162 | to O 163 | normalise O 164 | diplomatic O 165 | relations O 166 | , O 167 | it O 168 | is O 169 | right O 170 | that O 171 | both B-Organisation 172 | countries I-Organisation 173 | continue O 174 | to O 175 | discuss O 176 | important O 177 | global O 178 | issues O 179 | . 
O 180 | ” O 181 | 182 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/1563238E6CF89746FDD44909F11EE319.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias I-Person 7 | Ellwood I-Person 8 | calls O 9 | ISIL B-Organisation 10 | presence O 11 | in O 12 | Palmyra B-Location 13 | a O 14 | tragedy O 15 | 16 | Commenting O 17 | on O 18 | reports O 19 | that O 20 | ISIL B-Organisation 21 | forces I-Organisation 22 | have O 23 | reached O 24 | the B-Location 25 | UNESCO I-Location 26 | World I-Location 27 | Heritage I-Location 28 | Site I-Location 29 | at I-Location 30 | Palmyra I-Location 31 | in O 32 | Syria B-Location 33 | , O 34 | Foreign B-Person 35 | Office I-Person 36 | Minister I-Person 37 | Tobias I-Person 38 | Ellwood I-Person 39 | said O 40 | : O 41 | 42 | ISIL B-Organisation 43 | ’ O 44 | s O 45 | presence O 46 | in O 47 | the B-Location 48 | ancient I-Location 49 | site I-Location 50 | of I-Location 51 | Palmyra I-Location 52 | is O 53 | a O 54 | tragedy O 55 | for O 56 | all B-Organisation 57 | Syrians I-Organisation 58 | and O 59 | its O 60 | destruction O 61 | would O 62 | be O 63 | a O 64 | terrible O 65 | act O 66 | of O 67 | vandalism O 68 | . 
O 69 | 70 | This O 71 | is O 72 | part O 73 | of O 74 | a O 75 | much O 76 | wider O 77 | human O 78 | catastrophe O 79 | : O 80 | hundreds B-Quantity 81 | of I-Quantity 82 | thousands I-Quantity 83 | of O 84 | Syrians B-Organisation 85 | have O 86 | been O 87 | killed O 88 | at O 89 | the O 90 | hands O 91 | of O 92 | Assad B-Person 93 | and O 94 | ISIL B-Organisation 95 | , O 96 | millions B-Quantity 97 | have O 98 | been O 99 | displaced O 100 | and O 101 | historic O 102 | cities O 103 | such O 104 | as O 105 | Aleppo B-Location 106 | have O 107 | been O 108 | reduced O 109 | to O 110 | rubble O 111 | by O 112 | regime B-Organisation 113 | bombardment B-Weapon 114 | . O 115 | 116 | The B-Organisation 117 | UK I-Organisation 118 | is O 119 | working O 120 | with O 121 | its B-Organisation 122 | allies I-Organisation 123 | to O 124 | support O 125 | the B-Organisation 126 | moderate I-Organisation 127 | Syrian I-Organisation 128 | Opposition I-Organisation 129 | to O 130 | defend O 131 | the B-Organisation 132 | Syrian I-Organisation 133 | people I-Organisation 134 | from O 135 | Assad B-Person 136 | ’ O 137 | s O 138 | inhumanity O 139 | and O 140 | ISIL B-Organisation 141 | ’ O 142 | s O 143 | savagery O 144 | . 
O 145 | 146 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/1718A3656CA87E3826A28BC3D041F024.conll: -------------------------------------------------------------------------------- 1 | Philip B-Person 2 | Hammond I-Person 3 | condemns O 4 | apparent O 5 | barrel B-Weapon 6 | bomb I-Weapon 7 | attacks O 8 | in O 9 | Syria B-Location 10 | which O 11 | have O 12 | killed O 13 | and O 14 | injured O 15 | more O 16 | innocent O 17 | civilians O 18 | 19 | The B-Person 20 | Foreign I-Person 21 | Secretary I-Person 22 | Philip I-Person 23 | Hammond I-Person 24 | said O 25 | : O 26 | 27 | I B-Person 28 | am O 29 | appalled O 30 | by O 31 | this O 32 | latest O 33 | brutal O 34 | attack O 35 | by O 36 | Assad B-Organisation 37 | ’ I-Organisation 38 | s I-Organisation 39 | regime I-Organisation 40 | , O 41 | just O 42 | over O 43 | a O 44 | week O 45 | after O 46 | video O 47 | footage O 48 | emerged O 49 | showing O 50 | Syrian B-Organisation 51 | regime I-Organisation 52 | aircrew I-Organisation 53 | dropping O 54 | barrel B-Weapon 55 | bombs I-Weapon 56 | out O 57 | of O 58 | helicopters B-MilitaryPlatform 59 | . O 60 | 61 | This O 62 | is O 63 | further O 64 | shocking O 65 | proof O 66 | of O 67 | the O 68 | horrific O 69 | and O 70 | indiscriminate O 71 | methods O 72 | the B-Organisation 73 | Assad I-Organisation 74 | regime I-Organisation 75 | is O 76 | using O 77 | to O 78 | kill O 79 | and O 80 | injure O 81 | innocent O 82 | civilians B-Organisation 83 | , O 84 | including O 85 | children B-Organisation 86 | . O 87 | 88 | Our O 89 | position O 90 | remains O 91 | as O 92 | strong O 93 | as O 94 | ever O 95 | – O 96 | we B-Organisation 97 | will O 98 | continue O 99 | to O 100 | call O 101 | for O 102 | a O 103 | political O 104 | transition O 105 | to O 106 | a O 107 | future O 108 | in O 109 | which O 110 | Assad B-Person 111 | has O 112 | no O 113 | part O 114 | . 
O 115 | 116 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/208C0A8E536C75AB9D151D7030276BE9.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | UN I-Organisation 3 | Security I-Organisation 4 | Council I-Organisation 5 | voted O 6 | unanimously O 7 | to O 8 | adopt O 9 | Resolution B-DocumentReference 10 | 2249 I-DocumentReference 11 | , O 12 | which O 13 | condemned O 14 | the O 15 | 13 B-Temporal 16 | November I-Temporal 17 | attacks O 18 | in O 19 | Paris B-Location 20 | and O 21 | others O 22 | in O 23 | the B-Location 24 | Middle I-Location 25 | East I-Location 26 | , O 27 | including O 28 | those O 29 | in O 30 | Sousse B-Location 31 | on O 32 | 25 B-Temporal 33 | June I-Temporal 34 | in O 35 | which O 36 | 30 O 37 | Britons O 38 | were O 39 | murdered O 40 | . O 41 | 42 | Foreign B-Person 43 | Secretary I-Person 44 | Philip I-Person 45 | Hammond I-Person 46 | said O 47 | : O 48 | 49 | The O 50 | horrific O 51 | attacks O 52 | in O 53 | Paris B-Location 54 | have O 55 | stiffened O 56 | the O 57 | resolve O 58 | of O 59 | the B-Organisation 60 | international I-Organisation 61 | community I-Organisation 62 | in O 63 | its O 64 | determination O 65 | to O 66 | defeat O 67 | ISIL B-Organisation 68 | , O 69 | whose O 70 | acts O 71 | of O 72 | terror O 73 | in O 74 | Europe B-Location 75 | and O 76 | the B-Location 77 | Middle I-Location 78 | East I-Location 79 | aim O 80 | to O 81 | challenge O 82 | our O 83 | basic O 84 | values O 85 | and O 86 | way O 87 | of O 88 | life O 89 | . 
O 90 | 91 | The B-Organisation 92 | United I-Organisation 93 | Nations I-Organisation 94 | Security I-Organisation 95 | Council I-Organisation 96 | has O 97 | demonstrated O 98 | the O 99 | unity O 100 | of O 101 | the B-Organisation 102 | international I-Organisation 103 | community I-Organisation 104 | in O 105 | working O 106 | to O 107 | defeat O 108 | ISIL B-Organisation 109 | . O 110 | 111 | We B-Organisation 112 | must O 113 | all O 114 | act O 115 | by O 116 | continuing O 117 | to O 118 | cut O 119 | off O 120 | its O 121 | finance O 122 | , O 123 | severing O 124 | its O 125 | access O 126 | to O 127 | oil O 128 | and O 129 | stemming O 130 | the O 131 | flow O 132 | of O 133 | foreign B-Organisation 134 | fighters I-Organisation 135 | , O 136 | in O 137 | addition O 138 | to O 139 | the O 140 | military O 141 | action O 142 | we O 143 | are O 144 | taking O 145 | against O 146 | this O 147 | evil O 148 | ideology O 149 | . O 150 | 151 | ISIL B-Organisation 152 | ’ O 153 | s O 154 | murderous O 155 | violence O 156 | requires O 157 | a O 158 | strong O 159 | security O 160 | response O 161 | . O 162 | 163 | That O 164 | is O 165 | why O 166 | the B-Organisation 167 | UK I-Organisation 168 | is O 169 | working O 170 | as O 171 | part O 172 | of O 173 | the B-Organisation 174 | Global I-Organisation 175 | Coalition I-Organisation 176 | to O 177 | degrade O 178 | and O 179 | defeat O 180 | ISIL B-Organisation 181 | in O 182 | Syria B-Location 183 | and O 184 | Iraq B-Location 185 | and O 186 | , O 187 | when O 188 | necessary O 189 | , O 190 | striking O 191 | against O 192 | those O 193 | who O 194 | pose O 195 | a O 196 | threat O 197 | to O 198 | the B-Organisation 199 | UK I-Organisation 200 | , O 201 | and O 202 | our O 203 | people O 204 | and O 205 | interests O 206 | . 
O 207 | 208 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/32AACD4CCB88ACB2C0FAB156E7BDCC2F.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | UN I-Organisation 3 | have O 4 | confirmed O 5 | reports O 6 | that O 7 | ISIL B-Organisation 8 | has O 9 | killed O 10 | 70 B-Quantity 11 | members B-Organisation 12 | of I-Organisation 13 | the I-Organisation 14 | Al I-Organisation 15 | bu I-Organisation 16 | Nimr I-Organisation 17 | tribe I-Organisation 18 | in O 19 | Anbar B-Location 20 | , I-Location 21 | Iraq I-Location 22 | . O 23 | 24 | Minister B-Person 25 | for I-Person 26 | the I-Person 27 | Middle I-Person 28 | East I-Person 29 | , O 30 | Tobias B-Person 31 | Ellwood I-Person 32 | said O 33 | : O 34 | 35 | I B-Person 36 | am O 37 | deeply O 38 | concerned O 39 | and O 40 | shocked O 41 | by O 42 | reports O 43 | that O 44 | ISIL B-Organisation 45 | has O 46 | killed O 47 | 70 B-Quantity 48 | members B-Organisation 49 | of O 50 | the B-Organisation 51 | Al I-Organisation 52 | Bu I-Organisation 53 | Nimr I-Organisation 54 | tribe I-Organisation 55 | in O 56 | Anbar B-Location 57 | province I-Location 58 | , O 59 | Iraq B-Location 60 | . O 61 | 62 | We B-Organisation 63 | have O 64 | been O 65 | clear O 66 | , O 67 | defeating O 68 | ISIL B-Organisation 69 | will O 70 | take O 71 | time O 72 | and O 73 | patience O 74 | but O 75 | it O 76 | is O 77 | a O 78 | fight O 79 | we B-Organisation 80 | must O 81 | win O 82 | . O 83 | 84 | They B-Organisation 85 | falsely O 86 | use O 87 | the O 88 | name O 89 | of O 90 | Islam B-Nationality 91 | to O 92 | commit O 93 | atrocities O 94 | and O 95 | have O 96 | once O 97 | again O 98 | slaughtered O 99 | Sunnis O 100 | , O 101 | those O 102 | they B-Organisation 103 | claim O 104 | to O 105 | be O 106 | defending O 107 | . 
O 108 | 109 | Over O 110 | the B-Temporal 111 | last I-Temporal 112 | year I-Temporal 113 | the B-Organisation 114 | Global I-Organisation 115 | Coalition I-Organisation 116 | has O 117 | launched O 118 | almost O 119 | 8 B-Quantity 120 | , I-Quantity 121 | 000 I-Quantity 122 | strikes O 123 | against O 124 | ISIL B-Organisation 125 | in O 126 | Iraq B-Location 127 | and O 128 | Syria B-Location 129 | and O 130 | have O 131 | taken O 132 | back O 133 | 30 O 134 | % O 135 | of O 136 | territory O 137 | ISIL B-Organisation 138 | once O 139 | controlled O 140 | in O 141 | Iraq B-Location 142 | . O 143 | 144 | United O 145 | , O 146 | we B-Organisation 147 | continue O 148 | to O 149 | work O 150 | to O 151 | defeat O 152 | ISIL B-Organisation 153 | on O 154 | all O 155 | fronts O 156 | : O 157 | not O 158 | just O 159 | militarily O 160 | , O 161 | but O 162 | cutting O 163 | off O 164 | their O 165 | finance O 166 | , O 167 | reducing O 168 | the O 169 | influx O 170 | of O 171 | fighters B-Organisation 172 | , O 173 | challenging O 174 | the O 175 | ideology O 176 | and O 177 | providing O 178 | humanitarian O 179 | assistance O 180 | to O 181 | those O 182 | in O 183 | need O 184 | . 
O 185 | 186 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/33A216AD3A006BFF03390675E6C56594.conll: -------------------------------------------------------------------------------- 1 | FCO B-Person 2 | Minister I-Person 3 | Tobias I-Person 4 | Ellwood I-Person 5 | welcomes O 6 | new O 7 | budget O 8 | deal O 9 | as O 10 | first O 11 | step O 12 | towards O 13 | negotiating O 14 | a O 15 | solution O 16 | to O 17 | the O 18 | issue O 19 | of O 20 | federal O 21 | budget O 22 | allocations O 23 | 24 | Tobias B-Person 25 | Ellwood I-Person 26 | , I-Person 27 | Foreign I-Person 28 | Office I-Person 29 | Minister I-Person 30 | for I-Person 31 | the I-Person 32 | Middle I-Person 33 | East I-Person 34 | , O 35 | said O 36 | : O 37 | 38 | I B-Person 39 | welcome O 40 | the O 41 | agreement O 42 | reached O 43 | yesterday B-Temporal 44 | between O 45 | the B-Organisation 46 | Government I-Organisation 47 | of I-Organisation 48 | Iraq I-Organisation 49 | and O 50 | the B-Organisation 51 | Kurdistan I-Organisation 52 | Regional I-Organisation 53 | Government I-Organisation 54 | on O 55 | the O 56 | issue O 57 | of O 58 | federal O 59 | budget O 60 | allocations O 61 | . O 62 | 63 | Whilst O 64 | this O 65 | is O 66 | an O 67 | interim O 68 | deal O 69 | , O 70 | it O 71 | is O 72 | a O 73 | positive O 74 | and O 75 | welcome O 76 | first O 77 | step O 78 | which O 79 | demonstrates O 80 | that O 81 | both B-Organisation 82 | parties I-Organisation 83 | are O 84 | ready O 85 | and O 86 | willing O 87 | to O 88 | negotiate O 89 | seriously O 90 | and O 91 | in O 92 | good O 93 | faith O 94 | . 
O 95 | 96 | I B-Person 97 | encourage O 98 | the B-Organisation 99 | Government I-Organisation 100 | of I-Organisation 101 | Iraq I-Organisation 102 | and O 103 | the B-Organisation 104 | Kurdistan I-Organisation 105 | Regional I-Organisation 106 | Government I-Organisation 107 | to O 108 | now O 109 | work O 110 | together O 111 | to O 112 | find O 113 | a O 114 | long O 115 | - O 116 | term O 117 | , O 118 | comprehensive O 119 | and O 120 | constitutional O 121 | solution O 122 | to O 123 | this O 124 | issue O 125 | . O 126 | 127 | This O 128 | will O 129 | require O 130 | flexibility O 131 | and O 132 | compromise O 133 | on O 134 | both O 135 | sides O 136 | ; O 137 | however O 138 | I B-Person 139 | have O 140 | no O 141 | doubt O 142 | that O 143 | a O 144 | resolution O 145 | will O 146 | bring O 147 | benefits O 148 | to O 149 | Iraqis B-Organisation 150 | across O 151 | the O 152 | country O 153 | . O 154 | 155 | ISIL B-Organisation 156 | is O 157 | a O 158 | threat O 159 | to O 160 | all O 161 | Iraqis O 162 | and O 163 | it O 164 | is O 165 | important O 166 | that O 167 | Iraq B-Location 168 | ’ O 169 | s O 170 | communities O 171 | unite O 172 | in O 173 | the O 174 | face O 175 | of O 176 | this O 177 | common O 178 | enemy O 179 | . O 180 | 181 | I B-Person 182 | hope O 183 | that O 184 | yesterday B-Temporal 185 | ’ O 186 | s O 187 | agreement O 188 | can O 189 | contribute O 190 | positively O 191 | to O 192 | this O 193 | process O 194 | . 
O 195 | 196 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/3A7EC63500380D5B35C211D4FF5E20EA.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | Tobias I-Person 3 | Ellwood I-Person 4 | welcomes O 5 | the O 6 | Government B-Organisation 7 | of I-Organisation 8 | Iraq I-Organisation 9 | ’ O 10 | s O 11 | liberation O 12 | of O 13 | Tikrit B-Location 14 | and O 15 | reiterates O 16 | UK B-Organisation 17 | support O 18 | for O 19 | Iraq B-Location 20 | in O 21 | the O 22 | fight O 23 | against O 24 | ISIL B-Organisation 25 | . O 26 | 27 | Speaking O 28 | today B-Temporal 29 | , O 30 | Minister B-Person 31 | Ellwood I-Person 32 | said O 33 | : O 34 | 35 | I B-Person 36 | welcome O 37 | the O 38 | news O 39 | that O 40 | the B-Organisation 41 | Government I-Organisation 42 | of I-Organisation 43 | Iraq I-Organisation 44 | has O 45 | raised O 46 | the O 47 | Iraqi B-Nationality 48 | flag O 49 | again O 50 | in O 51 | Tikrit B-Location 52 | . O 53 | 54 | The B-Organisation 55 | UK I-Organisation 56 | is O 57 | committed O 58 | to O 59 | supporting O 60 | the B-Organisation 61 | Iraqi I-Organisation 62 | government I-Organisation 63 | against O 64 | the O 65 | terrorism O 66 | of O 67 | the O 68 | vile O 69 | organisation O 70 | ISIL B-Organisation 71 | . O 72 | 73 | However O 74 | whilst O 75 | this O 76 | is O 77 | a O 78 | critical O 79 | achievement O 80 | it O 81 | is O 82 | a O 83 | first O 84 | step O 85 | in O 86 | a O 87 | long O 88 | campaign O 89 | . 
O 90 | 91 | We B-Organisation 92 | are O 93 | proud O 94 | that O 95 | the B-Organisation 96 | international I-Organisation 97 | coalition I-Organisation 98 | has O 99 | supported O 100 | the B-Organisation 101 | Iraqi I-Organisation 102 | Security I-Organisation 103 | Forces I-Organisation 104 | in O 105 | its O 106 | efforts O 107 | to O 108 | defeat O 109 | ISIL B-Organisation 110 | and O 111 | despite O 112 | their O 113 | propaganda O 114 | machine O 115 | , O 116 | ISIL B-Organisation 117 | are O 118 | losing O 119 | this O 120 | fight O 121 | . O 122 | 123 | I B-Person 124 | echo O 125 | Prime B-Person 126 | Minister I-Person 127 | Abadi I-Person 128 | ’ O 129 | s O 130 | recent O 131 | call O 132 | that O 133 | all O 134 | parties O 135 | respect O 136 | the O 137 | human O 138 | rights O 139 | of O 140 | citizens B-Organisation 141 | in O 142 | Tikrit B-Location 143 | who O 144 | have O 145 | already O 146 | suffered O 147 | so O 148 | much O 149 | at O 150 | the O 151 | hands O 152 | of O 153 | brutal O 154 | ISIL B-Organisation 155 | terrorists O 156 | . O 157 | 158 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/7064D852A8BF7FB8126978D3AC40E4FC.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias I-Person 7 | Ellwood I-Person 8 | comments O 9 | on O 10 | reports O 11 | of O 12 | violations O 13 | of O 14 | the O 15 | ceasefire O 16 | in O 17 | Syria B-Location 18 | announced O 19 | by O 20 | Russia B-Organisation 21 | and O 22 | Turkey B-Organisation 23 | on O 24 | 29 B-Temporal 25 | December I-Temporal 26 | . 
O 27 | 28 | Minister B-Person 29 | Ellwood I-Person 30 | said O 31 | : O 32 | 33 | We B-Organisation 34 | are O 35 | concerned O 36 | by O 37 | reports O 38 | of O 39 | ceasefire O 40 | violations O 41 | and O 42 | call O 43 | on O 44 | all O 45 | parties O 46 | to O 47 | respect O 48 | the O 49 | ceasefire O 50 | , O 51 | particularly O 52 | around O 53 | the B-Location 54 | Wadi I-Location 55 | Barada I-Location 56 | area I-Location 57 | of O 58 | Damascus B-Location 59 | , O 60 | where O 61 | civilians O 62 | are O 63 | facing O 64 | ongoing O 65 | shelling O 66 | and O 67 | airstrikes O 68 | by O 69 | the B-Organisation 70 | Asad I-Organisation 71 | regime I-Organisation 72 | and O 73 | militias B-Organisation 74 | like I-Organisation 75 | Hezbollah I-Organisation 76 | . O 77 | 78 | More O 79 | than O 80 | five B-Quantity 81 | million I-Quantity 82 | people B-Organisation 83 | across O 84 | Damascus B-Location 85 | also O 86 | remain O 87 | without O 88 | access O 89 | to O 90 | the O 91 | central O 92 | water O 93 | supply O 94 | . O 95 | 96 | It O 97 | is O 98 | vital O 99 | that O 100 | all O 101 | guarantors B-Organisation 102 | of I-Organisation 103 | the I-Organisation 104 | ceasefire I-Organisation 105 | use O 106 | their O 107 | influence O 108 | to O 109 | ensure O 110 | it O 111 | is O 112 | adhered O 113 | to O 114 | , O 115 | and O 116 | that O 117 | all B-Organisation 118 | parties I-Organisation 119 | take O 120 | greater O 121 | steps O 122 | to O 123 | ensure O 124 | humanitarian O 125 | aid O 126 | reaches O 127 | all O 128 | those O 129 | in O 130 | need O 131 | across O 132 | Syria B-Location 133 | . 
O 134 | 135 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/7148F3D092DA3E560332528D7167DF0F.conll: -------------------------------------------------------------------------------- 1 | Philip B-Person 2 | Hammond I-Person 3 | welcomes O 4 | agreement O 5 | between O 6 | Iraqi O 7 | and O 8 | Kurdistan B-Organisation 9 | Regional I-Organisation 10 | governments I-Organisation 11 | on O 12 | oil O 13 | exports O 14 | and O 15 | revenue O 16 | sharing O 17 | . O 18 | 19 | The O 20 | Foreign O 21 | Secretary O 22 | Philip O 23 | Hammond O 24 | said O 25 | : O 26 | 27 | I B-Person 28 | very O 29 | much O 30 | welcome O 31 | the O 32 | agreement O 33 | on O 34 | 2 B-Temporal 35 | December I-Temporal 36 | between O 37 | the B-Organisation 38 | Federal I-Organisation 39 | Government I-Organisation 40 | of I-Organisation 41 | Iraq I-Organisation 42 | , O 43 | led O 44 | by O 45 | Prime B-Person 46 | Minister I-Person 47 | Haider I-Person 48 | Al I-Person 49 | Abadi I-Person 50 | , O 51 | and O 52 | the B-Organisation 53 | Kurdistan I-Organisation 54 | Regional I-Organisation 55 | Government I-Organisation 56 | , O 57 | represented O 58 | by O 59 | Prime B-Person 60 | Minister I-Person 61 | Nechirvan I-Person 62 | Barzani I-Person 63 | , O 64 | on O 65 | detailed O 66 | arrangements O 67 | on O 68 | oil O 69 | exports O 70 | and O 71 | revenue O 72 | sharing O 73 | . O 74 | 75 | Iraq B-Organisation 76 | faces O 77 | significant O 78 | challenges O 79 | and O 80 | all B-Organisation 81 | communities I-Organisation 82 | must O 83 | cooperate O 84 | to O 85 | defeat O 86 | ISIL B-Organisation 87 | militarily O 88 | , O 89 | assist O 90 | the O 91 | displaced O 92 | and O 93 | provide O 94 | an O 95 | inclusive O 96 | future O 97 | for O 98 | the B-Organisation 99 | country I-Organisation 100 | . 
O 101 | 102 | I B-Person 103 | welcome O 104 | the O 105 | important O 106 | steps O 107 | that O 108 | have O 109 | been O 110 | taken O 111 | by O 112 | Prime B-Person 113 | Minister I-Person 114 | Abadi I-Person 115 | and O 116 | his B-Organisation 117 | government I-Organisation 118 | in O 119 | recent O 120 | weeks O 121 | in O 122 | support O 123 | of O 124 | this O 125 | , O 126 | including O 127 | today B-DocumentReference 128 | ’ I-DocumentReference 129 | s I-DocumentReference 130 | announcement I-DocumentReference 131 | . O 132 | 133 | I B-Person 134 | congratulate O 135 | all O 136 | concerned O 137 | for O 138 | their O 139 | work O 140 | to O 141 | resolve O 142 | this O 143 | long O 144 | - O 145 | standing O 146 | disagreement O 147 | . O 148 | 149 | The B-Organisation 150 | UK I-Organisation 151 | has O 152 | supported O 153 | these O 154 | efforts O 155 | at O 156 | dialogue O 157 | throughout O 158 | . O 159 | 160 | I B-Person 161 | hope O 162 | that O 163 | all B-Organisation 164 | Iraqi I-Organisation 165 | leaders I-Organisation 166 | will O 167 | follow O 168 | this O 169 | constructive O 170 | approach O 171 | to O 172 | addressing O 173 | practical O 174 | challenges O 175 | , O 176 | in O 177 | the O 178 | interests O 179 | of O 180 | national O 181 | reconciliation O 182 | and O 183 | unity O 184 | against O 185 | a B-Organisation 186 | common I-Organisation 187 | enemy I-Organisation 188 | who O 189 | are O 190 | a O 191 | threat O 192 | to O 193 | all O 194 | . 
O 195 | 196 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/74E7DCD5876B4EF1E95448F0E0C073CD.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias B-Person 7 | Ellwood I-Person 8 | marks O 9 | one B-Temporal 10 | year I-Temporal 11 | since O 12 | Yazidi B-Nationality 13 | plight O 14 | on O 15 | Mount B-Location 16 | Sinjar I-Location 17 | at O 18 | the O 19 | hands O 20 | of O 21 | ISIL B-Organisation 22 | . O 23 | 24 | Speaking O 25 | today O 26 | Mr B-Person 27 | Ellwood I-Person 28 | said O 29 | : O 30 | 31 | A B-Temporal 32 | year I-Temporal 33 | ago I-Temporal 34 | , O 35 | the B-Organisation 36 | UK I-Organisation 37 | ’ I-Organisation 38 | s I-Organisation 39 | brave I-Organisation 40 | Armed I-Organisation 41 | Forces I-Organisation 42 | and O 43 | aid B-Organisation 44 | workers I-Organisation 45 | played O 46 | a O 47 | crucial O 48 | role O 49 | in O 50 | getting O 51 | vital O 52 | supplies O 53 | to O 54 | innocent O 55 | civilians O 56 | suffering O 57 | at O 58 | the O 59 | hands O 60 | of O 61 | ISIL B-Organisation 62 | . O 63 | 64 | The B-Organisation 65 | Yazidi I-Organisation 66 | community I-Organisation 67 | were O 68 | exposed O 69 | , O 70 | starving O 71 | and O 72 | dying O 73 | of O 74 | thirst O 75 | when O 76 | they O 77 | were O 78 | forced O 79 | up O 80 | Mount B-Location 81 | Sinjar I-Location 82 | after O 83 | fleeing O 84 | from O 85 | their B-Location 86 | homes I-Location 87 | . O 88 | 89 | The B-Organisation 90 | UK I-Organisation 91 | Government I-Organisation 92 | conducted O 93 | seven B-Quantity 94 | airdrops O 95 | of O 96 | life O 97 | - O 98 | saving O 99 | humanitarian O 100 | aid O 101 | . 
O 102 | 103 | We B-Organisation 104 | will O 105 | continue O 106 | to O 107 | protect O 108 | all O 109 | those O 110 | persecuted O 111 | by O 112 | these B-Organisation 113 | brutal I-Organisation 114 | terrorists I-Organisation 115 | as O 116 | part O 117 | of O 118 | a B-Organisation 119 | 63 I-Organisation 120 | nation I-Organisation 121 | coalition I-Organisation 122 | designed O 123 | to O 124 | defeat O 125 | ISIL B-Organisation 126 | ’ O 127 | s O 128 | poisonous O 129 | ideology O 130 | . O 131 | 132 | We B-Organisation 133 | acknowledge O 134 | that O 135 | we B-Organisation 136 | need O 137 | to O 138 | do O 139 | more O 140 | in O 141 | the O 142 | fight O 143 | against O 144 | violent O 145 | Islamist B-Nationality 146 | extremism O 147 | . O 148 | 149 | Be O 150 | in O 151 | no O 152 | doubt O 153 | , O 154 | we B-Organisation 155 | are O 156 | committed O 157 | to O 158 | destroying O 159 | the O 160 | terrorist O 161 | scourge O 162 | of O 163 | ISIL B-Organisation 164 | that O 165 | threatens O 166 | the O 167 | lives O 168 | of O 169 | people B-Organisation 170 | from I-Organisation 171 | all I-Organisation 172 | communities I-Organisation 173 | in I-Organisation 174 | both I-Organisation 175 | Iraq I-Organisation 176 | and I-Organisation 177 | Syria I-Organisation 178 | . O 179 | 180 | Defeating O 181 | ISIL B-Organisation 182 | will O 183 | take O 184 | time O 185 | and O 186 | patience O 187 | but O 188 | it O 189 | is O 190 | a O 191 | fight O 192 | we B-Organisation 193 | must O 194 | win O 195 | . 
O 196 | 197 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/821F5BBD9F50DD7301C12ECCC926B00C.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias I-Person 7 | Ellwood I-Person 8 | comments O 9 | following O 10 | an O 11 | attack O 12 | on O 13 | a B-Location 14 | petrol I-Location 15 | station I-Location 16 | and O 17 | restaurant B-Location 18 | in O 19 | al B-Location 20 | - I-Location 21 | Hilla I-Location 22 | in O 23 | Iraq B-Location 24 | 25 | Minister B-Person 26 | Tobias I-Person 27 | Ellwood I-Person 28 | said O 29 | : O 30 | 31 | I B-Person 32 | utterly O 33 | condemn O 34 | the O 35 | shocking O 36 | attack O 37 | on O 38 | a B-Location 39 | petrol I-Location 40 | station I-Location 41 | and O 42 | restaurant B-Location 43 | at O 44 | al B-Location 45 | - I-Location 46 | Hilla I-Location 47 | earlier O 48 | today O 49 | , O 50 | for O 51 | which O 52 | Daesh B-Organisation 53 | has O 54 | claimed O 55 | responsibility O 56 | . O 57 | 58 | The B-Organisation 59 | UK I-Organisation 60 | continues O 61 | to O 62 | stand O 63 | shoulder O 64 | to O 65 | shoulder O 66 | with O 67 | the B-Organisation 68 | government I-Organisation 69 | and O 70 | people B-Organisation 71 | of I-Organisation 72 | Iraq I-Organisation 73 | in O 74 | the O 75 | fight O 76 | against O 77 | terrorism O 78 | . O 79 | 80 | Such O 81 | an O 82 | attack O 83 | shows O 84 | the O 85 | desperation O 86 | of O 87 | Daesh B-Organisation 88 | as O 89 | they O 90 | see O 91 | their O 92 | so O 93 | - O 94 | called O 95 | caliphate O 96 | crumble O 97 | around O 98 | them O 99 | . 
O 100 | 101 | My B-Person 102 | thoughts O 103 | are O 104 | with O 105 | the B-Organisation 106 | family I-Organisation 107 | and I-Organisation 108 | friends I-Organisation 109 | of O 110 | those B-Organisation 111 | who I-Organisation 112 | have I-Organisation 113 | been I-Organisation 114 | killed I-Organisation 115 | and I-Organisation 116 | injured I-Organisation 117 | in O 118 | such O 119 | a O 120 | horrific O 121 | act O 122 | . O 123 | 124 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/85F0CF36CAEE839799563350B30B96B1.conll: -------------------------------------------------------------------------------- 1 | Boris B-Person 2 | Johnson I-Person 3 | met O 4 | the B-Person 5 | Iraqi I-Person 6 | Foreign I-Person 7 | Minister I-Person 8 | , O 9 | Ibrahim B-Person 10 | al I-Person 11 | - I-Person 12 | Jaafari I-Person 13 | in O 14 | London B-Location 15 | today B-Temporal 16 | . O 17 | 18 | A B-Person 19 | Foreign I-Person 20 | Office I-Person 21 | spokesman I-Person 22 | said O 23 | : O 24 | 25 | “ O 26 | The B-Person 27 | Foreign I-Person 28 | Secretary I-Person 29 | met O 30 | the B-Person 31 | Iraqi I-Person 32 | Foreign I-Person 33 | Minister I-Person 34 | , O 35 | Ibrahim B-Person 36 | al I-Person 37 | - I-Person 38 | Jaafari I-Person 39 | , O 40 | to O 41 | discuss O 42 | a O 43 | range O 44 | of O 45 | issues O 46 | including O 47 | the O 48 | ongoing O 49 | fight O 50 | and O 51 | the O 52 | progress O 53 | being O 54 | made O 55 | against O 56 | Daesh B-Organisation 57 | in O 58 | Iraq B-Location 59 | . O 60 | 61 | They B-Organisation 62 | also O 63 | discussed O 64 | military O 65 | , O 66 | humanitarian O 67 | and O 68 | stabilisation O 69 | planning O 70 | for O 71 | forthcoming O 72 | operations O 73 | to O 74 | free O 75 | the B-Location 76 | city I-Location 77 | of I-Location 78 | Mosul I-Location 79 | . 
O 80 | 81 | “ O 82 | The B-Person 83 | Foreign I-Person 84 | Secretary I-Person 85 | underlined O 86 | the B-Organisation 87 | UK I-Organisation 88 | ’ O 89 | s O 90 | commitment O 91 | to O 92 | supporting O 93 | the B-Organisation 94 | government I-Organisation 95 | of I-Organisation 96 | Iraq I-Organisation 97 | to O 98 | defeat O 99 | terrorism O 100 | and O 101 | to O 102 | give O 103 | all O 104 | Iraqis B-Organisation 105 | new O 106 | hope O 107 | for O 108 | the O 109 | future O 110 | . O 111 | 112 | They B-Organisation 113 | also O 114 | agreed O 115 | to O 116 | work O 117 | together O 118 | with O 119 | international B-Organisation 120 | partners I-Organisation 121 | and O 122 | at O 123 | the B-Organisation 124 | UN I-Organisation 125 | on O 126 | a O 127 | global O 128 | campaign O 129 | to O 130 | bring O 131 | Daesh B-Organisation 132 | to O 133 | justice O 134 | . O 135 | ” O 136 | 137 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/8A716D1B7D81B723DFD8361BAEA100DC.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias I-Person 7 | Ellwood I-Person 8 | has O 9 | commented O 10 | on O 11 | the O 12 | long O 13 | - O 14 | overdue O 15 | delivery O 16 | of O 17 | aid O 18 | to O 19 | civilians B-Organisation 20 | in O 21 | Madaya B-Location 22 | . O 23 | 24 | Foreign B-Person 25 | Office I-Person 26 | Minister I-Person 27 | Tobias I-Person 28 | Ellwood I-Person 29 | said O 30 | : O 31 | 32 | I O 33 | am O 34 | deeply O 35 | relieved O 36 | by O 37 | the O 38 | delivery O 39 | of O 40 | life O 41 | - O 42 | saving O 43 | food O 44 | and O 45 | aid O 46 | supplies O 47 | to O 48 | Madaya B-Location 49 | . 
O 50 | 51 | Although O 52 | long O 53 | overdue O 54 | , O 55 | these O 56 | supplies O 57 | will O 58 | help O 59 | relieve O 60 | the O 61 | suffering O 62 | of O 63 | 40 B-Organisation 64 | , I-Organisation 65 | 000 I-Organisation 66 | civilians I-Organisation 67 | besieged O 68 | within O 69 | the B-Location 70 | city I-Location 71 | . O 72 | 73 | The B-Organisation 74 | UN I-Organisation 75 | Security I-Organisation 76 | Council I-Organisation 77 | is O 78 | clear O 79 | that O 80 | the O 81 | use O 82 | of O 83 | starvation O 84 | of O 85 | civilians O 86 | as O 87 | a O 88 | method O 89 | of O 90 | combat O 91 | , O 92 | including O 93 | the O 94 | besiegement O 95 | of O 96 | populated B-Location 97 | areas I-Location 98 | , O 99 | is O 100 | unacceptable O 101 | . O 102 | 103 | But O 104 | last B-Temporal 105 | year I-Temporal 106 | just O 107 | 10 B-Quantity 108 | per I-Quantity 109 | cent I-Quantity 110 | of O 111 | UN B-Organisation 112 | convoy O 113 | requests O 114 | to O 115 | besieged B-Location 116 | areas I-Location 117 | were O 118 | approved O 119 | by O 120 | the B-Organisation 121 | Asad I-Organisation 122 | regime I-Organisation 123 | . O 124 | 125 | The B-Organisation 126 | International I-Organisation 127 | Community I-Organisation 128 | must O 129 | do O 130 | all O 131 | it B-Organisation 132 | can O 133 | to O 134 | pressure O 135 | the B-Organisation 136 | Asad I-Organisation 137 | regime I-Organisation 138 | to O 139 | lift O 140 | sieges O 141 | in O 142 | place O 143 | across O 144 | Syria B-Location 145 | and O 146 | sustain O 147 | humanitarian O 148 | access O 149 | . O 150 | 151 | The B-Organisation 152 | UK I-Organisation 153 | is O 154 | at O 155 | the O 156 | forefront O 157 | of O 158 | global O 159 | efforts O 160 | to O 161 | provide O 162 | humanitarian O 163 | aid O 164 | having O 165 | committed O 166 | over O 167 | £ O 168 | 1 O 169 | . 
O 170 | 1bn O 171 | to O 172 | vulnerable O 173 | and O 174 | displaced O 175 | people O 176 | inside O 177 | Syria B-Location 178 | and O 179 | to O 180 | refugees B-Organisation 181 | in O 182 | neighbouring B-Location 183 | countries I-Location 184 | . O 185 | 186 | In O 187 | February B-Temporal 188 | we O 189 | will O 190 | bring O 191 | together O 192 | world B-Organisation 193 | leaders I-Organisation 194 | in O 195 | London B-Location 196 | to O 197 | identify O 198 | long O 199 | - O 200 | term O 201 | solutions O 202 | to O 203 | help O 204 | address O 205 | the O 206 | needs O 207 | of O 208 | those O 209 | affected O 210 | by O 211 | the O 212 | crisis O 213 | . O 214 | 215 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/C35F45FE7CBA87AE07646066842C91E0.conll: -------------------------------------------------------------------------------- 1 | Foreign B-Person 2 | Secretary I-Person 3 | Boris I-Person 4 | Johnson I-Person 5 | welcomes O 6 | the O 7 | UN B-DocumentReference 8 | Board I-DocumentReference 9 | of I-DocumentReference 10 | Inquiry I-DocumentReference 11 | ’ I-DocumentReference 12 | s I-DocumentReference 13 | summary I-DocumentReference 14 | of I-DocumentReference 15 | its I-DocumentReference 16 | investigation I-DocumentReference 17 | into O 18 | the O 19 | attack O 20 | on O 21 | an B-Organisation 22 | aid I-Organisation 23 | convoy I-Organisation 24 | , O 25 | which O 26 | took O 27 | place O 28 | in O 29 | Urem B-Location 30 | al I-Location 31 | - I-Location 32 | Kubra I-Location 33 | , I-Location 34 | northern I-Location 35 | Syria I-Location 36 | , O 37 | on O 38 | 19 B-Temporal 39 | September I-Temporal 40 | . 
O 41 | 42 | The B-Person 43 | Foreign I-Person 44 | Secretary I-Person 45 | said O 46 | : O 47 | 48 | I B-Person 49 | welcome O 50 | the B-DocumentReference 51 | Board I-DocumentReference 52 | of I-DocumentReference 53 | Inquiry I-DocumentReference 54 | ’ I-DocumentReference 55 | s I-DocumentReference 56 | report I-DocumentReference 57 | , O 58 | which O 59 | found O 60 | that O 61 | the B-Organisation 62 | aid I-Organisation 63 | convoy I-Organisation 64 | was O 65 | attacked O 66 | from O 67 | the O 68 | air O 69 | , O 70 | making O 71 | it O 72 | clear O 73 | that O 74 | the O 75 | strike O 76 | was O 77 | carried O 78 | out O 79 | by O 80 | forces O 81 | of O 82 | either O 83 | the O 84 | Asad B-Organisation 85 | regime I-Organisation 86 | or O 87 | Russia O 88 | . O 89 | 90 | This O 91 | was O 92 | one O 93 | of O 94 | the O 95 | most O 96 | appalling O 97 | incidents O 98 | of O 99 | the O 100 | conflict O 101 | so O 102 | far O 103 | . O 104 | 105 | Whether O 106 | deliberate O 107 | or O 108 | accidental O 109 | , O 110 | this O 111 | was O 112 | an O 113 | indefensible O 114 | attack O 115 | , O 116 | with O 117 | at O 118 | least O 119 | ten B-Quantity 120 | people B-Organisation 121 | killed O 122 | and O 123 | vital O 124 | aid O 125 | to O 126 | vulnerable B-Organisation 127 | people I-Organisation 128 | destroyed O 129 | . O 130 | 131 | Over B-Quantity 132 | half I-Quantity 133 | a I-Quantity 134 | million I-Quantity 135 | Syrians B-Organisation 136 | are O 137 | still O 138 | in O 139 | areas O 140 | besieged O 141 | by O 142 | the B-Organisation 143 | regime I-Organisation 144 | – O 145 | many O 146 | have O 147 | received O 148 | no O 149 | aid O 150 | since O 151 | June B-Temporal 152 | . 
O 153 | 154 | The B-Organisation 155 | regime I-Organisation 156 | and O 157 | its B-Organisation 158 | backers I-Organisation 159 | must O 160 | comply O 161 | with O 162 | their O 163 | obligations O 164 | , O 165 | including O 166 | the O 167 | recently O 168 | adopted O 169 | UN B-Organisation 170 | Security I-Organisation 171 | Council I-Organisation 172 | Resolution O 173 | 2328 O 174 | , O 175 | to O 176 | allow O 177 | the O 178 | UN B-Organisation 179 | unconditional O 180 | , O 181 | safe O 182 | and O 183 | unhindered O 184 | access O 185 | across O 186 | Syria B-Location 187 | . O 188 | 189 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/C3BB2D0EE548765402C961463415E4CA.conll: -------------------------------------------------------------------------------- 1 | Philip B-Person 2 | Hammond I-Person 3 | welcomes O 4 | UNSC B-Organisation 5 | support O 6 | for O 7 | new O 8 | talks O 9 | to O 10 | end O 11 | Syrian B-Nationality 12 | conflict O 13 | . O 14 | 15 | Foreign B-Person 16 | Secretary I-Person 17 | Philip I-Person 18 | Hammond I-Person 19 | welcomed O 20 | the O 21 | unanimous O 22 | adoption O 23 | of O 24 | UN B-DocumentReference 25 | Security I-DocumentReference 26 | Council I-DocumentReference 27 | Resolution I-DocumentReference 28 | 2254 I-DocumentReference 29 | as O 30 | an O 31 | important O 32 | step O 33 | towards O 34 | bringing O 35 | the O 36 | warring O 37 | parties O 38 | in O 39 | Syria B-Location 40 | to O 41 | the O 42 | negotiating O 43 | table O 44 | and O 45 | to O 46 | a O 47 | transition O 48 | away O 49 | from O 50 | the B-Organisation 51 | Assad I-Organisation 52 | regime I-Organisation 53 | to O 54 | an O 55 | inclusive O 56 | Syrian O 57 | government O 58 | . 
O 59 | 60 | Mr B-Person 61 | Hammond I-Person 62 | said O 63 | : O 64 | 65 | Syria B-Location 66 | has O 67 | for O 68 | too O 69 | long O 70 | been O 71 | the O 72 | world O 73 | ’ O 74 | s O 75 | biggest O 76 | humanitarian O 77 | and O 78 | security O 79 | crisis O 80 | . O 81 | 82 | The B-Organisation 83 | international I-Organisation 84 | community I-Organisation 85 | has O 86 | now O 87 | come O 88 | together O 89 | to O 90 | work O 91 | to O 92 | end O 93 | the O 94 | bloody O 95 | civil O 96 | war O 97 | in O 98 | Syria B-Location 99 | and O 100 | has O 101 | paved O 102 | the O 103 | way O 104 | for O 105 | talks O 106 | amongst O 107 | the B-Organisation 108 | Syrian I-Organisation 109 | parties I-Organisation 110 | that O 111 | will O 112 | see O 113 | a O 114 | transition O 115 | away O 116 | from O 117 | the O 118 | murderous O 119 | regime B-Organisation 120 | of I-Organisation 121 | Assad I-Organisation 122 | . O 123 | 124 | This O 125 | resolution O 126 | gives O 127 | us B-Organisation 128 | a O 129 | timetable O 130 | and O 131 | a O 132 | clear O 133 | way O 134 | forward O 135 | . O 136 | 137 | Of O 138 | course O 139 | there O 140 | are O 141 | many O 142 | challenges O 143 | ahead O 144 | but O 145 | the O 146 | world O 147 | has O 148 | taken O 149 | a O 150 | great O 151 | step O 152 | forward O 153 | to O 154 | resolving O 155 | the O 156 | Syrian B-Nationality 157 | crisis O 158 | . 
O 159 | 160 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/C66A9E77E1142C3F0909657DDDD475A5.conll: -------------------------------------------------------------------------------- 1 | Tobias B-Person 2 | Ellwood I-Person 3 | , O 4 | Minister B-Person 5 | for I-Person 6 | the I-Person 7 | Middle I-Person 8 | East I-Person 9 | welcomes O 10 | the O 11 | liberation O 12 | of O 13 | Sinjar B-Location 14 | and O 15 | reiterates O 16 | support O 17 | for O 18 | the B-Organisation 19 | Government I-Organisation 20 | of I-Organisation 21 | Iraq I-Organisation 22 | in O 23 | the O 24 | fight O 25 | against O 26 | ISIL B-Organisation 27 | 28 | Minister B-Person 29 | Ellwood I-Person 30 | said O 31 | : O 32 | 33 | This O 34 | is O 35 | a O 36 | significant O 37 | day O 38 | ; O 39 | Sinjar B-Location 40 | has O 41 | been O 42 | liberated O 43 | after O 44 | its O 45 | violent O 46 | and O 47 | unwanted O 48 | occupation O 49 | by O 50 | the B-Organisation 51 | terrorist I-Organisation 52 | group I-Organisation 53 | ISIL I-Organisation 54 | . O 55 | 56 | I B-Person 57 | commend O 58 | Kurdish B-Organisation 59 | and I-Organisation 60 | Yezidi I-Organisation 61 | fighters I-Organisation 62 | for O 63 | their O 64 | determined O 65 | and O 66 | sustained O 67 | efforts O 68 | to O 69 | reclaim O 70 | the B-Location 71 | town I-Location 72 | . O 73 | 74 | For O 75 | ISIL B-Organisation 76 | , O 77 | this O 78 | means O 79 | yet O 80 | another O 81 | military O 82 | defeat O 83 | in O 84 | Iraq B-Location 85 | . O 86 | 87 | They B-Organisation 88 | have O 89 | now O 90 | lost O 91 | more O 92 | than O 93 | 30 B-Quantity 94 | percent I-Quantity 95 | of O 96 | the B-Location 97 | territory I-Location 98 | they B-Organisation 99 | once O 100 | held O 101 | there O 102 | . 
O 103 | 104 | The B-Organisation 105 | Coalition I-Organisation 106 | ’ O 107 | s O 108 | strategy O 109 | to O 110 | defeat O 111 | ISIL B-Organisation 112 | is O 113 | succeeding O 114 | . O 115 | 116 | Sinjar B-Location 117 | is O 118 | free O 119 | because O 120 | Royal B-Organisation 121 | Air I-Organisation 122 | Force I-Organisation 123 | and O 124 | other B-Organisation 125 | Coalition I-Organisation 126 | forces I-Organisation 127 | provided O 128 | vital O 129 | air O 130 | support O 131 | for O 132 | local B-Organisation 133 | , I-Organisation 134 | legitimate I-Organisation 135 | ground I-Organisation 136 | forces I-Organisation 137 | , O 138 | focused O 139 | on O 140 | defeating O 141 | ISIL B-Organisation 142 | and O 143 | minimising O 144 | civilian O 145 | casualties O 146 | . O 147 | 148 | The O 149 | UK O 150 | remains O 151 | steadfast O 152 | in O 153 | its O 154 | support O 155 | of O 156 | the B-Organisation 157 | Government I-Organisation 158 | of I-Organisation 159 | Iraq I-Organisation 160 | and O 161 | the B-Organisation 162 | Iraqi I-Organisation 163 | people I-Organisation 164 | in O 165 | their O 166 | fight O 167 | against O 168 | ISIL B-Organisation 169 | . O 170 | 171 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/CBD73D24324D3C3A3A76AC430E05A069.conll: -------------------------------------------------------------------------------- 1 | Foreign B-Person 2 | Secretary I-Person 3 | welcomes O 4 | new O 5 | Syria B-Organisation 6 | sanctions O 7 | listings O 8 | that O 9 | apply O 10 | further O 11 | pressure O 12 | on O 13 | Assad B-Organisation 14 | ’ I-Organisation 15 | s I-Organisation 16 | brutal I-Organisation 17 | regime I-Organisation 18 | . 
O 19 | 20 | Philip B-Person 21 | Hammond I-Person 22 | said O 23 | : O 24 | The O 25 | UK O 26 | has O 27 | worked O 28 | tirelessly O 29 | with O 30 | other O 31 | member B-Organisation 32 | states I-Organisation 33 | to O 34 | secure O 35 | 13 B-Quantity 36 | new O 37 | EU B-Organisation 38 | sanctions O 39 | listings O 40 | on O 41 | Syria B-Location 42 | . O 43 | 44 | We B-Organisation 45 | are O 46 | targeting O 47 | developers B-Organisation 48 | , I-Organisation 49 | proliferators I-Organisation 50 | and I-Organisation 51 | users I-Organisation 52 | of O 53 | chemical B-Weapon 54 | weapons I-Weapon 55 | , O 56 | along O 57 | with O 58 | businessmen B-Organisation 59 | and O 60 | companies B-Organisation 61 | supporting O 62 | the O 63 | brutal O 64 | shabiha B-Organisation 65 | militias I-Organisation 66 | . O 67 | 68 | We B-Organisation 69 | have O 70 | also O 71 | agreed O 72 | to O 73 | target O 74 | individuals O 75 | supplying O 76 | oil O 77 | to O 78 | the O 79 | regime O 80 | , O 81 | including O 82 | George B-Person 83 | Haswani I-Person 84 | , O 85 | a O 86 | middleman O 87 | buying O 88 | oil O 89 | from O 90 | ISIL B-Organisation 91 | on O 92 | behalf O 93 | of O 94 | the B-Organisation 95 | regime I-Organisation 96 | . O 97 | 98 | This O 99 | listing O 100 | gives O 101 | yet O 102 | another O 103 | indication O 104 | that O 105 | Assad B-Person 106 | ’ O 107 | s O 108 | “ O 109 | war O 110 | ” O 111 | on O 112 | ISIL B-Organisation 113 | is O 114 | a O 115 | sham O 116 | and O 117 | that O 118 | he B-Person 119 | supports O 120 | them O 121 | financially O 122 | . O 123 | 124 | These O 125 | sanctions O 126 | show O 127 | that O 128 | EU B-Organisation 129 | is O 130 | united O 131 | in O 132 | its O 133 | condemnation O 134 | of O 135 | Assad B-Person 136 | ’ O 137 | s O 138 | brutal O 139 | policies O 140 | . 
O 141 | 142 | We B-Organisation 143 | will O 144 | continue O 145 | applying O 146 | pressure O 147 | to O 148 | the B-Organisation 149 | regime I-Organisation 150 | until O 151 | it O 152 | reassesses O 153 | its O 154 | position O 155 | , O 156 | ends O 157 | the O 158 | violence O 159 | and O 160 | engages O 161 | in O 162 | meaningful O 163 | negotiations O 164 | with O 165 | the O 166 | moderate O 167 | opposition O 168 | . O 169 | 170 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/UK_Government/D23AD6D80730BEF27D23CDD2287AC200.conll: -------------------------------------------------------------------------------- 1 | Minister B-Person 2 | for I-Person 3 | the I-Person 4 | Middle I-Person 5 | East I-Person 6 | Tobias I-Person 7 | Ellwood I-Person 8 | comments O 9 | on O 10 | the O 11 | start O 12 | of O 13 | the B-Organisation 14 | Iraqi I-Organisation 15 | mission I-Organisation 16 | to O 17 | recapture O 18 | the B-Location 19 | city I-Location 20 | of I-Location 21 | Mosul I-Location 22 | in O 23 | Iraq B-Location 24 | . O 25 | 26 | Minister B-Person 27 | Ellwood I-Person 28 | said O 29 | : O 30 | 31 | The O 32 | start O 33 | of O 34 | Iraqi B-Nationality 35 | operations O 36 | to O 37 | retake O 38 | the B-Location 39 | city I-Location 40 | of I-Location 41 | Mosul I-Location 42 | marks O 43 | another O 44 | step O 45 | forward O 46 | towards O 47 | clearing O 48 | Daesh B-Organisation 49 | from O 50 | Iraq B-Location 51 | . O 52 | 53 | After O 54 | 2 B-Temporal 55 | years I-Temporal 56 | of O 57 | brutal O 58 | rule O 59 | , O 60 | the B-Organisation 61 | people I-Organisation 62 | of I-Organisation 63 | Mosul I-Organisation 64 | can O 65 | start O 66 | to O 67 | have O 68 | hope O 69 | for O 70 | a O 71 | better O 72 | future O 73 | . 
O 74 | 75 | This O 76 | will O 77 | be O 78 | the O 79 | greatest O 80 | challenge O 81 | that O 82 | Iraq B-Organisation 83 | ’ I-Organisation 84 | s I-Organisation 85 | Security I-Organisation 86 | Forces I-Organisation 87 | have O 88 | yet O 89 | encountered O 90 | – O 91 | they B-Organisation 92 | are O 93 | up O 94 | to O 95 | that O 96 | challenge O 97 | . O 98 | 99 | The B-Organisation 100 | UK I-Organisation 101 | , O 102 | as O 103 | part O 104 | of O 105 | the B-Organisation 106 | Global I-Organisation 107 | Coalition I-Organisation 108 | , O 109 | is O 110 | committed O 111 | to O 112 | continuing O 113 | to O 114 | provide O 115 | the B-Organisation 116 | government I-Organisation 117 | of I-Organisation 118 | Iraq I-Organisation 119 | with O 120 | military O 121 | , O 122 | humanitarian O 123 | and O 124 | stabilisation O 125 | support O 126 | . O 127 | 128 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/108A8CFC05CC3139A6E65B7A4239F5C6.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | United I-Organisation 3 | States I-Organisation 4 | strongly O 5 | condemns O 6 | the O 7 | barbaric O 8 | terrorist O 9 | attacks O 10 | in O 11 | Iraq B-Location 12 | today B-Temporal 13 | , O 14 | including O 15 | one O 16 | in O 17 | Baghdad B-Location 18 | claimed O 19 | by O 20 | Daesh B-Organisation 21 | that O 22 | specifically O 23 | targeted O 24 | mourners B-Organisation 25 | inside O 26 | a B-Location 27 | funeral I-Location 28 | tent I-Location 29 | . O 30 | 31 | We B-Organisation 32 | extend O 33 | our O 34 | deepest O 35 | condolences O 36 | to O 37 | the B-Organisation 38 | victims I-Organisation 39 | ’ I-Organisation 40 | families I-Organisation 41 | and I-Organisation 42 | friends I-Organisation 43 | . 
O 44 | 45 | This O 46 | attack O 47 | is O 48 | yet O 49 | another O 50 | sign O 51 | of O 52 | Daesh B-Organisation 53 | ’ O 54 | s O 55 | cowardice O 56 | and O 57 | contempt O 58 | for O 59 | human O 60 | life O 61 | and O 62 | their O 63 | attempt O 64 | to O 65 | sow O 66 | sectarian O 67 | discord O 68 | among O 69 | the B-Organisation 70 | people I-Organisation 71 | of I-Organisation 72 | Iraq I-Organisation 73 | . O 74 | 75 | It O 76 | only O 77 | underscores O 78 | the O 79 | importance O 80 | of O 81 | Coalition B-Organisation 82 | efforts O 83 | to O 84 | support O 85 | Iraqi B-Organisation 86 | security I-Organisation 87 | forces I-Organisation 88 | in O 89 | their O 90 | campaign O 91 | to O 92 | defeat O 93 | this B-Organisation 94 | terrorist I-Organisation 95 | group I-Organisation 96 | . O 97 | 98 | The B-Organisation 99 | United I-Organisation 100 | States I-Organisation 101 | remains O 102 | committed O 103 | to O 104 | that O 105 | goal O 106 | . O 107 | 108 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/1F9A1D68D16594512C9CBAD02D53E4C1.conll: -------------------------------------------------------------------------------- 1 | We B-Organisation 2 | welcome O 3 | the O 4 | positive O 5 | outcome O 6 | of O 7 | the O 8 | gathering O 9 | of O 10 | the B-Organisation 11 | Syrian I-Organisation 12 | opposition I-Organisation 13 | in O 14 | Riyadh B-Location 15 | today O 16 | , O 17 | including O 18 | reaching O 19 | a O 20 | consensus O 21 | on O 22 | principles O 23 | for O 24 | a O 25 | pluralistic O 26 | and O 27 | democratic O 28 | Syria B-Location 29 | and O 30 | on O 31 | how O 32 | to O 33 | advance O 34 | a O 35 | political O 36 | settlement O 37 | to O 38 | end O 39 | the O 40 | conflict O 41 | in O 42 | Syria B-Location 43 | . 
O 44 | 45 | As O 46 | I B-Person 47 | conveyed O 48 | to O 49 | Foreign B-Person 50 | Minister I-Person 51 | Al I-Person 52 | - I-Person 53 | Jubeir I-Person 54 | today B-Temporal 55 | , O 56 | we B-Organisation 57 | appreciate O 58 | Saudi B-Organisation 59 | Arabia I-Organisation 60 | ’ I-Organisation 61 | s I-Organisation 62 | leadership I-Organisation 63 | in O 64 | convening O 65 | this O 66 | broad O 67 | and O 68 | representative O 69 | group B-Organisation 70 | of I-Organisation 71 | 116 I-Organisation 72 | participants I-Organisation 73 | , O 74 | who O 75 | agreed O 76 | today B-Temporal 77 | on O 78 | the O 79 | structure O 80 | of O 81 | their O 82 | negotiating O 83 | body O 84 | to O 85 | represent O 86 | them O 87 | in O 88 | the O 89 | political O 90 | process O 91 | . O 92 | 93 | We B-Organisation 94 | appreciate O 95 | that O 96 | this O 97 | extremely O 98 | diverse O 99 | group B-Organisation 100 | of I-Organisation 101 | Syrians I-Organisation 102 | put O 103 | aside O 104 | differences O 105 | in O 106 | the O 107 | interest O 108 | of O 109 | building O 110 | a O 111 | new O 112 | Syria B-Location 113 | . O 114 | 115 | With O 116 | the O 117 | progress O 118 | made O 119 | in O 120 | both O 121 | Vienna B-Location 122 | and O 123 | now O 124 | in O 125 | Riyadh B-Location 126 | , O 127 | the B-Organisation 128 | International I-Organisation 129 | Syria I-Organisation 130 | Support I-Organisation 131 | Group I-Organisation 132 | continues O 133 | to O 134 | build O 135 | a O 136 | foundation O 137 | for O 138 | constructive O 139 | negotiations O 140 | in O 141 | January B-Temporal 142 | under O 143 | UN B-Organisation 144 | auspices O 145 | , O 146 | regarding O 147 | a O 148 | political O 149 | transition O 150 | in O 151 | accordance O 152 | with O 153 | the B-DocumentReference 154 | Geneva I-DocumentReference 155 | Communique I-DocumentReference 156 | of I-DocumentReference 157 | 2012 I-DocumentReference 158 | . 
O 159 | 160 | While O 161 | this O 162 | important O 163 | step O 164 | forward O 165 | brings O 166 | us B-Organisation 167 | closer O 168 | to O 169 | starting O 170 | negotiations O 171 | between O 172 | the B-Organisation 173 | Syrian I-Organisation 174 | parties I-Organisation 175 | , O 176 | we B-Organisation 177 | recognize O 178 | the O 179 | difficult O 180 | work O 181 | ahead O 182 | , O 183 | and O 184 | remain O 185 | determined O 186 | to O 187 | continue O 188 | toward O 189 | a O 190 | political O 191 | settlement O 192 | that O 193 | brings O 194 | an O 195 | end O 196 | the O 197 | conflict O 198 | . O 199 | 200 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/2C271BB862754AECF35665280FE7B93C.conll: -------------------------------------------------------------------------------- 1 | I B-Person 2 | welcome O 3 | today O 4 | ' O 5 | s O 6 | news O 7 | that O 8 | Syrian B-Organisation 9 | opposition I-Organisation 10 | forces I-Organisation 11 | liberated O 12 | the B-Location 13 | Syrian I-Location 14 | town I-Location 15 | of I-Location 16 | Dabiq I-Location 17 | from O 18 | ISIL B-Organisation 19 | control O 20 | , O 21 | aided O 22 | by O 23 | strong O 24 | support O 25 | from O 26 | our O 27 | ally O 28 | Turkey B-Organisation 29 | and O 30 | our B-Organisation 31 | international I-Organisation 32 | coalition I-Organisation 33 | . O 34 | 35 | This O 36 | is O 37 | more O 38 | than O 39 | just O 40 | the O 41 | latest O 42 | military O 43 | result O 44 | against O 45 | this B-Organisation 46 | barbaric I-Organisation 47 | group I-Organisation 48 | . O 49 | 50 | Dabiq B-Location 51 | held O 52 | symbolic O 53 | importance O 54 | to O 55 | ISIL B-Organisation 56 | . 
O 57 | 58 | The B-Organisation 59 | group I-Organisation 60 | carried O 61 | out O 62 | unspeakable O 63 | atrocities O 64 | in O 65 | Dabiq B-Location 66 | , O 67 | named O 68 | its B-DocumentReference 69 | English I-DocumentReference 70 | - I-DocumentReference 71 | language I-DocumentReference 72 | magazine I-DocumentReference 73 | after O 74 | the B-Location 75 | town I-Location 76 | and O 77 | claimed O 78 | it O 79 | would O 80 | be O 81 | the O 82 | site O 83 | of O 84 | a O 85 | final O 86 | victory O 87 | for O 88 | the B-Organisation 89 | so I-Organisation 90 | - I-Organisation 91 | called I-Organisation 92 | caliphate I-Organisation 93 | . O 94 | 95 | Instead O 96 | its O 97 | liberation O 98 | gives O 99 | the O 100 | campaign O 101 | to O 102 | deliver O 103 | ISIL B-Organisation 104 | a O 105 | lasting O 106 | defeat O 107 | new O 108 | momentum O 109 | in O 110 | Syria B-Location 111 | . O 112 | 113 | Again O 114 | I B-Person 115 | want O 116 | to O 117 | congratulate O 118 | the B-Organisation 119 | Syrians I-Organisation 120 | who O 121 | fought O 122 | to O 123 | free O 124 | Dabiq B-Location 125 | and O 126 | thank O 127 | our O 128 | ally O 129 | Turkey B-Organisation 130 | for O 131 | the O 132 | close O 133 | coordination O 134 | during O 135 | this O 136 | operation O 137 | . 
O 138 | ‎ O 139 | 140 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/2C6CB502FAD51B6B4CA765E57E221BEA.conll: -------------------------------------------------------------------------------- 1 | Special B-Person 2 | Presidential I-Person 3 | Envoy I-Person 4 | for I-Person 5 | the I-Person 6 | Global I-Person 7 | Coalition I-Person 8 | to I-Person 9 | Counter I-Person 10 | ISIL I-Person 11 | Brett I-Person 12 | McGurk I-Person 13 | will O 14 | lead O 15 | an B-Organisation 16 | interagency I-Organisation 17 | delegation I-Organisation 18 | to O 19 | Berlin B-Location 20 | , I-Location 21 | Germany I-Location 22 | for O 23 | a O 24 | meeting O 25 | on O 26 | November B-Temporal 27 | 17 I-Temporal 28 | with O 29 | key B-Organisation 30 | members I-Organisation 31 | of I-Organisation 32 | the I-Organisation 33 | Counter I-Organisation 34 | ISIL I-Organisation 35 | Coalition I-Organisation 36 | . O 37 | 38 | This O 39 | meeting O 40 | will O 41 | be O 42 | an O 43 | opportunity O 44 | for O 45 | Coalition B-Organisation 46 | partners I-Organisation 47 | to O 48 | discuss O 49 | ongoing O 50 | efforts O 51 | for O 52 | Mosul B-Location 53 | and O 54 | Raqqah B-Location 55 | and O 56 | to O 57 | explore O 58 | ways O 59 | to O 60 | further O 61 | accelerate O 62 | the O 63 | campaign O 64 | against O 65 | Da B-Organisation 66 | ’ I-Organisation 67 | esh I-Organisation 68 | in O 69 | Iraq B-Location 70 | and O 71 | Syria B-Location 72 | and O 73 | its B-Organisation 74 | network I-Organisation 75 | outside O 76 | those O 77 | geographic O 78 | borders O 79 | . 
O 80 | 81 | It O 82 | will O 83 | include O 84 | a O 85 | detailed O 86 | discussion O 87 | on O 88 | the O 89 | priorities O 90 | needed O 91 | to O 92 | sustain O 93 | the O 94 | momentum O 95 | of O 96 | the O 97 | campaign O 98 | along O 99 | our O 100 | multiple O 101 | lines O 102 | of O 103 | effort O 104 | , O 105 | including O 106 | military O 107 | , O 108 | foreign B-Organisation 109 | terrorist I-Organisation 110 | fighters I-Organisation 111 | , O 112 | counterterrorist O 113 | financing O 114 | , O 115 | counter O 116 | - O 117 | messaging O 118 | and O 119 | stabilization O 120 | of O 121 | liberated B-Location 122 | areas I-Location 123 | . O 124 | 125 | The B-Organisation 126 | Coalition I-Organisation 127 | ’ I-Organisation 128 | s I-Organisation 129 | Small I-Organisation 130 | Group I-Organisation 131 | regularly O 132 | meets O 133 | to O 134 | coordinate O 135 | and O 136 | enhance O 137 | combined O 138 | efforts O 139 | to O 140 | counter O 141 | Da B-Organisation 142 | ’ I-Organisation 143 | esh I-Organisation 144 | . O 145 | 146 | The O 147 | last O 148 | meeting O 149 | of O 150 | the B-Organisation 151 | Small I-Organisation 152 | Group I-Organisation 153 | took O 154 | place O 155 | in O 156 | Washington B-Location 157 | , I-Location 158 | D I-Location 159 | . I-Location 160 | C I-Location 161 | . I-Location 162 | on O 163 | July B-Temporal 164 | 21 I-Temporal 165 | , I-Temporal 166 | 2016 I-Temporal 167 | , O 168 | at O 169 | the O 170 | Ministerial O 171 | level O 172 | . 
O 173 | 174 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/31E63458C6BF8F6F1D50D7041E5B89C9.conll: -------------------------------------------------------------------------------- 1 | Recent O 2 | media B-DocumentReference 3 | reports I-DocumentReference 4 | have O 5 | speculated O 6 | on O 7 | the O 8 | role O 9 | that O 10 | international B-Organisation 11 | forces I-Organisation 12 | will O 13 | play O 14 | in O 15 | the O 16 | Iraqi O 17 | operation O 18 | to O 19 | liberate O 20 | Mosul B-Location 21 | . O 22 | 23 | As O 24 | we B-Organisation 25 | have O 26 | repeatedly O 27 | made O 28 | clear O 29 | , O 30 | the B-Organisation 31 | United I-Organisation 32 | States I-Organisation 33 | supports O 34 | Iraqi B-Nationality 35 | unity O 36 | and O 37 | sovereignty O 38 | . O 39 | 40 | To O 41 | that O 42 | end O 43 | , O 44 | we B-Organisation 45 | believe O 46 | all B-Organisation 47 | international I-Organisation 48 | forces I-Organisation 49 | in O 50 | Iraq B-Location 51 | should O 52 | be O 53 | there O 54 | with O 55 | the O 56 | approval O 57 | of O 58 | and O 59 | in O 60 | coordination O 61 | with O 62 | the B-Organisation 63 | Government I-Organisation 64 | of I-Organisation 65 | Iraq I-Organisation 66 | , O 67 | under O 68 | the O 69 | umbrella O 70 | of O 71 | the B-Organisation 72 | Coalition I-Organisation 73 | . 
O 74 | 75 | It O 76 | is O 77 | imperative O 78 | for O 79 | all B-Organisation 80 | parties I-Organisation 81 | to O 82 | coordinate O 83 | closely O 84 | over O 85 | the B-Temporal 86 | coming I-Temporal 87 | days I-Temporal 88 | and I-Temporal 89 | weeks I-Temporal 90 | to O 91 | ensure O 92 | unity O 93 | of O 94 | effort O 95 | in O 96 | defeating O 97 | Daesh B-Organisation 98 | and O 99 | to O 100 | provide O 101 | for O 102 | the O 103 | lasting O 104 | security O 105 | of O 106 | the B-Organisation 107 | Iraqi I-Organisation 108 | people I-Organisation 109 | . O 110 | 111 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/340238F0CEF0581CBBA9106FFB2EA97C.conll: -------------------------------------------------------------------------------- 1 | Ambassador B-Person 2 | Robert I-Person 3 | Ford I-Person 4 | has O 5 | completed O 6 | his O 7 | consultations O 8 | in O 9 | Washington B-Location 10 | and O 11 | is O 12 | returning O 13 | to O 14 | Syria B-Location 15 | . O 16 | 17 | He B-Person 18 | will O 19 | continue O 20 | the O 21 | work O 22 | he B-Person 23 | was O 24 | doing O 25 | previously O 26 | ; O 27 | namely O 28 | , O 29 | delivering O 30 | the B-Organisation 31 | United I-Organisation 32 | States I-Organisation 33 | ’ O 34 | message O 35 | to O 36 | the B-Organisation 37 | people I-Organisation 38 | of I-Organisation 39 | Syria I-Organisation 40 | ; O 41 | providing O 42 | reliable O 43 | reporting O 44 | on O 45 | the O 46 | situation O 47 | on O 48 | the O 49 | ground O 50 | ; O 51 | and O 52 | engaging O 53 | with O 54 | the O 55 | full O 56 | spectrum O 57 | of O 58 | Syrian O 59 | society O 60 | on O 61 | how O 62 | to O 63 | end O 64 | the O 65 | bloodshed O 66 | and O 67 | achieve O 68 | a O 69 | peaceful O 70 | political O 71 | transition O 72 | . 
O 73 | 74 | We B-Organisation 75 | believe O 76 | his B-Person 77 | presence O 78 | in O 79 | the B-Location 80 | country I-Location 81 | is O 82 | among O 83 | the O 84 | most O 85 | effective O 86 | ways O 87 | to O 88 | send O 89 | the O 90 | message O 91 | that O 92 | the B-Organisation 93 | United I-Organisation 94 | States I-Organisation 95 | stands O 96 | with O 97 | the B-Organisation 98 | people I-Organisation 99 | of I-Organisation 100 | Syria I-Organisation 101 | . O 102 | 103 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/4174EF50EF8D9024BCC261D602202738.conll: -------------------------------------------------------------------------------- 1 | Attribution O 2 | is O 3 | on O 4 | - O 5 | the O 6 | - O 7 | record O 8 | from O 9 | Spokesperson B-Person 10 | John I-Person 11 | Kirby I-Person 12 | . O 13 | 14 | Question O 15 | : O 16 | Will O 17 | the B-Organisation 18 | United I-Organisation 19 | States I-Organisation 20 | participate O 21 | in O 22 | the O 23 | Paris B-Location 24 | meeting O 25 | on O 26 | Syria B-Location 27 | on O 28 | October B-Temporal 29 | 27 I-Temporal 30 | , O 31 | who O 32 | will O 33 | attend O 34 | / O 35 | at O 36 | what O 37 | level O 38 | , O 39 | what O 40 | other O 41 | details O 42 | can O 43 | we O 44 | provide O 45 | , O 46 | etc O 47 | . O 48 | ? O 49 | 50 | Answer O 51 | : O 52 | Deputy B-Person 53 | Secretary I-Person 54 | of I-Person 55 | State I-Person 56 | Tony I-Person 57 | Blinken I-Person 58 | will O 59 | travel O 60 | to O 61 | Paris B-Location 62 | for O 63 | tomorrow B-Temporal 64 | ’ O 65 | s O 66 | multilateral O 67 | meeting O 68 | , O 69 | convened O 70 | by O 71 | the B-Organisation 72 | French I-Organisation 73 | government I-Organisation 74 | , O 75 | to O 76 | discuss O 77 | efforts O 78 | to O 79 | resolve O 80 | the O 81 | crisis O 82 | in O 83 | Syria B-Location 84 | . 
O 85 | 86 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/487C0B93A959D64A34CA2F40EE82C18B.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | United I-Organisation 3 | States I-Organisation 4 | condemns O 5 | in O 6 | the O 7 | strongest O 8 | terms O 9 | both O 10 | the O 11 | ISIL B-Organisation 12 | attack O 13 | this O 14 | morning O 15 | at O 16 | a O 17 | crowded O 18 | market B-Location 19 | place I-Location 20 | in O 21 | Baghdad B-Location 22 | , O 23 | and O 24 | the O 25 | car O 26 | bomb O 27 | attacks O 28 | in O 29 | Diyala B-Location 30 | Province I-Location 31 | on O 32 | Monday B-Temporal 33 | , O 34 | as O 35 | well O 36 | as O 37 | other O 38 | recent O 39 | terrorist O 40 | attacks O 41 | against O 42 | the O 43 | Iraqi B-Nationality 44 | people B-Organisation 45 | . O 46 | 47 | We B-Organisation 48 | express O 49 | our O 50 | deep O 51 | condolences O 52 | to O 53 | family B-Organisation 54 | and I-Organisation 55 | friends I-Organisation 56 | of I-Organisation 57 | the I-Organisation 58 | victims I-Organisation 59 | . O 60 | 61 | These O 62 | atrocities O 63 | show O 64 | once O 65 | again O 66 | the O 67 | utter O 68 | disregard O 69 | ISIL B-Organisation 70 | has O 71 | for O 72 | innocent O 73 | civilians O 74 | , O 75 | including O 76 | women O 77 | and O 78 | children O 79 | . O 80 | 81 | As O 82 | Iraqis O 83 | unite O 84 | against O 85 | ISIL B-Organisation 86 | and O 87 | turn O 88 | the O 89 | tide O 90 | on O 91 | the O 92 | battlefield O 93 | , O 94 | ISIL B-Organisation 95 | will O 96 | try O 97 | to O 98 | maintain O 99 | its O 100 | campaign O 101 | of O 102 | terror O 103 | to O 104 | sow O 105 | discord O 106 | among O 107 | the B-Organisation 108 | Iraqi I-Organisation 109 | people I-Organisation 110 | . 
O 111 | 112 | The O 113 | United O 114 | States O 115 | continues O 116 | to O 117 | stand O 118 | shoulder O 119 | to O 120 | shoulder O 121 | with O 122 | the B-Organisation 123 | Iraqi I-Organisation 124 | people I-Organisation 125 | as O 126 | they O 127 | confront O 128 | ISIL B-Organisation 129 | and O 130 | the O 131 | violence O 132 | it O 133 | represents O 134 | . O 135 | 136 | We B-Organisation 137 | remain O 138 | committed O 139 | to O 140 | working O 141 | with O 142 | Prime B-Person 143 | Minister I-Person 144 | Al I-Person 145 | - I-Person 146 | Abadi I-Person 147 | , O 148 | the B-Organisation 149 | Iraqi I-Organisation 150 | Security I-Organisation 151 | Forces I-Organisation 152 | , O 153 | and O 154 | our B-Organisation 155 | partners I-Organisation 156 | in O 157 | the B-Organisation 158 | international I-Organisation 159 | community I-Organisation 160 | to O 161 | support O 162 | the B-Organisation 163 | Government I-Organisation 164 | of I-Organisation 165 | Iraq I-Organisation 166 | in O 167 | defeating O 168 | ISIL B-Organisation 169 | and O 170 | holding O 171 | this B-Organisation 172 | terrorist I-Organisation 173 | organization I-Organisation 174 | accountable O 175 | for O 176 | the O 177 | atrocities O 178 | it O 179 | has O 180 | committed O 181 | . 
O 182 | 183 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/6E62DC65EFDFB1B357175EB255F25C36.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | United I-Organisation 3 | States I-Organisation 4 | strongly O 5 | condemns O 6 | ISIL B-Organisation 7 | ’ O 8 | s O 9 | multiple O 10 | suicide B-Weapon 11 | attacks I-Weapon 12 | in O 13 | the B-Location 14 | town I-Location 15 | of I-Location 16 | Tel I-Location 17 | Tamer I-Location 18 | in O 19 | Hasaka B-Location 20 | Province I-Location 21 | , O 22 | which O 23 | targeted O 24 | and O 25 | killed O 26 | scores O 27 | of O 28 | Syrian B-Organisation 29 | civilians I-Organisation 30 | , O 31 | including O 32 | at O 33 | a B-Location 34 | hospital I-Location 35 | and O 36 | marketplace B-Location 37 | . O 38 | 39 | We B-Organisation 40 | offer O 41 | our O 42 | deepest O 43 | condolences O 44 | to O 45 | the B-Organisation 46 | families I-Organisation 47 | of I-Organisation 48 | the I-Organisation 49 | victims I-Organisation 50 | of O 51 | this O 52 | barbaric O 53 | act O 54 | of O 55 | mass O 56 | murder O 57 | . O 58 | 59 | This O 60 | attack O 61 | once O 62 | again O 63 | displays O 64 | the O 65 | type O 66 | of O 67 | horrific O 68 | atrocities O 69 | that O 70 | ISIL B-Organisation 71 | has O 72 | perpetrated O 73 | against O 74 | tens B-Quantity 75 | of I-Quantity 76 | thousands I-Quantity 77 | of O 78 | innocent B-Organisation 79 | people I-Organisation 80 | across O 81 | Syria B-Location 82 | and O 83 | Iraq B-Location 84 | and O 85 | affirms O 86 | the O 87 | need O 88 | for O 89 | international O 90 | resolve O 91 | to O 92 | strengthen O 93 | those O 94 | on O 95 | the O 96 | ground O 97 | fighting O 98 | ISIL B-Organisation 99 | . 
O 100 | 101 | The B-Organisation 102 | United I-Organisation 103 | States I-Organisation 104 | remains O 105 | committed O 106 | to O 107 | working O 108 | with O 109 | our B-Organisation 110 | partners I-Organisation 111 | in O 112 | Syria B-Location 113 | , O 114 | Iraq B-Location 115 | , O 116 | and O 117 | the B-Organisation 118 | international I-Organisation 119 | community I-Organisation 120 | to O 121 | bring O 122 | an O 123 | end O 124 | to O 125 | ISIL B-Organisation 126 | ’ O 127 | s O 128 | depravity O 129 | and O 130 | ultimately O 131 | destroy O 132 | this B-Organisation 133 | terrorist I-Organisation 134 | organization I-Organisation 135 | . O 136 | 137 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/9104BDCC8E9A1CD2090757B487421DC6.conll: -------------------------------------------------------------------------------- 1 | Knox B-Person 2 | Thames I-Person 3 | , O 4 | the B-Person 5 | Special I-Person 6 | Advisor I-Person 7 | to I-Person 8 | Ambassador I-Person 9 | Saperstein I-Person 10 | for I-Person 11 | Religious I-Person 12 | Minorities I-Person 13 | in I-Person 14 | the I-Person 15 | Near I-Person 16 | East I-Person 17 | and I-Person 18 | South I-Person 19 | and I-Person 20 | Central I-Person 21 | Asia I-Person 22 | , O 23 | will O 24 | travel O 25 | to O 26 | Turkey B-Location 27 | November B-Temporal 28 | 16 I-Temporal 29 | - I-Temporal 30 | 17 I-Temporal 31 | and O 32 | to O 33 | Lebanon B-Location 34 | November B-Temporal 35 | 18 I-Temporal 36 | - I-Temporal 37 | 19 I-Temporal 38 | . 
O 39 | 40 | In O 41 | Turkey B-Location 42 | , O 43 | Special B-Person 44 | Advisor I-Person 45 | Thames I-Person 46 | will O 47 | deliver O 48 | remarks O 49 | on O 50 | the B-Organisation 51 | Department I-Organisation 52 | ’ O 53 | s O 54 | work O 55 | to O 56 | support O 57 | religious B-Organisation 58 | minorities I-Organisation 59 | at O 60 | the B-Organisation 61 | G I-Organisation 62 | - I-Organisation 63 | 20 I-Organisation 64 | Interfaith I-Organisation 65 | Conference I-Organisation 66 | . O 67 | 68 | In O 69 | Lebanon B-Location 70 | , O 71 | Special B-Person 72 | Advisor I-Person 73 | Thames I-Person 74 | will O 75 | meet O 76 | with O 77 | religious B-Organisation 78 | and I-Organisation 79 | civic I-Organisation 80 | leaders I-Organisation 81 | to O 82 | discuss O 83 | interfaith O 84 | relations O 85 | within O 86 | the B-Organisation 87 | Lebanese I-Organisation 88 | religious I-Organisation 89 | communities I-Organisation 90 | and O 91 | to O 92 | address O 93 | the O 94 | plight O 95 | of O 96 | vulnerable O 97 | religious B-Organisation 98 | minority I-Organisation 99 | refugee I-Organisation 100 | populations I-Organisation 101 | coming O 102 | to O 103 | Lebanon B-Location 104 | from O 105 | Syria B-Location 106 | and O 107 | Iraq B-Location 108 | . O 109 | 110 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/9DFBD0B607245D1B56B90FD64E9AB952.conll: -------------------------------------------------------------------------------- 1 | We B-Organisation 2 | condemn O 3 | in O 4 | the O 5 | strongest O 6 | possible O 7 | terms O 8 | terrorist O 9 | attacks O 10 | this B-Temporal 11 | weekend I-Temporal 12 | in O 13 | Baghdad B-Location 14 | and O 15 | Najaf B-Location 16 | . 
O 17 | 18 | On O 19 | Saturday B-Temporal 20 | , O 21 | Dae B-Organisation 22 | ’ I-Organisation 23 | sh I-Organisation 24 | claimed O 25 | dual O 26 | attacks O 27 | in O 28 | the O 29 | commercial O 30 | area O 31 | of O 32 | the O 33 | Al B-Location 34 | - I-Location 35 | Sinek I-Location 36 | neighborhood I-Location 37 | in O 38 | Baghdad B-Location 39 | which O 40 | killed O 41 | at O 42 | least O 43 | 28 B-Quantity 44 | people B-Organisation 45 | . O 46 | 47 | Today B-Temporal 48 | , O 49 | multiple O 50 | attacks O 51 | , O 52 | also O 53 | claimed O 54 | by O 55 | Da B-Organisation 56 | ’ I-Organisation 57 | esh I-Organisation 58 | , O 59 | have O 60 | killed O 61 | at B-Organisation 62 | least I-Organisation 63 | 35 I-Organisation 64 | more I-Organisation 65 | innocent I-Organisation 66 | civilians I-Organisation 67 | and O 68 | injured O 69 | scores B-Organisation 70 | of I-Organisation 71 | others I-Organisation 72 | . O 73 | 74 | We B-Organisation 75 | extend O 76 | our O 77 | deepest O 78 | condolences O 79 | to O 80 | the B-Organisation 81 | families I-Organisation 82 | of I-Organisation 83 | the I-Organisation 84 | victims I-Organisation 85 | and O 86 | hope O 87 | for O 88 | a O 89 | speedy O 90 | recovery O 91 | of O 92 | all B-Organisation 93 | those I-Organisation 94 | wounded I-Organisation 95 | . O 96 | 97 | These O 98 | vicious O 99 | acts O 100 | of O 101 | mass O 102 | murder O 103 | are O 104 | a O 105 | sobering O 106 | reminder O 107 | of O 108 | the O 109 | need O 110 | to O 111 | continue O 112 | coalition B-Organisation 113 | operations O 114 | against O 115 | Da B-Organisation 116 | ’ I-Organisation 117 | esh I-Organisation 118 | and O 119 | to O 120 | eliminate O 121 | the O 122 | threat O 123 | this B-Organisation 124 | terrorist I-Organisation 125 | group I-Organisation 126 | poses O 127 | . O 128 | 129 | The B-Organisation 130 | U.S. 
I-Organisation 131 | reaffirms O 132 | its O 133 | commitment O 134 | to O 135 | support O 136 | the B-Organisation 137 | government I-Organisation 138 | and O 139 | people B-Organisation 140 | of I-Organisation 141 | Iraq I-Organisation 142 | in O 143 | this O 144 | struggle O 145 | , O 146 | and O 147 | remains O 148 | steadfast O 149 | in O 150 | its O 151 | commitment O 152 | to O 153 | the O 154 | Global B-Organisation 155 | Coalition I-Organisation 156 | efforts O 157 | to O 158 | degrade O 159 | and O 160 | defeat O 161 | Da B-Organisation 162 | ’ I-Organisation 163 | esh I-Organisation 164 | . O 165 | 166 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/C02FDFE461425ED8B64B982D05BD2D62.conll: -------------------------------------------------------------------------------- 1 | QUESTION O 2 | : O 3 | There O 4 | are O 5 | reports O 6 | of O 7 | U.S. B-Organisation 8 | , I-Organisation 9 | German I-Organisation 10 | , I-Organisation 11 | and I-Organisation 12 | Canadian I-Organisation 13 | companies I-Organisation 14 | hosting O 15 | Syrian B-Organisation 16 | Government I-Organisation 17 | websites O 18 | . O 19 | 20 | Does O 21 | hosting O 22 | a O 23 | Syrian B-Organisation 24 | Government I-Organisation 25 | website O 26 | constitute O 27 | a O 28 | sanctions O 29 | violation O 30 | ? O 31 | 32 | What O 33 | action O 34 | has O 35 | the B-Organisation 36 | State I-Organisation 37 | Department I-Organisation 38 | taken O 39 | to O 40 | investigate O 41 | these O 42 | reports O 43 | ? O 44 | 45 | ANSWER O 46 | : O 47 | U.S. 
B-Organisation 48 | companies I-Organisation 49 | are O 50 | prohibited O 51 | from O 52 | providing O 53 | web O 54 | - O 55 | hosting O 56 | or O 57 | any O 58 | other O 59 | service O 60 | to O 61 | the B-Organisation 62 | Syrian I-Organisation 63 | government I-Organisation 64 | under O 65 | Executive B-DocumentReference 66 | Order I-DocumentReference 67 | 13582 I-DocumentReference 68 | without O 69 | a O 70 | license O 71 | from O 72 | the B-Organisation 73 | Department I-Organisation 74 | of I-Organisation 75 | the I-Organisation 76 | Treasury I-Organisation 77 | . O 78 | 79 | We B-Organisation 80 | refer O 81 | questions O 82 | about O 83 | pending O 84 | or O 85 | ongoing O 86 | licensure O 87 | investigations O 88 | to O 89 | the B-Organisation 90 | Department I-Organisation 91 | of I-Organisation 92 | Treasury I-Organisation 93 | ’ I-Organisation 94 | s I-Organisation 95 | Office I-Organisation 96 | of I-Organisation 97 | Foreign I-Organisation 98 | Assets I-Organisation 99 | Control I-Organisation 100 | . O 101 | 102 | The B-Organisation 103 | U.S. I-Organisation 104 | government I-Organisation 105 | takes O 106 | all O 107 | reports O 108 | of O 109 | potential O 110 | sanctions O 111 | violations O 112 | seriously O 113 | . O 114 | 115 | Our B-Organisation 116 | policies O 117 | are O 118 | designed O 119 | to O 120 | assist O 121 | ordinary B-Organisation 122 | citizens I-Organisation 123 | who O 124 | are O 125 | exercising O 126 | their O 127 | fundamental O 128 | freedoms O 129 | of O 130 | expression O 131 | , O 132 | assembly O 133 | and O 134 | association O 135 | , O 136 | while O 137 | preventing O 138 | exports O 139 | of O 140 | goods O 141 | and O 142 | services O 143 | that O 144 | repressive B-Organisation 145 | regimes I-Organisation 146 | can O 147 | use O 148 | against O 149 | their B-Organisation 150 | people I-Organisation 151 | . 
O 152 | 153 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/F669C19A4C3DB410BBEC57ECC5DC62B1.conll: -------------------------------------------------------------------------------- 1 | Deputy B-Person 2 | Secretary I-Person 3 | of I-Person 4 | State I-Person 5 | Antony I-Person 6 | Blinken I-Person 7 | will O 8 | travel O 9 | to O 10 | Iraq B-Location 11 | to O 12 | discuss O 13 | key O 14 | political O 15 | , O 16 | economic O 17 | , O 18 | and O 19 | security O 20 | issues O 21 | with O 22 | government B-Organisation 23 | officials I-Organisation 24 | . O 25 | 26 | In O 27 | Baghdad B-Location 28 | on O 29 | November B-Temporal 30 | 22 I-Temporal 31 | , O 32 | Deputy B-Person 33 | Secretary I-Person 34 | Blinken I-Person 35 | will O 36 | meet O 37 | with O 38 | national B-Organisation 39 | and I-Organisation 40 | provincial I-Organisation 41 | Iraqi I-Organisation 42 | officials I-Organisation 43 | to O 44 | discuss O 45 | a O 46 | range O 47 | of O 48 | economic O 49 | and O 50 | security O 51 | issues O 52 | . O 53 | 54 | He B-Person 55 | will O 56 | also O 57 | meet O 58 | with O 59 | senior B-Organisation 60 | U.S. I-Organisation 61 | military I-Organisation 62 | officials I-Organisation 63 | to O 64 | discuss O 65 | our O 66 | ongoing O 67 | support O 68 | for O 69 | the B-Organisation 70 | Iraqi I-Organisation 71 | Armed I-Organisation 72 | Forces I-Organisation 73 | , O 74 | Popular B-Organisation 75 | Mobilization I-Organisation 76 | Forces I-Organisation 77 | , O 78 | and O 79 | others B-Organisation 80 | in O 81 | support O 82 | of O 83 | the B-Organisation 84 | coalition I-Organisation 85 | campaign O 86 | against O 87 | Daesh B-Organisation 88 | . 
O 89 | 90 | Deputy B-Person 91 | Secretary I-Person 92 | Blinken I-Person 93 | will O 94 | then O 95 | visit O 96 | the B-Location 97 | Iraqi I-Location 98 | Kurdistan I-Location 99 | Region I-Location 100 | on O 101 | November B-Temporal 102 | 23 I-Temporal 103 | , O 104 | where O 105 | he B-Person 106 | will O 107 | meet O 108 | with O 109 | senior B-Organisation 110 | regional I-Organisation 111 | government I-Organisation 112 | officials I-Organisation 113 | to O 114 | continue O 115 | discussions O 116 | on O 117 | our O 118 | efforts O 119 | to O 120 | degrade O 121 | and O 122 | defeat O 123 | Daesh B-Organisation 124 | as O 125 | well O 126 | as O 127 | our O 128 | response O 129 | to O 130 | the O 131 | urgent O 132 | humanitarian O 133 | and O 134 | refugee O 135 | crises O 136 | in O 137 | Iraq B-Location 138 | . O 139 | 140 | Be O 141 | sure O 142 | to O 143 | follow O 144 | the B-Person 145 | Deputy I-Person 146 | Secretary I-Person 147 | on O 148 | Twitter O 149 | ( O 150 | @ B-Person 151 | ABlinken I-Person 152 | ) O 153 | and O 154 | on O 155 | the O 156 | State B-Organisation 157 | Department I-Organisation 158 | 's O 159 | Flickr O 160 | and O 161 | Instagram O 162 | for O 163 | more O 164 | updates O 165 | throughout O 166 | his O 167 | trip O 168 | . O 169 | 170 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/US_State_Department/FAB032C0995F606F54B1DD3EA8D61930.conll: -------------------------------------------------------------------------------- 1 | QUESTION O 2 | : O 3 | Secretary B-Person 4 | Kerry I-Person 5 | , O 6 | did O 7 | you O 8 | discuss O 9 | Aleppo B-Location 10 | ? O 11 | 12 | Were O 13 | there O 14 | any O 15 | agreements O 16 | ? 
O 17 | 18 | SECRETARY B-Person 19 | KERRY I-Person 20 | : O 21 | We B-Organisation 22 | obviously O 23 | talked O 24 | about O 25 | the O 26 | extraordinarily O 27 | dire O 28 | situation O 29 | in O 30 | Aleppo B-Location 31 | and O 32 | we B-Organisation 33 | exchanged O 34 | some O 35 | ideas O 36 | about O 37 | it O 38 | , O 39 | and O 40 | we B-Organisation 41 | intend O 42 | to O 43 | connect O 44 | in O 45 | the B-Temporal 46 | morning I-Temporal 47 | to O 48 | see O 49 | where O 50 | we B-Organisation 51 | are O 52 | on O 53 | it O 54 | . O 55 | 56 | MR B-Person 57 | KIRBY I-Person 58 | : O 59 | Thanks O 60 | , O 61 | everybody O 62 | . O 63 | 64 | Thank O 65 | you O 66 | . O 67 | 68 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Wikipedia/0388FEF6427DE0223688EB04A8DA71E1.conll: -------------------------------------------------------------------------------- 1 | Hibhib B-Location 2 | ( O 3 | Arabic O 4 | : O 5 | ن O 6 | ا O 7 | ح O 8 | ي O 9 | ة O 10 | ه O 11 | ب O 12 | ه O 13 | ب O 14 | ‎ O 15 | ‎ O 16 | , O 17 | Hibhib B-Location 18 | Village I-Location 19 | ) O 20 | is O 21 | a O 22 | village O 23 | in O 24 | northern B-Location 25 | Iraq I-Location 26 | , O 27 | located O 28 | 8 B-Quantity 29 | km I-Quantity 30 | ( O 31 | 5 B-Quantity 32 | . I-Quantity 33 | 0 I-Quantity 34 | mi I-Quantity 35 | ) O 36 | northwest O 37 | of O 38 | Baquba B-Location 39 | . O 40 | 41 | It O 42 | is O 43 | predominately O 44 | Shia B-Nationality 45 | Arab I-Nationality 46 | . 
O 47 | [ O 48 | 1 O 49 | ] O 50 | 51 | The B-Person 52 | former I-Person 53 | leader I-Person 54 | of I-Person 55 | Al I-Person 56 | Qaeda I-Person 57 | in I-Person 58 | Iraq I-Person 59 | , O 60 | Jordanian O 61 | - O 62 | born O 63 | militant O 64 | Abu B-Person 65 | Musab I-Person 66 | al I-Person 67 | - I-Person 68 | Zarqawi I-Person 69 | , O 70 | maintained O 71 | a B-Location 72 | safehouse I-Location 73 | in O 74 | the B-Location 75 | village I-Location 76 | . O 77 | 78 | In O 79 | 2006 B-Temporal 80 | , O 81 | while O 82 | meeting O 83 | in O 84 | the B-Location 85 | safehouse I-Location 86 | with O 87 | his B-Person 88 | spiritual I-Person 89 | adviser I-Person 90 | , O 91 | he B-Person 92 | was O 93 | bombed O 94 | and O 95 | killed O 96 | by O 97 | U.S. B-MilitaryPlatform 98 | - I-MilitaryPlatform 99 | led I-MilitaryPlatform 100 | coalition I-MilitaryPlatform 101 | aircraft I-MilitaryPlatform 102 | . O 103 | [ O 104 | 2 O 105 | ] O 106 | 107 | Coordinates O 108 | : O 109 | 33 B-Location 110 | ° I-Location 111 | 47 I-Location 112 | ′ I-Location 113 | N I-Location 114 | 44 I-Location 115 | ° I-Location 116 | 30 I-Location 117 | ′ I-Location 118 | E I-Location 119 | / O 120 | 33 B-Location 121 | . I-Location 122 | 783 I-Location 123 | ° I-Location 124 | N I-Location 125 | 44 I-Location 126 | . I-Location 127 | 500 I-Location 128 | ° I-Location 129 | E I-Location 130 | / O 131 | 33 B-Location 132 | . I-Location 133 | 783 I-Location 134 | ; I-Location 135 | 44 I-Location 136 | . I-Location 137 | 500 I-Location 138 | 139 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Wikipedia/22BCFA25729AF03473435DAAAE67F5B8.conll: -------------------------------------------------------------------------------- 1 | The B-Organisation 2 | Syrian I-Organisation 3 | Computer I-Organisation 4 | Society I-Organisation 5 | is O 6 | an O 7 | organization O 8 | in O 9 | Syria B-Location 10 | . 
O 11 | 12 | It B-Organisation 13 | was O 14 | founded O 15 | by O 16 | Bassel B-Person 17 | al I-Person 18 | - I-Person 19 | Assad I-Person 20 | in O 21 | 1989 B-Temporal 22 | , O 23 | and O 24 | was O 25 | subsequently O 26 | headed O 27 | by O 28 | his O 29 | brother O 30 | Bashar B-Person 31 | al I-Person 32 | - I-Person 33 | Assad I-Person 34 | , O 35 | [ O 36 | 1 O 37 | ] O 38 | who O 39 | would O 40 | later O 41 | become O 42 | the B-Person 43 | President I-Person 44 | of I-Person 45 | Syria I-Person 46 | . O 47 | 48 | It B-Organisation 49 | acts O 50 | as O 51 | Syria B-Location 52 | ' O 53 | s O 54 | domain O 55 | name O 56 | registration O 57 | authority O 58 | and O 59 | has O 60 | been O 61 | reported O 62 | to O 63 | be O 64 | closely O 65 | associated O 66 | with O 67 | the B-Organisation 68 | Syrian I-Organisation 69 | state I-Organisation 70 | . O 71 | [ O 72 | 2 O 73 | ] O 74 | 75 | " O 76 | In O 77 | May B-Temporal 78 | 2013 I-Temporal 79 | , O 80 | 700 B-Quantity 81 | domains I-Quantity 82 | registered O 83 | by O 84 | Syrians B-Organisation 85 | , O 86 | mostly O 87 | hosted O 88 | at O 89 | servers O 90 | with O 91 | IP O 92 | addresses O 93 | assigned O 94 | to O 95 | the B-Organisation 96 | Syrian I-Organisation 97 | Computer I-Organisation 98 | Society I-Organisation 99 | , O 100 | [ O 101 | 3 O 102 | ] O 103 | were O 104 | reported O 105 | to O 106 | have O 107 | been O 108 | seized O 109 | by O 110 | the O 111 | U.S. B-Organisation 112 | DNS I-Organisation 113 | infrastructure I-Organisation 114 | operator I-Organisation 115 | Network I-Organisation 116 | Solutions I-Organisation 117 | . O 118 | [ O 119 | 2 O 120 | ] O 121 | The O 122 | domain O 123 | names O 124 | were O 125 | marked O 126 | as O 127 | " O 128 | " O 129 | OFAC B-Organisation 130 | Holding O 131 | " O 132 | " O 133 | , O 134 | believed O 135 | to O 136 | be O 137 | a O 138 | reference O 139 | to O 140 | the B-Organisation 141 | U.S. 
I-Organisation 142 | federal I-Organisation 143 | government I-Organisation 144 | ' I-Organisation 145 | s I-Organisation 146 | Office I-Organisation 147 | of I-Organisation 148 | Foreign I-Organisation 149 | Assets I-Organisation 150 | Control I-Organisation 151 | . O 152 | [ O 153 | 3 O 154 | ] O 155 | " O 156 | 157 | Some B-Organisation 158 | members I-Organisation 159 | of O 160 | the B-Organisation 161 | Syrian I-Organisation 162 | Computer I-Organisation 163 | Society I-Organisation 164 | belonged O 165 | to O 166 | the O 167 | first O 168 | group O 169 | of O 170 | supporters O 171 | of O 172 | the B-Organisation 173 | Syrian I-Organisation 174 | Electronic I-Organisation 175 | Army I-Organisation 176 | . O 177 | [ O 178 | 4 O 179 | ] O 180 | 181 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data-by-source/conll/Wikipedia/99036C6E6DB96D33FA0830D3095E7B71.conll: -------------------------------------------------------------------------------- 1 | The B-Location 2 | Hamrin I-Location 3 | Mountains I-Location 4 | ( O 5 | Arabic B-Nationality 6 | : O 7 | ج O 8 | ب O 9 | ل O 10 | ح O 11 | م O 12 | ر O 13 | ي O 14 | ن O 15 | Jab O 16 | ā O 17 | l O 18 | Hamr O 19 | ī O 20 | n O 21 | , O 22 | Kurdish B-Nationality 23 | : O 24 | Ç O 25 | iyay O 26 | ê O 27 | Hemr O 28 | î O 29 | n O 30 | or O 31 | Ç O 32 | iyay O 33 | ê O 34 | n O 35 | Hemr O 36 | î O 37 | n O 38 | ) O 39 | are O 40 | a B-Location 41 | small I-Location 42 | mountain I-Location 43 | ridge I-Location 44 | in O 45 | northeast B-Location 46 | Iraq I-Location 47 | . 
O 48 | 49 | The B-Location 50 | westernmost I-Location 51 | ripple I-Location 52 | of I-Location 53 | the I-Location 54 | greater I-Location 55 | Zagros I-Location 56 | mountains I-Location 57 | , O 58 | [ O 59 | 1 O 60 | ] O 61 | the B-Location 62 | Hamrin I-Location 63 | mountains I-Location 64 | extend O 65 | from O 66 | the B-Location 67 | Diyala I-Location 68 | Province I-Location 69 | bordering O 70 | Iran B-Location 71 | , O 72 | northwest O 73 | to O 74 | the B-Location 75 | Tigris I-Location 76 | river I-Location 77 | , O 78 | crossing O 79 | northern B-Location 80 | Salah I-Location 81 | ad I-Location 82 | Din I-Location 83 | Province I-Location 84 | and O 85 | southern B-Location 86 | Kirkuk I-Location 87 | Province I-Location 88 | . O 89 | 90 | In O 91 | antiquity O 92 | , O 93 | the B-Location 94 | mountains I-Location 95 | were O 96 | part O 97 | of O 98 | the B-Location 99 | frontier I-Location 100 | region I-Location 101 | between O 102 | Babylonia B-Location 103 | to O 104 | the O 105 | south O 106 | and O 107 | Assyria B-Location 108 | to O 109 | the O 110 | north O 111 | . O 112 | 113 | Today B-Temporal 114 | , O 115 | the O 116 | area O 117 | forms O 118 | part O 119 | of O 120 | the O 121 | linguistic O 122 | boundary O 123 | between O 124 | most O 125 | of O 126 | Arabic B-Location 127 | - I-Location 128 | speaking I-Location 129 | Iraq I-Location 130 | and O 131 | Kurdish B-Location 132 | - I-Location 133 | speaking I-Location 134 | Iraq I-Location 135 | . O 136 | 137 | Coordinates O 138 | : O 139 | 35 O 140 | ° O 141 | 01 O 142 | ′ O 143 | 57 O 144 | ″ O 145 | N O 146 | 43 O 147 | ° O 148 | 38 O 149 | ′ O 150 | 47 O 151 | ″ O 152 | E O 153 | / O 154 | 35 B-Location 155 | . I-Location 156 | 0325 I-Location 157 | ° I-Location 158 | N I-Location 159 | 43 I-Location 160 | . I-Location 161 | 6463889 I-Location 162 | ° I-Location 163 | E I-Location 164 | / O 165 | 35 B-Location 166 | . 
I-Location 167 | 0325 I-Location 168 | ; I-Location 169 | 43 I-Location 170 | . I-Location 171 | 6463889 I-Location 172 | 173 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/data/README.md: -------------------------------------------------------------------------------- 1 | This directory is for the train and test split of the re3d data. 2 | 3 | To produce the train/test split used in the paper, run the following 4 | in Python 2 from within the src directory: 5 | 6 | ``` 7 | import stratified_split as s 8 | filedir = '../data/re3d/CONLL-format/data' 9 | TR, TE = s.write_new_split('re3d',200,filedir,'re3d',max_count=2) 10 | ``` 11 | 12 | This will create subdirectories `train` and `test`. 13 | 14 | -------------------------------------------------------------------------------- /data/re3d/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = DocumentReference 4 | Location 5 | MilitaryPlatform 6 | Money 7 | Nationality 8 | Organisation 9 | Person 10 | Quantity 11 | Temporal 12 | Weapon 13 | 14 | [IOB-format] 15 | 16 | IOB = IOB2 17 | 18 | [file-settings] 19 | 20 | sep = tab 21 | iob_pos = 1 22 | word_pos = 0 23 | pos_pos = none 24 | -------------------------------------------------------------------------------- /data/re3d/LICENSES/Australian_Department_of_Foreign_Affairs/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the Australian Department of Foreign Affairs website (http://dfat.gov.au). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). 
Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/Australian_Department_of_Foreign_Affairs/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset is released under the Creative Commons Attribution 3.0 Australia license. More information about this license can be found at https://creativecommons.org/licenses/by/3.0/au/, and the full license text can be read at https://creativecommons.org/licenses/by/3.0/au/legalcode. -------------------------------------------------------------------------------- /data/re3d/LICENSES/BBC_Online/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the BBC Online website (http://www.bbc.co.uk). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/BBC_Online/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset is released under the Creative Commons Attribution-NonCommercial 3.0 license.
More information about this license can be found at https://creativecommons.org/licenses/by-nc/3.0/, and the full license text can be read at https://creativecommons.org/licenses/by-nc/3.0/legalcode. -------------------------------------------------------------------------------- /data/re3d/LICENSES/CENTCOM/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the CENTCOM website (http://www.centcom.mil). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/CENTCOM/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset is public domain. The following was retrieved from http://www.centcom.mil/Home/Privacy-and-Security/ on 3rd July 2017. 2 | 3 | The United States Central Command Homepage is provided as a public service by the United States Central Command Public Affairs Office. Information presented on this web site is considered public information and may be distributed or copied. Use of appropriate byline/photo/image credits is requested. For site management, information is collected for statistical purposes. This government computer system uses software programs to create summary statistics, which are used for such purposes as assessing what information is of most and least interest, determining technical design specifications, and identifying system performance or problem areas.
For site security purposes and to ensure that this service remains available to all users, this government computer system employs software programs to monitor network traffic to identify unauthorized attempts to upload or change information, or otherwise cause damage. Except for authorized law enforcement investigations, no other attempts are made to identify individual users or their usage habits. Raw data logs are used for no other purposes and are scheduled for regular destruction in accordance with National Archives and Records Administration Guidelines. Unauthorized attempts to upload information or change information on this service are strictly prohibited and may be punishable under the Computer Fraud and Abuse Act of 1987 and the National Information Infrastructure Protection Act. If you have any questions or comments about the information presented here, please forward them to the United States Central Command Public Affairs Office. -------------------------------------------------------------------------------- /data/re3d/LICENSES/Delegation_of_the_European_Union_to_Syria/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the Delegation of the European Union to Syria website (http://eeas.europa.eu). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information, is stored separately (e.g. entities.json and relations.json). 
-------------------------------------------------------------------------------- /data/re3d/LICENSES/Delegation_of_the_European_Union_to_Syria/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset is licensed as per the copyright and legal statements set out on https://eeas.europa.eu/delegations/syria/8157/legal-notice_en. The following text was retrieved from the above website on 3rd July 2017. 2 | 3 | The EEAS website operates under the same legal notice of the EC website: http://ec.europa.eu/geninfo/legal_notices_en.htm 4 | 5 | Disclaimer 6 | 7 | The EEAS maintains this website to enhance public access to information about its initiatives and European Union policies in general. Our goal is to keep this information timely and accurate. If errors are brought to our attention, we will try to correct them. 8 | However the Commission accepts no responsibility or liability whatsoever with regard to the information on this site. 9 | 10 | This information is: 11 | 12 | * of a general nature only and is not intended to address the specific circumstances of any particular individual or entity; 13 | *not necessarily comprehensive, complete, accurate or up to date; 14 | *sometimes linked to external sites over which the Commission services have no control and for which the Commission assumes no responsibility; 15 | *not professional or legal advice (if you need specific advice, you should always consult a suitably qualified professional). 16 | 17 | Please note that it cannot be guaranteed that a document available on-line exactly reproduces an officially adopted text.Only the Official Journal of the European Union (its printed edition, or since 1 July 2013 its electronic edition made available on the EUR-Lex website), is authentic and produces legal effects. 18 | 19 | It is our goal to minimize disruption caused by technical errors. 
However some data or information on our site may have been created or structured in files or formats that are not error-free and we cannot guarantee that our service will not be interrupted or otherwise affected by such problems. The Commission accepts no responsibility with regard to such problems incurred as a result of using this site or any linked external sites. 20 | 21 | This disclaimer is not intended to limit the liability of the Commission in contravention of any requirements laid down in applicable national law nor to exclude its liability for matters which may not be excluded under that law. 22 | 23 | Personal Data Protection 24 | 25 | The protection of your privacy, including your personal data, is of great importance to the European External Action Service (EEAS), thereby reflecting the provisions of the Charter on Fundamental Rights of the European Union, and in particular Art. 8 thereof. 26 | 27 | Europa Analytics 28 | 29 | Europa Analytics is the corporate service which measures the effectiveness and efficiency of the European Commission's websites on EUROPA. 30 | 31 | Copyright notice 32 | 33 | Reuse is authorised, provided the source is acknowledged. The reuse policy of the European Commission is implemented by a Decision of 12 December 2011 34 | 35 | The general principle of reuse can be subject to conditions which may be specified in individual copyright notices. Therefore users are advised to refer to the copyright notices of the individual websites maintained under Europa and of the individual documents. Reuse is not applicable to documents subject to intellectual property rights of third parties. -------------------------------------------------------------------------------- /data/re3d/LICENSES/README: -------------------------------------------------------------------------------- 1 | The LICENSES for the datasets in ../CONLL-format/data-by-source/conll/ are 2 | contained in this directory. 
3 | 4 | Each subdirectory also contains the original ATTRIBUTION file from 5 | https://github.com/dstl/re3d, with more information on the provenance of each 6 | dataset. 7 | 8 | The data from each entities.json and documents.json file was converted into 9 | CONLL format. More details on how we modified the data are given in 10 | ../README.rst. 11 | 12 | -------------------------------------------------------------------------------- /data/re3d/LICENSES/UK_Government/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the UK Government website (http://www.gov.uk). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information, is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/UK_Government/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset contains public sector information licensed under the Open Government Licence v3.0. The full license text can be read at https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/. -------------------------------------------------------------------------------- /data/re3d/LICENSES/US_State_Department/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the US State Department website (http://www.defense.gov).
2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information, is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/Wikipedia/ATTRIBUTION: -------------------------------------------------------------------------------- 1 | Documents included in this subset of data were retrieved from the Wikipedia website (http://www.wikipedia.org). 2 | 3 | The document content was collected from their website, and annotated for entities and relations by Aleph Insights working with Committed Software and on behalf of the UK Defence Science and Technology Laboratory (Dstl). Only minor formatting changes were made to the content in order to get it into a suitable format for processing (e.g. documents.json) - any additional information, is stored separately (e.g. entities.json and relations.json). -------------------------------------------------------------------------------- /data/re3d/LICENSES/Wikipedia/LICENSE: -------------------------------------------------------------------------------- 1 | This dataset is released under the Creative Commons Attribution-ShareAlike 3.0. More information about this license can be found at https://creativecommons.org/licenses/by-sa/3.0/, and the full license text can be read at https://creativecommons.org/licenses/by-sa/3.0/legalcode. 
-------------------------------------------------------------------------------- /data/wikigold/CONLL-format/docs/corpusconfig.cfg: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | entities = PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [IOB-format] 9 | 10 | IOB = IO 11 | 12 | [file-settings] 13 | 14 | sep = ' ' 15 | iob_pos = 1 16 | word_pos = 0 17 | pos_pos = none 18 | -------------------------------------------------------------------------------- /data/wikigold/CONLL-format/docs/entity-list.txt: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | PER 4 | LOC 5 | ORG 6 | MISC 7 | 8 | [format] 9 | 10 | IO 11 | 12 | [content] 13 | 14 | [stats] 15 | 16 | 39152 tokens; 1841 sentences 17 | 18 | Entities: 19 | --------- 20 | PER 934 21 | LOC 1014 22 | ORG 898 23 | MISC 712 24 | 25 | [notes] 26 | -------------------------------------------------------------------------------- /data/wikigold/LICENSE: -------------------------------------------------------------------------------- 1 | wikigold.conll.txt is licensed under CC-BY 4.0. To view this license, visit 2 | https://creativecommons.org/licenses/by/4.0/ 3 | 4 | -------------------------------------------------------------------------------- /data/wikigold/README.rst: -------------------------------------------------------------------------------- 1 | Wikigold dataset 2 | ================ 3 | 4 | wikigold.conll.txt is a manually annotated collection of Wikipedia text, 5 | presented in: 6 | 7 | - Dominic Balasuriya and Nicky Ringland and Joel Nothman and Tara Murphy and 8 | James R. Curran (2009), in Proceedings of the 2009 Workshop on The People's 9 | Web Meets NLP: Collaboratively Constructed Semantic Resources 10 | (http://www.aclweb.org/anthology/W/W09/W09-3302). 11 | 12 | 13 | The data is already in CONLL format. 
It was originally downloaded from: 14 | http://downloads.schwa.org/wikiner/wikigold.conll.txt 15 | 16 | The file wikigold.conll.txt can now be found at: 17 | 18 | https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500 19 | 20 | Attribution 21 | ----------- 22 | 23 | This dataset was presented in the following two publications: 24 | 25 | [1] Balasuriya, Dominic, et al. "Named entity recognition in wikipedia." 26 | Proceedings of the 2009 Workshop on The People's Web Meets NLP: 27 | Collaboratively Constructed Semantic Resources. Association for 28 | Computational Linguistics, 2009. 29 | 30 | [2] Nothman, Joel, et al. "Learning multilingual named entity recognition 31 | from Wikipedia." Artificial Intelligence 194 (2013): 151-175 32 | 33 | License 34 | ------- 35 | 36 | wikigold.conll.txt is licensed under CC-BY 4.0. To view this license, visit 37 | https://creativecommons.org/licenses/by/4.0/ 38 | 39 | 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.18.2 2 | -------------------------------------------------------------------------------- /src/sentence_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions not related to I/O, that can be directly applied to a list 3 | of lists, where each sublist is a sentence, and contains entries of the form 4 | ((word, pos_tag, dom), iob) 5 | 6 | """ 7 | from collections import Counter 8 | import copy 9 | 10 | 11 | def flatten(sentences): 12 | """ Flatten a list. 13 | 14 | """ 15 | f = [i for sublist in sentences for i in sublist] 16 | return f 17 | 18 | 19 | def get_tagset(sentences, with_prefix): 20 | """ Returns the set of entity types appearing in the list of sentences. 
def get_tagset(sentences, with_prefix):
    """ Return the set of labels appearing in the list of sentences.

    Parameters
    ----------

    sentences : list
        List of lists (sentences); each entry is a (token, iob) pair.

    with_prefix : bool
        If True, the labels are returned exactly as they appear in the
        data (prefixes kept, 'O' included). If False, 'O' is dropped and
        a leading one-letter prefix such as 'B-' or 'I-' is removed, so
        only the bare entity types are returned.

    """
    tagset = {iob for sent in sentences for (_, iob) in sent}
    if not with_prefix:
        # Only strip when there really is an 'X-' prefix: blindly slicing
        # off two characters would turn an unprefixed 'PER' into 'R'.
        tagset = {t[2:] if t[1:2] == '-' else t
                  for t in tagset if t != 'O'}
    return tagset


def get_IOB_counts(sentences):
    """ Return a Counter mapping each IOB label to its frequency.

    The label is read from position 1 of every entry of every sentence.

    """
    return Counter(entry[1] for sent in sentences for entry in sent)


def get_word_counts(sentences, exclude_O = False):
    """ Return a Counter containing the number of times each word has
    appeared within the list of sentences.

    Parameters
    ----------

    sentences : list
        List of lists (sentences); each entry is a (token, iob) pair and
        the word is the first element of the token.

    exclude_O : bool
        If True, does not count words that have 'O' labels; that is, only
        words corresponding to entities will be counted.

    """
    return Counter(tok[0]
                   for sent in sentences
                   for (tok, iob) in sent
                   if not (exclude_O and iob == 'O'))


def remove_prefix(tok):
    """ Strip a leading 'B-' or 'I-' from a label.

    Any other label (e.g. 'O', or one without a prefix) is returned
    unchanged.

    """
    if tok.startswith(('B-', 'I-')):
        return tok[2:]
    return tok


def sents_no_prefix(sents):
    """ Return a copy of the sentences with 'B-'/'I-' removed from labels.

    Each entry of the result is (tuple(token), label_without_prefix).
    New lists and tuples are built, so the input is not modified.

    """
    return [[(tuple(entry[0]), remove_prefix(entry[1])) for entry in sent]
            for sent in sents]