├── .gitignore ├── README.md ├── example └── documents │ ├── wikinews-20070223-train.txt │ ├── wikinews-20070306-drain.txt │ ├── wikinews-20160706-telescope.txt │ └── wikinews-20160714-UN.txt ├── licenses ├── CC-SA-3.0 │ └── LICENSE.txt ├── LICENSE.md ├── gpl-3.0 │ └── LICENSE.txt ├── verbnet-license-3.0 │ └── LICENSE.txt └── wordnet-license │ └── LICENSE.txt ├── pom.xml ├── resources └── PARC │ ├── configs │ ├── acl2016.crf.prop │ ├── acl2016.greedy.prop │ ├── acl2016.sampling.prop │ ├── predpipeline.crf.prop │ ├── predpipeline.greedy.prop │ └── predpipeline.sampling.prop │ ├── listfeatures │ ├── attribution_nouns.txt │ ├── krestel_verbs.txt │ ├── organization.hyponyms.txt │ ├── person.hyponyms.txt │ ├── titles.txt │ └── verbnet.txt │ └── news.txt └── src └── main └── java └── ims └── cs ├── bbn ├── BbnNeHandler.java └── BbnNeParser.java ├── corenlp ├── DocumentAligner.java ├── Helper.java ├── IndexedWordIterator.java ├── PARCCoreNlpPipeline.java └── TokenAligner.java ├── lingdata ├── ByteCount.java ├── Corpus.java ├── Document.java ├── DocumentId.java ├── GornAddressList.java ├── Partition.java ├── PlainTextCorpus.java ├── PlainTextDocId.java ├── Sentence.java ├── SentenceId.java ├── Token.java ├── Types.java └── WSJId.java ├── mallet ├── DocumentFeatureSet2TokenSequence.java └── PARCDocumentInstance.java ├── parc ├── PARCAttribution.java ├── PARCCorpus.java ├── ParcUtils.java ├── ProcessedCorpus.java ├── SpanLabelExtractor.java └── xml │ ├── PARCHandler.java │ └── PARCParser.java ├── qsample ├── evaluate │ ├── EvaluateClassifier.java │ ├── EvaluateSpan.java │ └── F1.java ├── features │ ├── Binning.java │ ├── BoundaryFeatures.java │ ├── FeatureExtraction.java │ ├── FeatureIndexMap.java │ ├── FeatureIntSet.java │ ├── FeatureSet.java │ ├── FeatureStringSet.java │ ├── SpanFeatures.java │ └── components │ │ ├── DocumentOffsetConjunction.java │ │ ├── DocumentQuotationFeature.java │ │ ├── SentenceConstituentFeatures.java │ │ ├── SentenceDependencyFeatures.java │ │ ├── 
SentenceFeaturesDerivedFromListCue.java │ │ ├── SentenceIndicatorFeatures.java │ │ ├── TokenDictFeatures.java │ │ ├── TokenLexicalFeatures.java │ │ └── TokenListFeatures.java ├── greedysample │ ├── HasScore.java │ ├── HeuristicSampler.java │ ├── PerceptronSampler.java │ └── Sampling.java ├── models │ ├── CrfClassifier.java │ ├── HigherSpanModel.java │ └── QuotationPerceptrons.java ├── perceptron │ ├── Perceptron.java │ ├── PerceptronTrainer.java │ └── Weights.java ├── run │ ├── Common.java │ ├── PlainTextCorpusReader.java │ ├── QSample.java │ ├── RunCrf.java │ ├── RunHeuristicTest.java │ └── RunPerceptronSampler.java └── spans │ ├── Span.java │ ├── SpanBegin.java │ └── SpanEnd.java └── util ├── MultiOutputStream.java ├── NewStaticPrinter.java └── StaticConfig.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | 14 | # model binaries 15 | resources/PARC/models/* 16 | models.tar.gz 17 | 18 | # local configuration files 19 | resources/PARC/configs/local/ 20 | 21 | # build output 22 | target/* 23 | 24 | # tool output 25 | output 26 | 27 | # intellij project 28 | *.iml 29 | *.idea 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | QSample 2 | ======= 3 | 4 | QSample is a natural language processing tool for automatically 5 | detecting quotations in text. 6 | 7 | 8 | **Example:** In the sentence 9 | 10 | > Witnesses said that several passengers have broken bones. 11 | 12 | the span 13 | 14 | > *that several passengers have broken bones* 15 | 16 | is a quotation. 
17 | 18 | 19 | Requirements 20 | ------------ 21 | 22 | Java JVM (>= 1.7) and Maven (>= 3.0.0) need to be installed. All other 23 | dependencies will be downloaded automatically. The dependencies all 24 | together will amount to ~250 MB. The trained model files take up another 25 | ~80 MB. 26 | 27 | 28 | Setup 29 | -------- 30 | 31 | Install the tool by running the following commands (NOTE: this will trigger a 32 | **~250 MB** Maven dependency download and will produce a .jar file of 33 | comparable size): 34 | 35 | git clone https://github.com/christianscheible/qsample.git 36 | cd qsample 37 | mvn compile 38 | mvn package 39 | 40 | If the build was successful, you will find two .jar files in `target/` 41 | (with and without dependencies, respectively). 42 | 43 | Next, download and unpack the pre-trained models (**~80 MB**): 44 | 45 | wget https://github.com/christianscheible/qsample/releases/download/0.1/models.tar.gz 46 | tar xzfv models.tar.gz 47 | 48 | 49 | Usage 50 | ----- 51 | 52 | Now we are ready to detect quotations. As a first step, you can run the 53 | tool on the example documents we provide in `example/documents`. The 54 | expected format is a directory of plain text files, each containing a 55 | single document. To process the documents, run the following command: 56 | 57 | java -jar target/qsample-0.1-jar-with-dependencies.jar --sample example/documents/ output 58 | 59 | QSample will produce several files in the output directory: 60 | 61 | * `.log` file storing the messages that were also output to command line 62 | * `.conf` file documenting the configuration used by the tool for this run 63 | * one `.quotations.gz` file for each document in the input directory 64 | containing the detected quotations 65 | 66 | The `.quotations.gz` files contain the predictions made by the model. 
As 67 | an example, take the following snippet: 68 | 69 | Witnesses 230 239 O O 70 | said 240 244 O C 71 | that 245 249 O B 72 | several 250 257 O I 73 | passengers 258 268 O I 74 | have 269 273 O I 75 | broken 274 280 O I 76 | bones 281 286 O E 77 | . 286 287 O O 78 | 79 | The output format consists of five columns. The first column contains 80 | the tokens; the second and third columns contain the byte begin and end 81 | positions of the tokens in the original input file; the fourth column 82 | contains the gold labels (if there are any); the fifth column contains 83 | the predicted quotes. The predictions are encoded using BIOE-style 84 | labels. The label `C` marks the occurrence of a *cue*, and all words 85 | between the `B` (begin) and `E` (end) tags are the *content* of the 86 | quotation. 87 | 88 | 89 | Data 90 | ---- 91 | 92 | This repository includes the following data: 93 | 94 | * `example/documents`: Four news articles from WikiNews for 95 | testing. QSample expects one plain text document per file. You can 96 | mark paragraph boundaries in the text by adding an empty line after 97 | each paragraph. Knowledge about paragraphs is useful for detecting 98 | quotations. Linguistic pre-processing is performed by Stanford 99 | CoreNLP. 100 | * `resources/PARC/configs`: Configuration files for running experiments 101 | (see below). The `acl2016*` configurations use gold pre-processing, 102 | whereas the `predpipeline*` configurations use CoreNLP processing. For 103 | each setup, we supply one file for each of the methods used in the 104 | paper. 105 | * `resources/PARC/listfeatures`: Word lists for extracting features. We 106 | supply lists of attribution nouns and verbs, organizations and 107 | persons, titles, as well as a mapping of verbs to VerbNet 108 | classes. These lists were generated from third-party resources, see 109 | `licenses/LICENSE.md`. 110 | * `resources/PARC/news.txt`: A list of the IDs of WSJ documents that are news articles. 
111 | 112 | 113 | Running an experiment 114 | --------------------- 115 | 116 | To run an experiment on annotated data, you need to obtain several 117 | resources: 118 | 119 | * Penn Attribution Relations Corpus (PARC3, http://homepages.inf.ed.ac.uk/s1052974/resources.php) 120 | * Penn Treebank 2 (https://catalog.ldc.upenn.edu/LDC95T7) 121 | * BBN Pronoun Coreference and Entity Type Corpus (https://catalog.ldc.upenn.edu/LDC2005T33) 122 | 123 | Afterwards, you can run experiments based on the configuration files in 124 | `resources/PARC/configs/`. To test the pre-trained models, you need to 125 | adapt the paths in the configuration files. To train a model, you can 126 | simply switch from `TEST` to `TRAIN` mode in the configuration. 127 | 128 | 129 | More information 130 | ---------------- 131 | 132 | For more information, refer to our paper (available at 133 | http://www.aclweb.org/anthology/P/P16/P16-1164.pdf): 134 | 135 | @InProceedings{scheibleklingerpado2016, 136 | author = {Scheible, Christian and Klinger, Roman and Pad\'{o}, Sebastian}, 137 | title = {Model Architectures for Quotation Detection}, 138 | booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics}, 139 | pages = {1736-1745}, 140 | year = {2016} 141 | } 142 | 143 | 144 | or check the tool's website at 145 | http://www.ims.uni-stuttgart.de/data/qsample for news. 146 | 147 | 148 | License 149 | ------- 150 | 151 | Please see `licenses/LICENSE.md`. 152 | -------------------------------------------------------------------------------- /example/documents/wikinews-20070223-train.txt: -------------------------------------------------------------------------------- 1 | A 9 Carriage Class 390 Pendolino train, with as many as 180 people onboard, operated by Virgin Trains has derailed and crashed in Cumbria, England. 2 | 3 | The train was the 17:15 service from London's Euston Station to Glasgow Central. Witnesses said that several passengers have broken bones. 
All but one carriage have slid down the embankment and all of the carriages were derailed. 5 people are still seriously injured in hospital, including the driver Ian Black, and 11 people altogether are still in hospital. Hospital reports early on Saturday morning indicate 1 death, 84 year old lady Margaret Masson from Glasgow. 4 | 5 | "It's our understanding there are a number of people injured on the train. We think there are numerous injuries," said a spokesman for the Cumbria Fire and Rescue squad, Brian Mitchelhill. 6 | 7 | "A train has crashed between Oxenholme and Tebay, but that is all we know at the moment. We have got two cars travelling there now and local police are attending," said a spokeswoman for the British Transport Police. 8 | 9 | At least twelve ambulances and at least five fire trucks are at the scene, along with 3 Royal Air Force Sea King helicopters, two mountain rescue teams and one police helicopter. The rescue effort was hampered by limited access to the remote site and poor weather conditions. 10 | 11 | "At the moment, we have reports of various injuries, from leg injuries to back injuries and head injuries -- ranging from minor to quite serious," said an unnamed ambulance official. 12 | 13 | Virgin West Coast Class 390 Pendolino EMU 390009 at platform 1 of Carlisle railway station. 14 | "The carriage I am in is completely on its side, it did a sort of bump - and I was thinking don't worry this fine - but then the swaying became very dramatic," said a BBC executive, Caroline Thomson. "Where I am there are some injuries - a woman with a very badly hurt back but I suspect further up the train it's a bit more serious." 15 | 16 | First reports suggested that the train hit something on the track at 20:15 near Grayrigg, between Oxenholme and Tebay, derailed and slid down an embankment. Early investigators reports say a set of points will be the primary focus of the investigation. 
All but one of the train's carriages are said to have totally come off the tracks, a spokeswoman said. 17 | 18 | So far 12 passengers have been taken to The Royal Preston Hospital, where three casualties are said to be in a "critical condition" and one serious. 3 passengers with limb injuries are at the Royal Lancaster Infirmary. People with minor injuries were taken to a nearby farm before being transferred to a Kendal hotel. 19 | 20 | As of 22:16, Sleeper services over the affected line have been suspended. Other major services are terminating at Preston or Carlisle according to (UK) National Rail Enquires. A five mile cordon has been set up to seal the crash site as investigators from the Rail Accident Investigation Branch attempt to determine the cause of the crash. 21 | 22 | Virgin owner Richard Branson came to the site from his holiday in Switzerland and held a press conference at a safe distance, roughly 200 meters, from the site. Branson stated that the Virgin Train Pendolino was "built like a tank", and believed the track was to blame. All of the carriages retained their integrity and none of the windows broke. Mr Branson also praised the driver of the train for attempting to stop the train and not leaving the cockpit. 23 | 24 | Network Rail has checked 700 sets of rail points in response to the accident, though no speed restrictions have been imposed. 25 | -------------------------------------------------------------------------------- /example/documents/wikinews-20070306-drain.txt: -------------------------------------------------------------------------------- 1 | A rupture in the underground stormwater drain system opened a huge sinkhole on February 23, killing three people and bringing down twelve houses in Guatemala City. 2 | 3 | Teenagers Irma and David Soyos and their father, 53-year old Domingo Soyos were killed when their house collapsed into the sinkhole. 
Nearly a thousand people were evacuated from the San Antonio neighborhood after the collapse. 4 | 5 | Wikinews interviewed Eric Haddox, a civil engineer who has visited the site of the sinkhole and spoken to the engineers working on fixing the drain. Mr. Haddox, who specialises in the building of earthworks, roads, water supply and sewage systems, and is working as a missionary in Guatemala, visited the site following the collapse to help in the recovery effort. 6 | 7 | Mr. Haddox told us that the size of the hole is much smaller than the 330 feet depth originally reported and that the erosion causing the collapse is believed to have happened over a long time, and not just during the recent rains as initially suspected. 8 | 9 | There are also concerns that a four-story building less than a metre from the edge of the hole may collapse as the earth under the building continues to be eroded. 10 | 11 | Before the collapse, a junction box linked two collector pipes to a 3.5m main pipe leading to a nearby canyon in a system believed to be 20 to 50 years old. The surrounding earth had been filled in artificially to level the ground, but the fill was not well compacted before being built upon. Such leveling of the ground is widespread in Guatemala city. 12 | 13 | It is thought that, at some point in the last 20 years, either one of the collector pipes ruptured or was detached from the junction box, possibly because of seismic activity. Water gushing out of the break following rainstorms gradually eroded the loosely compacted soil, creating an expanding cavern around the junction box. On February 23, the roof of this cavern collapsed, creating the sinkhole, 20m wide at the top and tapering out towards the bottom, which is about 60m (204 feet) deep, not 330 feet as originally reported. 14 | 15 | "Things like this don't happen often and there are many interesting engineering lessons to be learned with them", Mr. Haddox said. 
16 | 17 | The sinkhole has continued to expand even after the collapse, since the collector pipes continue to carry water, which cascades 15m down the sinkhole to the main pipe, further eroding the sides of the sinkhole. The hole was about 25m wide at the top and 40m wide at the bottom a week ago. 18 | 19 | A bypass pipe is being laid to divert the water away from the junction to arrest further erosion. The sinkhole will then have to be drained before repair work can begin. 20 | 21 | Authorities are also concerned that similar breakages and undermining may be happening at other locations, Mr. Haddox said. Muddy water has been seen coming out of the main collector pipes, but it is not certain whether this is due to ruptures elsewhere or simply mud from the surface that has been washed into the drainage system. 22 | 23 | -------------------------------------------------------------------------------- /example/documents/wikinews-20160706-telescope.txt: -------------------------------------------------------------------------------- 1 | On Sunday, China announced the attachment of the final panel to its telescope named Five hundred meter Aperture Spherical Telescope (FAST). This piece marks the end of a five-year-long US$180 million (CNY¥1.2 billion) construction project. 2 | 3 | FAST comprises about 4,500 panels and spans a diameter of 500 meters (about 1640 feet). The telescope is part of a series of ventures into space exploration by China, including planning another robotic Moon mission and creating a Chinese space station, with its core module set to be launched into space in 2018. With the country's founding centenary coming in 2049, Chinese President Xi Jinping said during a Beijing conference, "great scientific and technological capacity is a must for China to be strong". 4 | 5 | In order to achieve optimal electromagnetic performance for FAST with minimal signal interference, it was built in the South China Karst. 
This ultimately forced the relocation of about 9,100 inhabitants within a 3.1-mile (5km) radius of the telescope. The residents received about US$1,800 (CNY¥12,000) in reimbursement, with those experiencing difficulties with housing receiving about US$1,500 (CNY¥10,000) in extra compensation. The Chinese government supports the resettlement, with senior party official Li Yuecheng saying the relocation would provide a "sound electromagnetic wave environment". 6 | 7 | The telescope is now the largest-diameter single-dish radio telescope. It took the spot from the 305-meter diameter Arecibo Observatory telescope in Puerto Rico. Russia's RATAN-600 multi-element radio telescope has a diameter of 576 meters. This adds to China's record-defying achievements; it contains the world's largest bridge and the world's longest wall, the Great Wall of China. 8 | 9 | The telescope is set to be ready for use in September. Its possible uses include exploration for pulsars, a special type of neutron stars detected through their emission of radio pulses. Scientists have also described the telescope's potential to explore alien civilization, with NAO Radio Astronomy Technology Laboratory director Peng Bo saying FAST's "potential to discover an alien civilization will be 5 to 10 times that of current equipment, as it can see farther and darker planets". 10 | 11 | -------------------------------------------------------------------------------- /example/documents/wikinews-20160714-UN.txt: -------------------------------------------------------------------------------- 1 | On Tuesday, a United Nations (UN) tribunal in The Hague dismissed China's sovereignty claims to the South China Sea, a body of water connecting to the Pacific Ocean which is also bordered by the Philippines, Vietnam, Brunei, Malaysia, Taiwan, and Indonesia. Court battles over the claims between China and the Philippines go back to 2013. 
2 | 3 | These claims were established by China during the reign of its Nationalist government in the 1940s, marked by a demarcation line nicknamed its Nine-dash line. Its line stretched hundreds of miles from the Chinese mainland, including about nine tenths of the entire sea. The South China Sea is a valuable property, providing passage for about US$5 trillion in trade by planes and boats every year. China is not the only country to claim large parts of the sea; notably, Taiwan and Vietnam have also done so, but other large-scale claimants have been less militarily active about their claims than China. 4 | 5 | China has built several artificial islands and military bases in the South China Sea. The tribunal scolded the impeding of fishing and exploration in the sea by China, which it deemed against the United Nations Convention on the Law of the Sea (UNCLOS), signed by China in 1982. The tribunal also concluded China knowingly permitted the poaching of endangered turtles and clams as well as destroyed coral reefs to construct artificial islands. 6 | 7 | UNCLOS permits countries to claim a 200-nautical mile area from their mainland, referred to as an exclusive economic zone. It also permits freedom of navigation, allowing unimpeded exploration through "high seas": international waters also available for the use of fishing and trade passages. 8 | 9 | There is no process to enforce the decision. UNCLOS allows countries to exclude themselves from "compulsory binding procedures for the settlement of disputes" as defined in Part XV, Section 3 - Article 298. China exercised this right to exclude themselves from compulsory binding procedures on August 25, 2006. They reject the jurisdiction or authority of the tribunal's findings. Various other countries have also exercised Article 298 partially or fully, such as Australia, Canada, the UK, Russia, and France. 10 | 11 | Many nations made statements after the decision. 
The Chinese government opposed the decision, calling it "ill-founded". It said "China neither accepts nor recognizes" the decision. The Philippine government referred to the decision as a "milestone decision". The US, a key ally with many of the countries claiming parts of the sea, said it was an "important contribution to the shared goal of a peaceful resolution to disputes in the South China Sea". 12 | -------------------------------------------------------------------------------- /licenses/LICENSE.md: -------------------------------------------------------------------------------- 1 | Code 2 | ==== 3 | 4 | Our code is, unless otherwise specified below, subject to the GPL 3.0 license (`gpl-3.0/`). 5 | 6 | MultiOutputStream based on code by 7 | http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to 8 | (CodeProject license, http://www.codeproject.com/info/cpol10.aspx) 9 | 10 | 11 | Resources 12 | ========= 13 | 14 | `resources/PARC/listfeatures/`: Lists and dictionaries for feature extraction 15 | 16 | * `attribution_nouns.txt`: List of attribution nouns published by Pareti (2015). 17 | * `krestel_verbs.txt`: List of attribution verbs published by Krestel et al. (2008). 
18 | * `organization.hyponyms.txt`, `person.hyponyms.txt`: List of persons and organizations, extracted from WordNet (WordNet license, `wordnet-license/LICENSE.txt`) 19 | * `titles.txt`: List of titles collected from Wikipedia page https://en.wikipedia.org/wiki/Title (CC-SA license, http://creativecommons.org/licenses/by-sa/3.0/) 20 | * `verbnet.txt`: VerbNet category mappings (VerbNet license, `verbnet-license-3.0/LICENSE.txt`) 21 | 22 | `resources/PARC/news.txt`: List of WSJ news articles from http://www.let.rug.nl/~bplank/metadata/genre_files_updated.html 23 | 24 | `example/documents`: Four news documents from WikiNews (CC-SA license) 25 | -------------------------------------------------------------------------------- /licenses/verbnet-license-3.0/LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | VerbNet 3.0 License (also applies to VerbNet 3.X versions) 3 | 4 | This software and database is being provided to you, the LICENSEE, by 5 | the University of Colorado under the following license. By obtaining, using 6 | and/or copying this software and database, you agree that you have 7 | read, understood, and will comply with these terms and conditions.: 8 | 9 | Permission to use, copy, modify and distribute this software and 10 | database and its documentation for any purpose and without fee or 11 | royalty is hereby granted, provided that you agree to comply with 12 | the following copyright notice and statements, including the disclaimer, 13 | and that the same appear on ALL copies of the software, database and 14 | documentation, including modifications that you make for internal 15 | use or for distribution. 16 | 17 | VerbNet 3.0 (or 3.X) Copyright 2009 by University of Colorado. All rights reserved. 18 | 19 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND THE UNIVERSITY 20 | OF COLORADO MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 21 | IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, UNIVERSITY 22 | OF COLORADO MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- 23 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE 24 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT 25 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 26 | OTHER RIGHTS. 27 | 28 | The name of University of Colorado or CU may not be used in 29 | advertising or publicity pertaining to distribution of the software 30 | and/or database. Title to copyright in this software, database and 31 | any associated documentation shall at all times remain with 32 | University of Colorado and LICENSEE agrees to preserve same. 33 | 34 | Please reference the following document(s) in any description of 35 | applications based on VerbNet 3.0 or 3.X: 36 | 37 | Karin Kipper, Anna Korhonen, Neville Ryant, Martha Palmer, 38 | A Large-scale Classification of English Verbs, 39 | Language Resources and Evaluation Journal, 42(1), pp. 21-40, 40 | Springer Netherland, 2008. 41 | 42 | and/or 43 | 44 | Karin Kipper Schuler, Anna Korhonen, Susan W. Brown, VerbNet overview, 45 | extensions, mappings and apps, Tutorial, NAACL-HLT 2009, Boulder, 46 | Colorado. 47 | -------------------------------------------------------------------------------- /licenses/wordnet-license/LICENSE.txt: -------------------------------------------------------------------------------- 1 | WordNet Release 3.0 2 | 3 | This software and database is being provided to you, the LICENSEE, by Princeton University under the following license. 
By obtaining, using and/or copying this software and database, you agree that you have read, understood, and will comply with these terms and conditions.: Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty is hereby granted, provided that you agree to comply with the following copyright notice and statements, including the disclaimer, and that the same appear on ALL copies of the software, database and documentation, including modifications that you make for internal use or for distribution. WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton University or Princeton may not be used in advertising or publicity pertaining to distribution of the software and/or database. Title to copyright in this software, database and any associated documentation shall at all times remain with Princeton University and LICENSEE agrees to preserve same. 
4 | 5 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | QSample 6 | ims.cs 7 | qsample 8 | 0.1 9 | 2016 10 | 11 | 1.7 12 | 4.10 13 | UTF-8 14 | 15 | 16 | IMS, University of Stuttgart, Germany 17 | http://www.ims.uni-stuttgart.de/~scheibcn 18 | 19 | 20 | 21 | 22 | 23 | 24 | junit 25 | junit 26 | ${junit.version} 27 | 28 | 29 | 30 | edu.stanford.nlp 31 | stanford-corenlp 32 | 3.9.2 33 | 34 | 35 | 36 | edu.stanford.nlp 37 | stanford-corenlp 38 | 3.9.2 39 | models 40 | 41 | 42 | 43 | cc.mallet 44 | mallet 45 | 2.0.7 46 | 47 | 48 | 49 | 50 | org.apache.commons 51 | commons-lang3 52 | 3.0 53 | 54 | 55 | 56 | 57 | net.sf.jgrapht 58 | jgrapht 59 | 0.8.3 60 | 61 | 62 | 63 | 64 | 65 | src/main/java 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-compiler-plugin 71 | 3.5.1 72 | 73 | 1.7 74 | 1.7 75 | 76 | 77 | 78 | 79 | 80 | 81 | maven-assembly-plugin 82 | 2.6 83 | 84 | 85 | make-assembly 86 | package 87 | 88 | attached 89 | 90 | 91 | 92 | 93 | 94 | 95 | ims.cs.qsample.run.QSample 96 | 97 | 98 | 99 | jar-with-dependencies 100 | 101 | 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-jar-plugin 107 | 3.0.0 108 | 109 | 110 | make-assembly 111 | package 112 | 113 | 114 | 115 | 116 | **/log4j.properties 117 | 118 | 119 | 120 | ims.cs.qsample.run.QSample 121 | 122 | 123 | 124 | 125 | 126 | 127 | maven-release-plugin 128 | 2.1 129 | 130 | 131 | release 132 | deploy package 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /resources/PARC/configs/acl2016.crf.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 CRF results 2 | # To run this, please set the following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 
| parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=true 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=false 23 | 24 | # the following settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=CRF 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=true 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel 50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | 
constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/configs/acl2016.greedy.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 greedy results 2 | # To run this, please set the following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 | parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=true 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=false 23 | 24 | # the following settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=GREEDY 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | 
cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=true 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel 50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/configs/acl2016.sampling.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 sampling results 2 | # To run this, please set the following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 | parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the 
following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=true 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=false 23 | 24 | # the following settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=SAMPLE 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=true 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel 50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | 
lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/configs/predpipeline.crf.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 CRF results 2 | # To run this, please set the following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 | parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=false 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=true 23 | 24 | # the following settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=CRF 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=false 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel 50 | perceptronModelFile=resources/PARC/models/predpipeline.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | 
constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/configs/predpipeline.greedy.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 greedy results 2 | # To run this, please set the following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 | parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=false 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=true 23 | 24 | # the following 
settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=GREEDY 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=false 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel 50 | perceptronModelFile=resources/PARC/models/predpipeline.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/configs/predpipeline.sampling.prop: -------------------------------------------------------------------------------- 1 | # Properties file to replicate the ACL 2016 sampling results 2 | # To run this, please set the 
following paths first: 3 | # path for writing output 4 | outputDirectory=/path/to/output 5 | # path to PARC3 6 | parcRoot=/path/to/PARC3_complete 7 | # path to PTB raw data 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/ 9 | # path to BBN named entities 10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/ 11 | # path for caching CoreNLP output (optional, see below) 12 | coreNlpOutputDirectory=/tmp/ 13 | 14 | # you may optionally change the following settings 15 | # switch this to true if you want a lot of debug output 16 | verbose=false 17 | # switch this to TRAIN if you also want to train the model 18 | cliMode=TEST 19 | # switch this on to cache CoreNLP output in a file 20 | cacheParses=false 21 | # switch off if you want all outputs in one file 22 | oneFilePerInput=true 23 | 24 | # the following settings need to remain unchanged 25 | # Training options 26 | modelForTextFileMode=SAMPLE 27 | quotationTypes=DIM 28 | jackknifing=false 29 | outerIter=30 30 | innerIter=50 31 | predictionIter=1000 32 | maxNumTrials=10 33 | predictEvery=10 34 | maxCueDistanceSampling=30 35 | maxCueDistanceHeuristic=30 36 | maxLengthSampling=75 37 | maxLengthHeuristic=50 38 | cueMargin=25.0 39 | beginMargin=25.0 40 | endMargin=25.0 41 | samplerMarginPositive=15 42 | cueTemperature=10.0 43 | beginTemperature=10.0 44 | endTemperature=10.0 45 | useGoldPreprocessing=false 46 | 47 | # Paths 48 | inputDirectory=*NOT USED IN EXPERIMENT MODE* 49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel 50 | perceptronModelFile=resources/PARC/models/predpipeline.models 51 | 52 | # Features 53 | dependencyParentRelHead=true 54 | constituentLevel=true 55 | lexicalLemma=false 56 | sentenceHasQuote=true 57 | constituentBinningStacked=false 58 | sentenceHasCue=false 59 | sentenceLength=true 60 | dependencyChildRel=true 61 | lexicalPos=false 62 | sentenceLengthBinningStacked=false 63 | dependencyChildRelHead=true 64 | constituentParent=true 65 | dependencyFeatures=true 66 | lexicalWindowSize=5 
67 | documentQuotationFeature=true 68 | sentenceHasNe=true 69 | samplerMarginNegative=1 70 | constituentLeftmost=true 71 | flattenQuotes=true 72 | useBioeTags=true 73 | constituentAncestorL=true 74 | sentenceHasPronoun=true 75 | lexicalToken=false 76 | sentenceLengthBinning=false 77 | dependencyCueDependent=true 78 | constituentGoverning=true 79 | constituentBinning=false 80 | dependencyParentRel=true 81 | constituentFeatures=true 82 | lexicalBigram=false 83 | documentOffsetConjunction=true 84 | 85 | -------------------------------------------------------------------------------- /resources/PARC/listfeatures/attribution_nouns.txt: -------------------------------------------------------------------------------- 1 | accord 2 | bill 3 | counterclaim 4 | document 5 | formulation 6 | according 7 | call 8 | criticism 9 | doubt 10 | guess 11 | accusation 12 | challenge 13 | critic 14 | effort 15 | highlight 16 | acknowledgement 17 | charge 18 | cry 19 | elaboration 20 | hint 21 | ad 22 | chart 23 | data 24 | encouragement 25 | hope 26 | admission 27 | citation 28 | decision 29 | eruption 30 | idea 31 | advice 32 | claim 33 | declaration 34 | estimate 35 | illustration 36 | agreement 37 | command 38 | deduction 39 | eulogy 40 | implication 41 | allegation 42 | comment 43 | defence 44 | evidence 45 | imposition 46 | amendment 47 | commercial 48 | definition 49 | exclamation 50 | indication 51 | announcement 52 | complaint 53 | deliberation 54 | expectation 55 | information 56 | answer 57 | concern 58 | demand 59 | explanation 60 | insinuation 61 | anticipation 62 | concession 63 | denial 64 | expression 65 | inspiration 66 | argument 67 | conclusion 68 | depiction 69 | fear 70 | instruction 71 | article 72 | condition 73 | description 74 | feeling 75 | intention 76 | assertion 77 | confession 78 | dictate 79 | file 80 | interjection 81 | assumption 82 | confidence 83 | disappointment 84 | filing 85 | interpretation 86 | assurance 87 | confirmation 88 | disapproval 89 | find 
90 | issue 91 | belief 92 | consideration 93 | disclosure 94 | finding 95 | joke 96 | bet 97 | contention 98 | discovery 99 | figure 100 | knowledge 101 | book 102 | convinction 103 | dispute 104 | forecast 105 | lament 106 | laugh 107 | offer 108 | question 109 | response 110 | support 111 | law 112 | opinion 113 | quotation 114 | revelation 115 | supposition 116 | lawsuit 117 | order 118 | realization 119 | rule 120 | survey 121 | lecture 122 | pact 123 | reason 124 | rumor 125 | suspicion 126 | legislation 127 | paper 128 | recognition 129 | saying 130 | talk 131 | lesson 132 | permission 133 | recollection 134 | scream 135 | temptation 136 | letter 137 | plan 138 | recommendation 139 | shout 140 | testimony 141 | list 142 | pledge 143 | recount 144 | sigh 145 | theory 146 | menace 147 | point 148 | reflection 149 | sign 150 | thought 151 | mention 152 | policy 153 | reform 154 | signal 155 | threat 156 | message 157 | poll 158 | refusal 159 | snort 160 | understandment 161 | mind 162 | praise 163 | rejection 164 | specification 165 | urge 166 | moan 167 | prediction 168 | remark 169 | speculation 170 | view 171 | need 172 | press 173 | repetition 174 | spell 175 | voice 176 | news 177 | proclamation 178 | reply 179 | statement 180 | want 181 | note 182 | project 183 | report 184 | statistic 185 | warning 186 | notice 187 | promise 188 | reproach 189 | story 190 | wisdom 191 | notification 192 | proposal 193 | request 194 | strategy 195 | worry 196 | oath 197 | protest 198 | requirement 199 | study 200 | yell 201 | objection 202 | prove 203 | research 204 | suggestion 205 | observation 206 | provision 207 | resentment 208 | suit -------------------------------------------------------------------------------- /resources/PARC/listfeatures/krestel_verbs.txt: -------------------------------------------------------------------------------- 1 | according 2 | accuse 3 | acknowledge 4 | add 5 | admit 6 | agree 7 | allege 8 | announce 9 | argue 10 | assert 11 | believe 
12 | blame 13 | charge 14 | cite 15 | claim 16 | complain 17 | concede 18 | conclude 19 | confirm 20 | contend 21 | criticize 22 | declare 23 | decline 24 | deny 25 | describe 26 | disagree 27 | disclose 28 | estimate 29 | explain 30 | fear 31 | hope 32 | insist 33 | maintain 34 | mention 35 | note 36 | order 37 | predict 38 | promise 39 | recall 40 | recommend 41 | reply 42 | report 43 | say 44 | state 45 | stress 46 | suggest 47 | tell 48 | testify 49 | think 50 | urge 51 | warn 52 | worry 53 | write 54 | observe -------------------------------------------------------------------------------- /resources/PARC/listfeatures/titles.txt: -------------------------------------------------------------------------------- 1 | Mr 2 | Mrs 3 | Ms 4 | Mr. 5 | Mrs. 6 | Ms. 7 | Miss 8 | Mister 9 | Madam 10 | Hon. 11 | MP 12 | MYP 13 | Representative 14 | Senator 15 | Speaker 16 | President 17 | Councillor 18 | Alderman 19 | Selectman 20 | Delegate 21 | Mayor 22 | Lady 23 | Mayoress 24 | Lord 25 | Governor 26 | Lieutenant 27 | Prefect 28 | Prelate 29 | Premier 30 | Burgess 31 | Ambassador 32 | Envoy 33 | Secretary 34 | Cardinal 35 | Attaché 36 | Chargé 37 | Provost 38 | Prince 39 | Princess 40 | Archduke 41 | Archduchess 42 | Duke 43 | Duchess 44 | Marquis 45 | Marquess 46 | Marquise 47 | Marchioness 48 | Count 49 | Countess 50 | Earl 51 | Viscount 52 | Viscountess 53 | Baron 54 | Baroness 55 | Emperor 56 | Empress 57 | King 58 | Queen 59 | Tsar 60 | Tsarina 61 | Leader 62 | Pope 63 | Sir 64 | Dame 65 | Advocate 66 | Attorney 67 | Bailiff 68 | Barrister 69 | Chancellor 70 | Judge 71 | Justice 72 | Clerk 73 | Magistrate 74 | Promagistrate 75 | Mufti 76 | Grand Mufti 77 | Privy 78 | Counsellor 79 | Majesty 80 | Solicitor 81 | Abbess 82 | Abbot 83 | Brother 84 | Sister 85 | Mother 86 | Superior 87 | Friar 88 | Bishop 89 | Archbishop 90 | Metropolitan 91 | Presbyter 92 | Priest 93 | Priestess 94 | Father 95 | Fr. 
96 | Patriarch 97 | Pope 98 | Catholicos 99 | Vicar 100 | Chaplain 101 | Canon 102 | Pastor 103 | Prelate 104 | Primate 105 | Dom 106 | Cardinal 107 | Venerable 108 | Blessed 109 | Saint 110 | Christ 111 | Deacon 112 | Archdeacon 113 | Acolyte 114 | Dean 115 | Elder 116 | Minister 117 | Monsignor 118 | Reader 119 | Almoner 120 | Dr. 121 | Dr 122 | MD 123 | PhD 124 | EdD 125 | PharmD 126 | LLD 127 | JD 128 | Prof 129 | Prof. 130 | Professor 131 | Colonel 132 | General 133 | Commodore 134 | Corporal 135 | Mate 136 | Sergeant 137 | Admiral 138 | Brigadier 139 | Captain 140 | Commander 141 | General 142 | Officer 143 | Lieutenant 144 | Major 145 | Private 146 | Constable 147 | Agent 148 | Principal 149 | Nurse 150 | Nanny 151 | Coach 152 | Wizard 153 | Chief 154 | Scout 155 | Lama 156 | Dalai 157 | Panchen 158 | Druid 159 | Archdruid 160 | Rabbi 161 | Rebbe 162 | Hakham 163 | Buddha 164 | Ayatollah 165 | Imam 166 | Bodhisattva 167 | Mullah 168 | Kohen 169 | Nat 170 | Mahdi 171 | Rosh 172 | HaYeshiva 173 | Saoshyant 174 | Tirthankar 175 | Vardapet 176 | Mahatma 177 | Pandit 178 | Swami 179 | Ustad 180 | Sheikh 181 | Emir 182 | Emira 183 | Sultan 184 | Sultana 185 | Maharajah 186 | Maharani 187 | Eze 188 | Mwami 189 | Nizam 190 | Dato 191 | Oba 192 | Tor 193 | Tiv 194 | Obi 195 | Elder 196 | Vizier 197 | Grand -------------------------------------------------------------------------------- /src/main/java/ims/cs/bbn/BbnNeHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 
/**
 * SAX handler that turns the inline named entity markup of a BBN file into one
 * tag per whitespace-separated word.
 *
 * <p>Text is buffered between SAX events. Whenever an element whose name ends in
 * {@code EX} opens, the words buffered so far are labelled {@code "O"}; when it
 * closes, the words inside it are labelled with the element's {@code TYPE}
 * attribute. The finished tag list of every {@code DOC} is stored under the file
 * number parsed from its {@code DOCNO} element and can be fetched with
 * {@link #getTags(String)}.
 */
public class BbnNeHandler extends DefaultHandler {

    /** Label given to every word that is not inside an entity element. */
    private static final String OUTSIDE_TAG = "O";

    StringBuffer accumulator = new StringBuffer(); /* Accumulate parsed text */
    List<String> tags;
    Map<String, List<String>> tagMap = new HashMap<>();
    String currentTag;
    String fileNo;
    boolean tagPreceded = false;
    boolean disableNextTag = false;
    private String docNo;


    /**
     * Buffers character data until the next element boundary consumes it.
     */
    @Override
    public void characters(char[] buffer, int start, int length) {
        accumulator.append(buffer, start, length);
    }


    @Override
    public void startDocument() throws SAXException {
    }

    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Returns all text buffered since the last call and clears the buffer.
     * @return the buffered text, possibly empty
     */
    public String popText() {
        String text = accumulator.toString();
        accumulator.setLength(0);
        return text;
    }

    /**
     * Counts whitespace runs; consecutive whitespace characters count once.
     * @param s text to scan
     * @return number of whitespace runs in {@code s}
     */
    public int numSpaces(String s) {
        int runs = 0;
        boolean insideRun = false;

        for (int i = 0; i < s.length(); i++) {
            boolean isSpace = Character.isWhitespace(s.charAt(i));
            if (isSpace && !insideRun) {
                runs++;
            }
            insideRun = isSpace;
        }

        return runs;
    }

    /**
     * Counts words as whitespace runs plus one. The caller is expected to pass
     * trimmed text, otherwise leading/trailing whitespace inflates the count.
     * @param s text to count
     * @return 0 for the empty string, otherwise {@code numSpaces(s) + 1}
     */
    public int numWords(String s) {
        return s.equals("") ? 0 : numSpaces(s) + 1;
    }

    /**
     * True if {@code text} directly continues a word that ended inside the
     * preceding entity element, i.e. an element was closed before and the text
     * does not begin with whitespace.
     */
    private boolean continuesPrecedingTag(String text) {
        return tagPreceded && (text.isEmpty() || !Character.isWhitespace(text.charAt(0)));
    }

    /** Appends {@code count} outside tags; does nothing for counts below one. */
    private void padOutside(int count) {
        for (int i = 0; i < count; i++) {
            tags.add(OUTSIDE_TAG);
        }
    }

    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {

        if (qName.equals("DOC")) { /* document starts, reset accumulator */
            accumulator.setLength(0);
            tags = new ArrayList<>();
        } else if (qName.endsWith("EX")) { /* NE tag starts */
            String text = popText();
            String trimText = text.trim();

            // count words to align with the tokenized text
            int numWords = numWords(trimText);

            // adjust word counters in case of mid-word tags: a fragment glued to the
            // previous element, or to the element that is about to open, is not a
            // word of its own
            if (continuesPrecedingTag(text)) numWords--;
            if (text.isEmpty() || !Character.isWhitespace(text.charAt(text.length() - 1))) numWords--;

            // non-empty text that yields a negative count sits entirely between two
            // mid-word boundaries; the matching end tag is then skipped
            if (trimText.length() > 0 && numWords < 0) {
                disableNextTag = true;
            }

            // pad with outside tags
            padOutside(numWords);
            currentTag = atts.getValue("TYPE");
        } else if (qName.equals("DOCNO")) { /* new document, reset accumulator (to be sure) */
            accumulator.setLength(0);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
        if (qName.equals("DOC")) { /* document ends */
            String text = popText();
            int numWords = numWords(text.trim());

            // adjust word counters in case of mid-word tags
            if (continuesPrecedingTag(text)) numWords--;

            // pad with outside tags
            padOutside(numWords);

            // store annotation
            tagMap.put(fileNo, tags);
            tagPreceded = false;
        } else if (qName.endsWith("EX")) { /* NE tag ends */
            if (disableNextTag) {
                // the buffered text is deliberately left in place; it is counted
                // when the next element boundary pops it
                disableNextTag = false;
                return;
            }

            int numWords = numWords(popText().trim());
            for (int i = 0; i < numWords; i++) tags.add(currentTag);
            tagPreceded = true;
        } else if (qName.equals("DOCNO")) { /* document number ends, parse document number */
            docNo = popText();
            // drop the first five characters of the trimmed document number
            fileNo = docNo.trim().substring(5);
            tagPreceded = false;
        }
    }

    /**
     * Returns the NE annotations for a given file ID.
     * @param fileId file number as stored when the document's DOCNO was read
     * @return one tag per word, or {@code null} if no such document was parsed
     */
    public List<String> getTags(String fileId) {
        return tagMap.get(fileId);
    }


}
12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | package ims.cs.bbn; 18 | 19 | import ims.cs.lingdata.Document; 20 | import ims.cs.lingdata.DocumentId; 21 | import ims.cs.lingdata.Token; 22 | import ims.cs.util.StaticConfig; 23 | import org.xml.sax.InputSource; 24 | import org.xml.sax.SAXException; 25 | import org.xml.sax.XMLReader; 26 | 27 | import javax.xml.parsers.ParserConfigurationException; 28 | import javax.xml.parsers.SAXParser; 29 | import javax.xml.parsers.SAXParserFactory; 30 | import java.io.File; 31 | import java.io.IOException; 32 | import java.util.List; 33 | 34 | /** 35 | * XML parser for BBN named entity dataset 36 | */ 37 | public class BbnNeParser { 38 | 39 | 40 | private static BbnNeParser instance; 41 | private static SAXParser saxParser; 42 | private static XMLReader xmlReader; 43 | private static BbnNeHandler handler; 44 | 45 | public String currentBbnFile; 46 | 47 | 48 | private BbnNeParser() throws ParserConfigurationException, SAXException { 49 | SAXParserFactory spf = SAXParserFactory.newInstance(); 50 | saxParser = spf.newSAXParser(); 51 | xmlReader = saxParser.getXMLReader(); 52 | handler = new BbnNeHandler(); 53 | xmlReader.setContentHandler(handler); 54 | } 55 | 56 | /** 57 | * BBN splits each section in up to 4 files. This function determines which one contains the document in question. 
58 | * @param document 59 | * @return 60 | */ 61 | public String getBbnFileName(Document document) { 62 | DocumentId id = document.docId; 63 | String sectionStr = id.getSectionStr(); 64 | String fileStr = id.getFileStr(); 65 | int num = Integer.parseInt(fileStr); 66 | char partitionChar; 67 | 68 | // BBN partition rule 69 | if (num < 25) { 70 | partitionChar = 'a'; 71 | } else if (num < 50) { 72 | partitionChar = 'b'; 73 | } else if (num < 75) { 74 | partitionChar = 'c'; 75 | } else { 76 | partitionChar = 'd'; 77 | } 78 | 79 | String fileName = "wsj" + sectionStr + partitionChar + ".qa"; 80 | 81 | return fileName; 82 | } 83 | 84 | 85 | /** 86 | * Takes a previously loaded WSJ document and adds BBN named entities. 87 | * This function does some rudimentary caching, which requires the WSJ documents to be parsed in order to stay fast. 88 | * @param document 89 | * @return 90 | * @throws IOException 91 | * @throws SAXException 92 | */ 93 | public Document augmentDocumentXml(Document document) throws IOException, SAXException { 94 | String fileName = getBbnFileName(document); 95 | 96 | // move to the next BBN file if necessary 97 | // this will be efficient if the documents are passed in WSJ order as it avoids reloading the same file 98 | if (!fileName.equals(currentBbnFile)) { 99 | File xmlFile = new File(StaticConfig.bbnPath + fileName); 100 | xmlReader.parse(new InputSource(xmlFile.getPath())); 101 | currentBbnFile = fileName; 102 | } 103 | 104 | List tags = handler.getTags(document.docId.getFileStr()); 105 | List tokenList = document.tokenList; 106 | 107 | // sanity check: same number of tokens? 
108 | if (tags.size() != tokenList.size()) { 109 | throw new Error("Tag and token counts differ"); 110 | } 111 | 112 | // align tags and tokens 113 | for (int i = 0; i < tokenList.size(); i++) { 114 | Token token = tokenList.get(i); 115 | String neTag = tags.get(i); 116 | token.goldNer = neTag; 117 | } 118 | 119 | return document; 120 | } 121 | 122 | 123 | public static BbnNeParser getInstance() throws ParserConfigurationException, SAXException { 124 | if (instance == null) { 125 | instance = new BbnNeParser(); 126 | } 127 | return instance; 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/corenlp/DocumentAligner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.corenlp; 19 | 20 | import java.util.ArrayList; 21 | import java.util.Iterator; 22 | import java.util.List; 23 | 24 | import ims.cs.lingdata.Document; 25 | import ims.cs.lingdata.Sentence; 26 | import ims.cs.lingdata.Token; 27 | import edu.stanford.nlp.ling.CoreAnnotations; 28 | import edu.stanford.nlp.ling.CoreLabel; 29 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 30 | import edu.stanford.nlp.util.CoreMap; 31 | import ims.cs.util.StaticConfig; 32 | 33 | /** 34 | * Aligns CoreNLP parser output with the original document. This is necessary since CoreNLP may produce a 35 | * tokenization that deviates from the input. 36 | */ 37 | public class DocumentAligner { 38 | 39 | private Document pDocument; 40 | private List pcSentenceList; 41 | private boolean useCoreNlpQuoteCompletion = true; 42 | 43 | 44 | public DocumentAligner(Document pDocument, CoreMap cDocument) { 45 | this.pDocument = pDocument; 46 | alignSentences(pDocument, cDocument); 47 | } 48 | 49 | /** 50 | * Aligns original document and CoreNLP processed document. 
51 | * @param pDocument 52 | * @param cDocument 53 | */ 54 | private void alignSentences(Document pDocument, CoreMap cDocument) { 55 | 56 | // get sentences 57 | List cSentenceList = cDocument.get(SentencesAnnotation.class); 58 | 59 | // state variables 60 | pcSentenceList = new ArrayList<>(); 61 | Iterator cSentenceIter = cSentenceList.iterator(); 62 | Iterator pTokenIter = pDocument.tokenList.iterator(); 63 | Token nextPToken = pTokenIter.next(); 64 | 65 | // now iterate over CoreNLP sentences 66 | while (cSentenceIter.hasNext()) { 67 | // get sentence tokens 68 | CoreMap cSentence = cSentenceIter.next(); 69 | List cTokens = cSentence.get(CoreAnnotations.TokensAnnotation.class); 70 | List currentSentencePTokens = new ArrayList<>(cTokens.size()); 71 | 72 | 73 | // identify last token 74 | CoreLabel finalToken = cTokens.get(cTokens.size()-1); 75 | int endPosition = finalToken.endPosition(); 76 | 77 | // align tokens by byte count until the end of the sentence 78 | while (nextPToken.goldByteCount.getBegin() <= endPosition) { 79 | currentSentencePTokens.add(nextPToken); 80 | if (nextPToken.goldByteCount.getEnd() <= endPosition) { 81 | if (pTokenIter.hasNext()) { 82 | nextPToken = pTokenIter.next(); 83 | } else { 84 | break; 85 | } 86 | } else { 87 | break; 88 | } 89 | } 90 | 91 | 92 | // check if any tokens need to be aligned at all 93 | if (currentSentencePTokens.size() > 0) { 94 | TokenAligner ta = new TokenAligner(currentSentencePTokens, cSentence); 95 | ta.setUseCoreNlpQuoteCompletion(useCoreNlpQuoteCompletion); 96 | Sentence combinedSentence = ta.getCombinedSentence(); 97 | 98 | if (combinedSentence == null) { 99 | if (StaticConfig.verbose) 100 | System.out.println("Discarding empty combined sentence: " + 101 | cSentence.toString() + currentSentencePTokens.toString()); 102 | } else { 103 | pcSentenceList.add(combinedSentence); 104 | } 105 | } else { /* sentence may be empty if CoreNLP produced spurious tokens */ 106 | if (StaticConfig.verbose) 107 | 
System.out.println("Discarding empty PARC sentence: " + 108 | cSentence.toString() + currentSentencePTokens.toString()); 109 | } 110 | 111 | } 112 | 113 | } 114 | 115 | /** 116 | * Returns the aligned document 117 | * @return 118 | */ 119 | public Document getDocument() { 120 | Document combinedDocument = new Document(pDocument); 121 | 122 | combinedDocument.sentenceList = pcSentenceList; 123 | 124 | List documentTokenList = new ArrayList(pcSentenceList.size() * 5); 125 | 126 | for (Sentence sentence: pcSentenceList) { 127 | sentence.document = combinedDocument; 128 | documentTokenList.addAll(sentence.tokenList); 129 | } 130 | 131 | combinedDocument.tokenList = documentTokenList; 132 | 133 | // set token positions in the new document 134 | for (int i = 0; i < combinedDocument.tokenList.size(); i++) { 135 | combinedDocument.tokenList.get(i).predPosition = i; 136 | } 137 | 138 | return combinedDocument; 139 | 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/corenlp/IndexedWordIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.corenlp; 19 | 20 | import java.util.Iterator; 21 | 22 | import edu.stanford.nlp.ling.IndexedWord; 23 | 24 | /** 25 | * Iterates over all indexed words safely -- this is useful as punctuation may not have an associated indexed word 26 | */ 27 | public class IndexedWordIterator implements Iterator { 28 | 29 | Iterator iter; 30 | IndexedWord currentWord; 31 | int index = 1; 32 | 33 | private void fetch() { 34 | if (iter.hasNext()) { 35 | currentWord = iter.next(); 36 | } else { 37 | currentWord = null; 38 | } 39 | } 40 | 41 | public IndexedWordIterator(Iterator iter) { 42 | this.iter = iter; 43 | fetch(); 44 | } 45 | 46 | public boolean hasNext() { 47 | return true; 48 | } 49 | 50 | public IndexedWord next() { 51 | IndexedWord returnVal; 52 | 53 | if (currentWord == null) { 54 | returnVal = null; 55 | } else if (currentWord.index() == index) { 56 | returnVal = currentWord; 57 | fetch(); 58 | } else { 59 | returnVal = null; 60 | } 61 | 62 | index++; 63 | return returnVal; 64 | } 65 | 66 | public void remove() { 67 | throw new UnsupportedOperationException("no remove allowed"); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/ByteCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 
/**
 * Byte offset information: a begin and an end offset.
 */
public class ByteCount {
    public int begin;
    public int end;

    /**
     * @param begin begin offset
     * @param end end offset
     */
    public ByteCount (int begin, int end) {
        this.begin = begin;
        this.end = end;
    }

    /**
     * Parses a byte count from its textual form {@code "begin,end"}, the same form that
     * {@link #toString()} produces. Whitespace around the two numbers is ignored; fields after
     * the second one are ignored as before.
     *
     * @param value the textual byte count
     * @throws NullPointerException if {@code value} is {@code null}
     * @throws IllegalArgumentException if {@code value} has fewer than two comma-separated
     *         fields, or (as {@link NumberFormatException}) if a field is not an integer
     */
    public ByteCount(String value) {
        java.util.Objects.requireNonNull(value, "value");

        String[] tokens = value.split(",");
        if (tokens.length < 2) {
            // report malformed input explicitly instead of failing on the array access
            throw new IllegalArgumentException(
                    "Expected a byte count of the form \"begin,end\", got: \"" + value + "\"");
        }

        begin = Integer.parseInt(tokens[0].trim());
        end = Integer.parseInt(tokens[1].trim());
    }

    public int getBegin() {
        return begin;
    }


    public int getEnd() {
        return end;
    }

    /** @return the byte count in the form {@code "begin,end"} */
    @Override
    public String toString() {
        return "" + begin + "," + end;
    }

}
27 | * A corpus has training, dev, and test partitions as well as a document list 28 | */ 29 | public abstract class Corpus { 30 | 31 | List docList; 32 | private Map partitionMap; 33 | 34 | public abstract Partition getTrain(); 35 | public abstract Partition getDev(); 36 | public abstract Partition getTest(); 37 | 38 | public List getDocumentList() { 39 | return docList; 40 | } 41 | 42 | public void setDocumentList(List docList) { 43 | this.docList = docList; 44 | } 45 | public Map getPartitionMap() { 46 | return partitionMap; 47 | } 48 | public void setPartitionMap(Map partitionMap) { 49 | this.partitionMap = partitionMap; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/Document.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | import java.util.HashSet; 21 | import java.util.List; 22 | import java.util.Set; 23 | 24 | import ims.cs.lingdata.Types.Genre; 25 | import ims.cs.qsample.spans.Span; 26 | 27 | /** 28 | * Representation of a document. 29 | * Has a list of sentences and a list of tokens; holds span predictions. 
30 | */ 31 | public class Document { 32 | 33 | public List sentenceList; 34 | public List tokenList; 35 | public DocumentId docId; 36 | public Genre genre; 37 | public String text; 38 | public String sourceCorpusName; 39 | 40 | // span predictions 41 | public Set predictedSpanSet; 42 | public Set goldSpanSet; 43 | 44 | 45 | // CoreNLP flag to avoid multiple processing 46 | public boolean isCoreNlpProcessed; 47 | 48 | public Document(Document pDocument) { 49 | this.docId = pDocument.docId; 50 | this.genre = pDocument.genre; 51 | this.text = pDocument.text; 52 | this.sourceCorpusName = pDocument.sourceCorpusName; 53 | 54 | this.predictedSpanSet = new HashSet(); 55 | this.goldSpanSet = new HashSet(); 56 | } 57 | 58 | 59 | public Document() { } 60 | 61 | 62 | public List getTokenList() { 63 | return tokenList; 64 | } 65 | 66 | public Set goldSpansOfLabel(String label) { 67 | Set selectedGoldSpans = new HashSet<>(); 68 | for (Span gs : goldSpanSet) { 69 | if (gs.label.equals(label)) { 70 | selectedGoldSpans.add(gs); 71 | } 72 | } 73 | return selectedGoldSpans; 74 | } 75 | 76 | public Set predictedSpansOfLabel(String label) { 77 | Set predGoldSpans = new HashSet<>(); 78 | for (Span ps : predictedSpanSet) { 79 | if (ps.label.equals(label)) { 80 | predGoldSpans.add(ps); 81 | } 82 | } 83 | return predGoldSpans; 84 | } 85 | 86 | public Token getPrevToken(Token t) { 87 | return getPrevToken(t, 1); 88 | } 89 | 90 | public Token getNextToken(Token t) { 91 | return getNextToken(t, 1); 92 | } 93 | 94 | public Token getPrevToken(Token t, int dist) { 95 | if (t.predPosition - dist >= 0) return tokenList.get(t.predPosition-dist); 96 | else return null; 97 | } 98 | 99 | public Token getNextToken(Token t, int dist) { 100 | if (t.predPosition < tokenList.size()-dist) return tokenList.get(t.predPosition+dist); 101 | else return null; 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- 
/**
 * Interface to represent a document ID.
 * In our world, all documents have WSJ behavior, so they are part of a section and have a file number.
 * Both parts are exposed in their string form.
 */
public interface DocumentId {
    /** @return the section part of the ID as a string */
    String getSectionStr();
    /** @return the file part of the ID as a string */
    String getFileStr();
}
/**
 * Placeholder class for Gorn addresses, which turned out not to be needed.
 * It has no members and remains only for compatibility.
 */
public class GornAddressList {
}
15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | import java.util.List; 21 | import java.util.Map; 22 | 23 | /** 24 | * A partition is a list of documents which may belong to different sections 25 | */ 26 | public class Partition { 27 | 28 | public List docList; 29 | public Map> sectionMap; 30 | 31 | 32 | public Partition() {} 33 | 34 | public Partition(List docList) { 35 | this.docList = docList; 36 | } 37 | 38 | 39 | public List getDocumentList() { 40 | return docList; 41 | } 42 | 43 | public int size() { 44 | return docList.size(); 45 | } 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/PlainTextCorpus.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | import java.util.List; 21 | 22 | /** 23 | * A corpus to hold documents read from plain text files. 24 | * Has only one partition and consists only of test data. 
25 | */ 26 | public class PlainTextCorpus extends Corpus { 27 | 28 | Partition partition; 29 | 30 | public PlainTextCorpus(List documentList) { 31 | setDocumentList(documentList); 32 | partition = new Partition(); 33 | partition.docList = documentList; 34 | } 35 | 36 | @Override 37 | public Partition getTrain() { 38 | return null; 39 | } 40 | 41 | @Override 42 | public Partition getDev() { 43 | return null; 44 | } 45 | 46 | @Override 47 | public Partition getTest() { 48 | return partition; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/PlainTextDocId.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | 21 | /** 22 | * A document ID for plain text documents. 23 | * Since we require a WSJ-like directory structure, we can generate IDs from that. 
24 | */ 25 | public class PlainTextDocId implements DocumentId { 26 | 27 | String sectionStr; 28 | String fileStr; 29 | 30 | public PlainTextDocId (String section, String file) { 31 | sectionStr = section; 32 | fileStr = file; 33 | } 34 | 35 | @Override 36 | public String getSectionStr() { 37 | return sectionStr; 38 | } 39 | 40 | @Override 41 | public String getFileStr() { 42 | return fileStr; 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return sectionStr + "," + fileStr; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/Sentence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | import java.util.HashMap; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import edu.stanford.nlp.ling.IndexedWord; 25 | import edu.stanford.nlp.semgraph.SemanticGraph; 26 | import edu.stanford.nlp.trees.Tree; 27 | import org.jgrapht.alg.FloydWarshallShortestPaths; 28 | 29 | /** 30 | * Representation of a sentence. 31 | * Is part of a document; contains a list of tokens; may have a constituency and a dependency tree. 
32 | */ 33 | public class Sentence { 34 | 35 | public List tokenList; 36 | public GornAddressList gorn; 37 | public SentenceId sentenceId; 38 | public int positionInDocument; 39 | public Document document; 40 | 41 | // CoreLabel backwards lookup 42 | public Map indexedWordLookup; 43 | public HashMap treeLookup; 44 | 45 | // CoreNLP output 46 | public Tree tree; 47 | public SemanticGraph dependencyGraph; 48 | public FloydWarshallShortestPaths fw; 49 | 50 | 51 | public Sentence () {} 52 | public Sentence (Document d) { 53 | document = d; 54 | } 55 | 56 | public List getTokenList() { 57 | return tokenList; 58 | } 59 | 60 | public Token first() { return tokenList.get(0); } 61 | public Token last() { return tokenList.get(tokenList.size()-1); } 62 | 63 | @Override 64 | public String toString() { 65 | return tokenList.toString(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/SentenceId.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.lingdata; 19 | 20 | /** 21 | * Holds an ID for a sentence. 22 | * Can be calculated from the document's ID together with the Gorn address of the sentence. 
23 | */ 24 | public class SentenceId { 25 | 26 | private WSJId wsjId; 27 | private GornAddressList gorn; 28 | 29 | public SentenceId(WSJId wsdId, GornAddressList gorn) { 30 | this.gorn = gorn; 31 | this.wsjId = wsdId; 32 | } 33 | 34 | public WSJId getWsjId () { 35 | return wsjId; 36 | } 37 | 38 | public GornAddressList getGorn() { 39 | return gorn; 40 | } 41 | 42 | @Override 43 | public String toString() { 44 | return "" + wsjId + ":" + gorn; 45 | } 46 | 47 | @Override 48 | public boolean equals(Object obj) { 49 | if (obj instanceof SentenceId) { 50 | SentenceId objId = (SentenceId) obj; 51 | return this.wsjId.equals(objId.wsjId) && this.gorn.equals(objId.gorn); 52 | //FIXME: maybe the gorn thing doesn't work 53 | } else { 54 | return false; 55 | } 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/lingdata/Types.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
/**
 * Container for the enumeration types shared across the linguistic data classes.
 */
public abstract class Types {

    /** Names of the corpus partitions. */
    public enum PartitionName {
        TRAIN,
        DEV,
        TEST
    }

    /** Genres a document can have. */
    public enum Genre {
        FICTION,
        NEWS,
        BIOGRAPHY
    }
}
25 | */ 26 | public class WSJId implements Serializable, DocumentId { 27 | 28 | private static final long serialVersionUID = 4443044961863001270L; 29 | 30 | private Integer section; 31 | private Integer file; 32 | 33 | 34 | public WSJId (Integer section) { 35 | this(section, null); 36 | } 37 | 38 | public WSJId (String section, String file) { 39 | this(Integer.parseInt(section), Integer.parseInt(file)); 40 | } 41 | 42 | public WSJId (String section) { 43 | this(Integer.parseInt(section)); 44 | } 45 | 46 | public WSJId (Integer section, Integer file) { 47 | this.section = section; 48 | this.file = file; 49 | } 50 | 51 | public int getSectionInt() { 52 | return section; 53 | } 54 | 55 | public int getFileInt() { 56 | return file; 57 | } 58 | 59 | private static String addOffset(int i) { 60 | if (i < 10) { 61 | return "0" + i; 62 | } else { 63 | return "" + i; 64 | } 65 | } 66 | 67 | public String getSectionStr() { 68 | return addOffset(section); 69 | } 70 | 71 | public String getFileStr() { 72 | return addOffset(file); 73 | } 74 | 75 | @Override 76 | public boolean equals(Object other) { 77 | if (other instanceof WSJId) { 78 | WSJId otherId = (WSJId) other; 79 | return (this.section == otherId.section) && (this.file == otherId.file); 80 | } else { 81 | return false; 82 | } 83 | } 84 | 85 | public boolean sectionEquals(WSJId other) { 86 | return this.section == other.section; 87 | } 88 | 89 | @Override 90 | public String toString() { 91 | return getSectionStr() + getFileStr(); 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/mallet/DocumentFeatureSet2TokenSequence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.mallet; 19 | 20 | import java.util.List; 21 | 22 | import cc.mallet.pipe.Pipe; 23 | import cc.mallet.types.Instance; 24 | import cc.mallet.types.Token; 25 | import cc.mallet.types.TokenSequence; 26 | import ims.cs.qsample.features.FeatureSet; 27 | 28 | /** 29 | * Pipe to convert our internal feature set to mallet token feature entries 30 | * @author scheibcn 31 | */ 32 | public class DocumentFeatureSet2TokenSequence extends Pipe { 33 | 34 | private static final long serialVersionUID = 3218174517742238232L; 35 | 36 | @Override 37 | public Instance pipe(Instance inst) { 38 | 39 | // ensure that the instance is of the right type 40 | if (!(inst instanceof PARCDocumentInstance)) { 41 | throw new UnsupportedOperationException("Expected CoreMap, got " + inst.getClass()); 42 | } 43 | 44 | 45 | List tokenList = ((PARCDocumentInstance) inst).document.getTokenList(); 46 | TokenSequence ts = new TokenSequence(); 47 | 48 | // iterate over tokens and convert their internal feature sets into Mallet feature sets 49 | for (ims.cs.lingdata.Token cToken : tokenList) { 50 | FeatureSet fs = cToken.boundaryFeatureSet; 51 | Token mToken = new Token(cToken.predText); 52 | 53 | // copy each feature 54 | for (Object entry : fs) { 55 | mToken.setFeatureValue(entry.toString(), 1); 56 | } 57 | 58 | ts.add(mToken); 59 | } 60 | 
61 | inst.setData(ts); 62 | 63 | return inst; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/mallet/PARCDocumentInstance.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.mallet; 19 | 20 | 21 | import cc.mallet.types.Instance; 22 | import ims.cs.lingdata.Document; 23 | 24 | /** 25 | * Mallet "Instance" wrapper class for documents 26 | */ 27 | public class PARCDocumentInstance extends Instance { 28 | 29 | private static final long serialVersionUID = -6933321582801583924L; 30 | 31 | public transient Document document; 32 | 33 | private PARCDocumentInstance() { 34 | super(null, null, null, null); 35 | }; 36 | 37 | public PARCDocumentInstance(Document document) { 38 | super(document, null, document.docId, document); 39 | this.document = document; 40 | } 41 | 42 | 43 | public Document getDocument() { 44 | return document; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/parc/PARCAttribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
/**
 * A single attribution, consisting of a role and an ID.
 */
public class PARCAttribution {

    /** Attribution roles as annotated in the PARC corpus. */
    public enum Role {
        SOURCE,
        CONTENT,
        CUE,
        SUPPLEMENT
    }

    /** Attribution types by Pareti et al. */
    public enum Type {
        DIRECT,
        INDIRECT,
        MIXED
    }

    public Role role;
    public String id;
}
15 | */ 16 | 17 | 18 | package ims.cs.parc; 19 | 20 | import edu.stanford.nlp.ling.IndexedWord; 21 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 22 | import edu.stanford.nlp.trees.GrammaticalRelation; 23 | import ims.cs.corenlp.Helper; 24 | import ims.cs.lingdata.ByteCount; 25 | import ims.cs.lingdata.Document; 26 | import ims.cs.lingdata.Sentence; 27 | import ims.cs.lingdata.Token; 28 | import edu.stanford.nlp.trees.Tree; 29 | import org.jgrapht.alg.FloydWarshallShortestPaths; 30 | import org.jgrapht.graph.SimpleDirectedGraph; 31 | 32 | import java.util.Iterator; 33 | import java.util.List; 34 | 35 | /** 36 | * Collection of Utility functions 37 | */ 38 | public abstract class ParcUtils { 39 | 40 | /** 41 | * Find all head verbs in the corpus. The algorithm is taken from Pareti (2015). 42 | * @param sentence 43 | */ 44 | public static void markHeadVerbs (Sentence sentence) { 45 | 46 | for (Tree tree : sentence.tree.preOrderNodeList()) { 47 | if (tree.label().value().equals("VP")) { 48 | boolean valid = true; 49 | for (Tree child : tree.children()) { 50 | if (child.label().value().equals("VP")) { 51 | valid = false; 52 | break; 53 | } 54 | } 55 | 56 | if (valid) { 57 | for (Tree child : tree.children()) { 58 | if (child.firstChild().isLeaf() && child.label().value().startsWith("V")) { 59 | Token token = sentence.treeLookup.get(child.firstChild()); 60 | if (token != null) 61 | token.isHeadVerb = true; 62 | } 63 | } 64 | } 65 | } 66 | } 67 | } 68 | 69 | /** 70 | * Annotates paragraph-continuing quotation marks. doParagraphAnnotation() needs to be called before this. 
71 | * @param document 72 | */ 73 | public static void markParagraphQuotes(Document document) { 74 | int quoteIndex = 1; 75 | 76 | for (Token token: document.tokenList) { 77 | if (Helper.isQuote(token)) { 78 | // ignore even quotes at paragraph begins 79 | if (token.paragraphBegins && quoteIndex % 2 == 0) 80 | token.ignoreQuote = true; 81 | else 82 | quoteIndex++; 83 | } 84 | } 85 | } 86 | 87 | /** 88 | * Annotates for each token whether it starts a paragraph by its raw text 89 | * @param document 90 | */ 91 | public static void doParagraphAnnotation (Document document) { 92 | String documentText = document.text; 93 | Iterator tokenIter = document.tokenList.iterator(); 94 | 95 | if (!tokenIter.hasNext()) { 96 | System.err.println("Skipping paragraph annotation empty document: " + document.docId); 97 | return; 98 | } 99 | 100 | Token token = tokenIter.next(); 101 | ByteCount bc = token.goldByteCount; 102 | 103 | // iterate over all character positions in the text 104 | char prevC = 0; 105 | 106 | for (int i = 0; i < documentText.length(); i++) { 107 | if (i > bc.getEnd()) { 108 | if (!tokenIter.hasNext()) break; /* reached the last token */ 109 | 110 | token = tokenIter.next(); 111 | bc = token.goldByteCount; 112 | } 113 | 114 | char c = documentText.charAt(i); 115 | 116 | // two consecutive newlines indicate a paragraph 117 | if (prevC == '\n' && c == '\n') { 118 | token.paragraphBegins = true; 119 | } 120 | 121 | prevC = c; 122 | } 123 | } 124 | 125 | /** 126 | * Anonymizes certain named entities in the text 127 | * @param document 128 | */ 129 | public static void anonymizeNamedEntities (Document document) { 130 | for (Token token: document.getTokenList()) { 131 | if (token.predNer.startsWith("ORGANIZATION") || token.predNer.startsWith("PERSON")) { 132 | String substText = "[NE]"; 133 | token.originalPredText = token.predText; 134 | 135 | token.predLemma = substText; 136 | token.predText = substText; 137 | token.goldLemma = substText; 138 | token.goldText = 
substText; 139 | } 140 | } 141 | } 142 | 143 | /** 144 | * CoreNLP tries to predict opening and closing quotation marks. 145 | * This method maps the variation back to one symbol. 146 | * @param document 147 | */ 148 | public static void sanitizeQuotationMarks (Document document) { 149 | for (Token token : document.getTokenList()) { 150 | // double quotes 151 | if (token.predLemma.equals("``") || token.predLemma.equals("\"") || token.predLemma.equals("''")) { 152 | token.predLemma = "\""; 153 | token.predPosTag = "\""; 154 | token.predText = "\""; 155 | token.goldPosTag = "\""; 156 | token.goldLemma = "\""; 157 | token.goldText = "\""; 158 | } 159 | 160 | // single quotes 161 | if (token.predLemma.equals("`") || token.predLemma.equals("''")) { 162 | token.predLemma = "'"; 163 | token.predPosTag = "'"; 164 | token.predText = "'"; 165 | token.goldLemma = "'"; 166 | token.goldPosTag = "'"; 167 | token.goldText = "'"; 168 | } 169 | 170 | } 171 | } 172 | 173 | /** 174 | * The FW implementation needs distinct objects as edges, which this class accomplishes. 175 | * CoreNLP seems to optimize storage by caching strings, so different edges have identical label strings. 
176 | */ 177 | public static class IndexedEdge { 178 | public GrammaticalRelation rel; 179 | public int index; 180 | 181 | public IndexedEdge(GrammaticalRelation rel, int index) { 182 | this.rel = rel; 183 | this.index = index; 184 | } 185 | } 186 | 187 | /** 188 | * Compute cached dependency paths using Floyd Warshall 189 | * @param dependencies 190 | * @return 191 | */ 192 | public static FloydWarshallShortestPaths computeFloydWarshallSGE(List dependencies) { 193 | SimpleDirectedGraph graph = new SimpleDirectedGraph(IndexedEdge.class); 194 | int edgeId = 0; 195 | for (SemanticGraphEdge dep : dependencies) { 196 | graph.addVertex(dep.getGovernor()); 197 | graph.addVertex(dep.getDependent()); 198 | graph.addEdge(dep.getGovernor(), dep.getDependent(), new IndexedEdge(dep.getRelation(), edgeId)); 199 | } 200 | return new FloydWarshallShortestPaths(graph); 201 | } 202 | 203 | } 204 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/parc/xml/PARCParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.parc.xml; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | 23 | import javax.xml.parsers.ParserConfigurationException; 24 | import javax.xml.parsers.SAXParser; 25 | import javax.xml.parsers.SAXParserFactory; 26 | 27 | import org.xml.sax.InputSource; 28 | import org.xml.sax.SAXException; 29 | import org.xml.sax.XMLReader; 30 | 31 | import ims.cs.lingdata.Document; 32 | 33 | /** 34 | * XML parser for the PARC corpus 35 | */ 36 | public class PARCParser { 37 | 38 | private static PARCParser instance; 39 | private static SAXParser saxParser; 40 | private static XMLReader xmlReader; 41 | private static PARCHandler handler; 42 | 43 | private PARCParser () throws ParserConfigurationException, SAXException { 44 | SAXParserFactory spf = SAXParserFactory.newInstance(); 45 | saxParser = spf.newSAXParser(); 46 | xmlReader = saxParser.getXMLReader(); 47 | handler = new PARCHandler(); 48 | xmlReader.setContentHandler(handler); 49 | } 50 | 51 | public Document parseFile(File xmlFile) throws IOException, SAXException { 52 | xmlReader.parse(new InputSource(xmlFile.getPath())); 53 | return handler.getDocument(); 54 | 55 | } 56 | 57 | public static PARCParser getInstance() throws ParserConfigurationException, SAXException { 58 | if (instance == null) { 59 | instance = new PARCParser(); 60 | } 61 | return instance; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/evaluate/EvaluateClassifier.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 
7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.evaluate; 19 | 20 | import ims.cs.lingdata.Document; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Evaluation functions for single-token classifiers 26 | * Created by scheibcn on 3/2/16. 27 | */ 28 | public class EvaluateClassifier { 29 | 30 | /** 31 | * Container class for quotation classifier results, i.e., begin, end, and cue F1 32 | */ 33 | public static class ClassifierResults { 34 | F1.Stats beginStats; 35 | F1.Stats endStats; 36 | F1.Stats cueStats; 37 | 38 | public String toString() { 39 | return String.format("Pb=%1.3f Rb=%1.3f Fb=%1.3f Pe=%1.3f Re=%1.3f Fe=%1.3f Pc=%1.3f Rc=%1.3f Fc=%1.3f", 40 | beginStats.precision, beginStats.recall, beginStats.f1, 41 | endStats.precision, endStats.recall, endStats.f1, 42 | cueStats.precision, cueStats.recall, cueStats.f1); 43 | 44 | } 45 | } 46 | 47 | /** 48 | * Evaluate begin, end, and cue classifier output over all tokens in the specified documents 49 | * @param trainDocs 50 | * @return 51 | */ 52 | public static ClassifierResults evaluateClassifier (List trainDocs) { 53 | if (trainDocs == null) return null; 54 | ClassifierResults results = new ClassifierResults(); 55 | 56 | results.beginStats = F1.evalPerceptron(trainDocs, "begin"); 57 | results.endStats = F1.evalPerceptron(trainDocs, "end"); 58 | results.cueStats = F1.evalPerceptron(trainDocs, "cue"); 59 | 60 | return results; 61 | } 62 | 63 | 64 | /** 65 | * Print begin, end, and cue classifier evaluations over all tokens in the specified training, test, val, and 66 | * resubstitution documents 67 | * @param trainDocs 68 | * @param 
testDocs 69 | * @param valDocs 70 | * @param resDocs 71 | * @param prefix 72 | */ 73 | public static void evaluateAndPrint(List trainDocs, List testDocs, List valDocs, List resDocs, String prefix) { 74 | ClassifierResults trainResults = evaluateClassifier(trainDocs); 75 | ClassifierResults testResults = evaluateClassifier(testDocs); 76 | ClassifierResults valResults = evaluateClassifier(valDocs); 77 | ClassifierResults resResults = evaluateClassifier(resDocs); 78 | 79 | if (trainResults != null) System.out.println(prefix + " TRAIN " + trainResults.toString()); 80 | if (testResults != null) System.out.println(prefix + " TEST " + testResults.toString()); 81 | if (valResults != null) System.out.println(prefix + " VAL " + valResults.toString()); 82 | if (resResults != null) System.out.println(prefix + " RES " + resResults.toString()); 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/evaluate/EvaluateSpan.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.evaluate; 19 | 20 | import ims.cs.lingdata.Document; 21 | import ims.cs.parc.PARCAttribution; 22 | 23 | import java.util.List; 24 | 25 | /** 26 | * Evaluation functions for span prediction models. 27 | * Created by scheibcn on 3/2/16. 28 | */ 29 | public class EvaluateSpan { 30 | 31 | /** 32 | * Container class for all necessary F1 statistics to do Pareti-style quotation evaluation 33 | */ 34 | public static class SpanResults { 35 | public F1.Stats strictCue; 36 | public F1.Stats strictContent; 37 | public F1.Stats partialContent; 38 | public F1.Stats strictContentDirect; 39 | public F1.Stats partialContentDirect; 40 | public F1.Stats strictContentIndirect; 41 | public F1.Stats partialContentIndirect; 42 | public F1.Stats strictContentMixed; 43 | public F1.Stats partialContentMixed; 44 | 45 | public String toString(String sep) { 46 | return strictContent.toString() + sep 47 | + strictContentDirect + sep 48 | + strictContentIndirect + sep 49 | + strictContentMixed + sep 50 | + strictCue + sep + sep 51 | + partialContent + sep 52 | + partialContentDirect + sep 53 | + partialContentIndirect + sep 54 | + partialContentMixed; 55 | } 56 | } 57 | 58 | /** 59 | * SpanResults for training, test, validation, and resubstitution data 60 | */ 61 | public static class ResultSet { 62 | public SpanResults trainResults; 63 | public SpanResults testResults; 64 | public SpanResults valResults; 65 | public SpanResults resResults; 66 | } 67 | 68 | 69 | /** 70 | * Evaluate cue and content span models 71 | * @param documentList 72 | * @return 73 | */ 74 | public static SpanResults cueContentEvaluation (List documentList) { 75 | SpanResults evaluation = new SpanResults(); 76 | evaluation.strictCue = F1.evalSpans(documentList, "cue", false, null); 77 | evaluation.strictContent = F1.evalSpans(documentList, "content", false, null); 78 | evaluation.partialContent = F1.evalSpans(documentList, "content", true, null); 79 | 
evaluation.strictContentDirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.DIRECT); 80 | evaluation.partialContentDirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.DIRECT); 81 | evaluation.strictContentIndirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.INDIRECT); 82 | evaluation.partialContentIndirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.INDIRECT); 83 | evaluation.strictContentMixed = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.MIXED); 84 | evaluation.partialContentMixed = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.MIXED); 85 | 86 | return evaluation; 87 | } 88 | 89 | /** 90 | * Returns a string where the input s is repeated n times 91 | * @param s 92 | * @param n 93 | * @return 94 | */ 95 | private static String generateN(String s, int n) { 96 | StringBuilder sb = new StringBuilder(); 97 | for (int i = 0; i < n; i++) { 98 | sb.append(s); 99 | } 100 | 101 | return sb.toString(); 102 | } 103 | 104 | private static void printHeader(String sep, int offset) { 105 | System.out.println(generateN("-", offset) + "--------------------------------------------------------------------------------------------------------------------------------------------------------"); 106 | System.out.println(generateN(" ", offset) + " exact "+sep+""+sep+" partial"); 107 | System.out.println(generateN(" ", offset) + " ALL "+sep+" DIRECT "+sep+" INDIRECT "+sep+" MIXED "+sep+" cue "+sep+""+sep+" ALL "+sep+" DIRECT "+sep+" INDIRECT "+sep+" MIXED "); 108 | System.out.println(generateN(" ", offset) + " P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+""+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F "); 109 | 110 | } 111 | 112 | private static void printFooter(int offset) { 113 | System.out.println(generateN("-", offset) + 
"--------------------------------------------------------------------------------------------------------------------------------------------------------"); 114 | } 115 | 116 | 117 | private static void printResults(String prefix, String sep, SpanResults trainingEval, SpanResults testEval, SpanResults valEval, SpanResults resEval) { 118 | printHeader(sep, prefix.length() + 1); 119 | 120 | if (trainingEval != null) System.out.println(prefix + " TRAIN " + trainingEval.toString(sep)); 121 | if (testEval != null) System.out.println(prefix + " TEST " + testEval.toString(sep)); 122 | if (valEval != null) System.out.println(prefix + " VAL " + valEval.toString(sep)); 123 | if (resEval != null) System.out.println(prefix + " RES " + resEval.toString(sep)); 124 | 125 | printFooter(prefix.length() + 1); 126 | } 127 | 128 | public static ResultSet evaluateAndPrint(String prefix, String sep, List trainingDocuments, List testDocuments, List valDocuments, List resDocuments) { 129 | ResultSet resultSet = new ResultSet(); 130 | 131 | if (trainingDocuments != null) resultSet.trainResults = cueContentEvaluation(trainingDocuments); 132 | if (testDocuments != null) resultSet.testResults = cueContentEvaluation(testDocuments); 133 | if (valDocuments != null) resultSet.valResults = cueContentEvaluation(valDocuments); 134 | if (resDocuments != null) resultSet.resResults = cueContentEvaluation(resDocuments); 135 | 136 | printResults(prefix, sep, resultSet.trainResults, resultSet.testResults, 137 | resultSet.valResults, resultSet.resResults); 138 | 139 | return resultSet; 140 | } 141 | 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/Binning.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
/**
 * Binning for distances
 * Created by scheibcn on 3/4/16.
 */
public class Binning {

    /** Bin boundaries shared by the stack-up and stack-down features. */
    private static final int[] STACK_BOUNDS =
            {1, 2, 3, 4, 5, 6, 7, 8, 11, 16, 21, 26, 31, 41, 51, 61, 71, 81, 91, 101};

    /**
     * Bins that stack up: one feature {@code prefix + ">=b"} for every boundary b
     * (1..8, 11, 16, ..., 101) that the distance reaches.
     *
     * @param distance distance to bin
     * @param prefix   text put in front of every feature name
     * @return the features in ascending boundary order; empty for distances below 1
     */
    public static List<String> distanceBinsStackUp (int distance, String prefix) {
        List<String> features = new ArrayList<>();

        for (int bound : STACK_BOUNDS) {
            if (distance >= bound) features.add(prefix + ">=" + bound);
        }

        return features;
    }

    /**
     * Bins that stack down: one feature {@code prefix + "<=b"} for every boundary b
     * (1..8, 11, 16, ..., 101) that the distance does not exceed.
     *
     * @param distance distance to bin
     * @param prefix   text put in front of every feature name
     * @return the features in ascending boundary order; empty for distances above 101
     */
    public static List<String> distanceBinsStackDown (int distance, String prefix) {
        List<String> features = new ArrayList<>();

        for (int bound : STACK_BOUNDS) {
            if (distance <= bound) features.add(prefix + "<=" + bound);
        }

        return features;
    }

    /**
     * Interval bins covering distances 1 to 100. At most one feature is produced.
     *
     * @param distance distance to bin
     * @param prefix   text put in front of the feature name
     * @return a list holding the matching interval feature, or an empty list for
     *         distances below 1 or above 100
     */
    public static List<String> distanceBins1to100(int distance, String prefix) {
        List<String> features = new ArrayList<>();

        // NOTE(review): distance 0 gets no feature although the first label reads "[0,5)",
        // and the last label reads "[60,100]" for the range 80..100. Both names are kept
        // as they are because they are emitted feature identifiers.
        if (distance > 0 && distance < 5) features.add(prefix + "_in_[0,5)");
        else if (distance >= 5 && distance < 10) features.add(prefix + "_in_[5,10)");
        else if (distance >= 10 && distance < 20) features.add(prefix + "_in_[10,20)");
        else if (distance >= 20 && distance < 40) features.add(prefix + "_in_[20,40)");
        else if (distance >= 40 && distance < 60) features.add(prefix + "_in_[40,60)");
        else if (distance >= 60 && distance < 80) features.add(prefix + "_in_[60,80)");
        else if (distance >= 80 && distance <= 100) features.add(prefix + "_in_[60,100]");

        return features;
    }


    /**
     * All bins: interval features first, then stack-down, then stack-up.
     *
     * @param distance distance to bin
     * @param prefix   text put in front of every feature name
     * @return the concatenated feature list
     */
    public static List<String> distanceBinsAll (int distance, String prefix) {
        List<String> features = new ArrayList<>();
        features.addAll(distanceBins1to100(distance, prefix));
        features.addAll(distanceBinsStackDown(distance,prefix));
        features.addAll(distanceBinsStackUp(distance,prefix));
        return features;
    }
}
15 | */ 16 | 17 | 18 | package ims.cs.qsample.features; 19 | 20 | import java.io.IOException; 21 | 22 | 23 | import ims.cs.qsample.features.components.SentenceConstituentFeatures; 24 | import ims.cs.qsample.features.components.SentenceDependencyFeatures; 25 | import ims.cs.qsample.features.components.SentenceFeaturesDerivedFromListCue; 26 | import ims.cs.qsample.features.components.SentenceIndicatorFeatures; 27 | import ims.cs.qsample.features.components.TokenDictFeatures; 28 | import ims.cs.qsample.features.components.TokenLexicalFeatures; 29 | import ims.cs.qsample.features.components.TokenListFeatures; 30 | import ims.cs.lingdata.Document; 31 | import ims.cs.lingdata.Sentence; 32 | import ims.cs.lingdata.Token; 33 | import ims.cs.qsample.features.components.DocumentOffsetConjunction; 34 | import ims.cs.qsample.features.components.DocumentQuotationFeature; 35 | import ims.cs.util.StaticConfig; 36 | 37 | /** 38 | * Feature extractor class for (mostly) those features that require non-static code. 
39 | */ 40 | public class FeatureExtraction { 41 | 42 | private TokenListFeatures tokenPersonFeatures; 43 | private TokenListFeatures tokenOrganizationFeatures; 44 | private TokenListFeatures tokenTitleFeatures; 45 | private TokenListFeatures tokenListFeatures; 46 | private TokenListFeatures tokenNounListFeatures; 47 | private TokenDictFeatures verbNetFeatures; 48 | private DocumentOffsetConjunction documentOffsetConjunction; 49 | 50 | 51 | public FeatureExtraction () throws IOException, ClassNotFoundException { 52 | // non-static extractors 53 | tokenPersonFeatures = new TokenListFeatures("resources/PARC/listfeatures/person.hyponyms.txt", "EK:PER"); 54 | tokenOrganizationFeatures = new TokenListFeatures("resources/PARC/listfeatures/organization.hyponyms.txt", "EK:ORG"); 55 | tokenTitleFeatures = new TokenListFeatures("resources/PARC/listfeatures/titles.txt", "EK:TITLE"); 56 | tokenListFeatures = new TokenListFeatures("resources/PARC/listfeatures/krestel_verbs.txt", "CUELIST"); 57 | tokenNounListFeatures = new TokenListFeatures("resources/PARC/listfeatures/attribution_nouns.txt", "NOUNCUELIST"); 58 | verbNetFeatures = new TokenDictFeatures("resources/PARC/listfeatures/verbnet.txt", "VERBNET"); 59 | 60 | // restrict extractors to certain pos tags 61 | tokenNounListFeatures.posStart = "N"; 62 | tokenListFeatures.posStart = "V"; 63 | verbNetFeatures.posStart = "V"; 64 | 65 | // Offset conjunction on non-static features 66 | documentOffsetConjunction = new DocumentOffsetConjunction(); 67 | } 68 | 69 | 70 | /** 71 | * Runs token-level feature extraction on the tokens in the document 72 | * @param document 73 | */ 74 | public void extractTokenFeatures(Document document) { 75 | for (Token token : document.tokenList) { 76 | tokenPersonFeatures.extract(token); 77 | tokenOrganizationFeatures.extract(token); 78 | tokenTitleFeatures.extract(token); 79 | TokenLexicalFeatures.extract(token); 80 | 81 | tokenListFeatures.extract(token); 82 | tokenNounListFeatures.extract(token); 
83 | verbNetFeatures.extract(token); 84 | } 85 | } 86 | 87 | 88 | /** 89 | * Runs sentence-level feature extraction on the sentences in the document 90 | * @param document 91 | */ 92 | public void extractSentenceFeatures (Document document) { 93 | for (Sentence sentence : document.sentenceList) { 94 | SentenceIndicatorFeatures.extract(sentence); 95 | if (StaticConfig.dependencyFeatures) SentenceDependencyFeatures.extract(sentence); 96 | if (StaticConfig.constituentFeatures) SentenceConstituentFeatures.extract(sentence); 97 | SentenceFeaturesDerivedFromListCue.extract(sentence); 98 | } 99 | } 100 | 101 | 102 | public void setUpFeatureSets(Document doc) { 103 | for (Token token : doc.tokenList) 104 | if (token.boundaryFeatureSet == null) 105 | token.boundaryFeatureSet = new FeatureIntSet(); 106 | } 107 | 108 | 109 | /** 110 | * Runs feature extraction on a single document 111 | * @param document 112 | */ 113 | public void extractAllFeatures (Document document) { 114 | // initialize empty feature sets 115 | setUpFeatureSets(document); 116 | 117 | // Token features & sentence features 118 | extractTokenFeatures(document); 119 | extractSentenceFeatures(document); 120 | 121 | // quotation mark features 122 | if (StaticConfig.documentQuotationFeature) 123 | DocumentQuotationFeature.extract(document); 124 | 125 | // offset conjunction 126 | if (StaticConfig.documentOffsetConjunction) 127 | documentOffsetConjunction.extract(document); 128 | 129 | // additional features 130 | BoundaryFeatures.additionalBoundaryFeatures(document); 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/FeatureIndexMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
/**
 * Automatically counting string to int mapping for feature sets.
 * Indices are handed out consecutively, starting at 0, in the order in which
 * unknown strings are first looked up.
 * Created by scheibcn on 6/1/16.
 */
public class FeatureIndexMap {
    /** feature string -> index */
    Map<String, Integer> f2i;
    /** index -> feature string; position i holds the string with index i */
    List<String> i2f;

    /** highest index handed out so far; -1 while the map is empty */
    int maxIndex = -1;

    FeatureIndexMap () {
        f2i = new HashMap<>();
        i2f = new ArrayList<>();
    }

    /**
     * Translate string to index. If the string is unknown, it is assigned a new index.
     *
     * @param feature feature string to look up
     * @return the index of the string (existing or newly assigned)
     */
    public int getIndex(String feature) {
        Integer known = f2i.get(feature);
        if (known != null) {
            return known;
        }

        maxIndex++;
        f2i.put(feature, maxIndex);
        i2f.add(feature);
        return maxIndex;
    }

    /**
     * Translate index to string.
     *
     * @param index index to look up
     * @return the string that was assigned this index
     * @throws Error with message "Lookup error" if the index was never handed out
     *               (negative, or larger than the highest assigned index)
     */
    public String getFeature(int index) {
        // negative indices are rejected here too, so every failed lookup is reported the same way
        if (index < 0 || index > maxIndex) {
            throw new Error("Lookup error");
        }
        return i2f.get(index);
    }
}
28 | */ 29 | public class FeatureIntSet implements FeatureSet { 30 | 31 | // internal mapping from feature strings to integers 32 | static FeatureIndexMap featureIndexMap = new FeatureIndexMap(); // a static map across all feature sets 33 | Set featureIndices = new HashSet<>(); 34 | 35 | @Override 36 | public int size() { 37 | return featureIndices.size(); 38 | } 39 | 40 | @Override 41 | public boolean isEmpty() { 42 | return featureIndices.isEmpty(); 43 | } 44 | 45 | 46 | 47 | @Override 48 | public boolean add(String s) { 49 | int index = featureIndexMap.getIndex(s); 50 | featureIndices.add(index); 51 | return true; 52 | } 53 | 54 | @Override 55 | public Iterator iterator() { return new StringIterator(); } 56 | 57 | @Override 58 | public boolean addAll(Collection c) { 59 | if (c instanceof FeatureIntSet) { 60 | // just call addAll on the index sets 61 | featureIndices.addAll(((FeatureIntSet) c).featureIndices); 62 | } else if (c instanceof Collection) { 63 | for (String s : c) this.add(s); 64 | } else { 65 | throw new Error("Incompatible types"); 66 | } 67 | 68 | return true; 69 | } 70 | 71 | 72 | @Override 73 | public boolean contains(Object o) { 74 | int targetIndex = featureIndexMap.getIndex((String) o); 75 | return featureIndices.contains(targetIndex); 76 | } 77 | 78 | @Override 79 | public void clear() { featureIndices.clear(); } 80 | 81 | 82 | /** 83 | * Iterator that automatically maps the stored indices to strings 84 | */ 85 | class StringIterator implements Iterator { 86 | 87 | Iterator featureIndexIter; 88 | 89 | StringIterator () { featureIndexIter = featureIndices.iterator(); } 90 | 91 | @Override 92 | public boolean hasNext() { 93 | return featureIndexIter.hasNext(); 94 | } 95 | 96 | @Override 97 | public String next() { 98 | int index = featureIndexIter.next(); 99 | return featureIndexMap.getFeature(index); 100 | } 101 | 102 | @Override 103 | public void remove() { 104 | featureIndexIter.remove(); 105 | } 106 | } 107 | 108 | 109 | 110 | // NOTE: for 
compatibility, FeatureSets are collections 111 | // BELOW: interfaces inherited from collection that we do not need to implement 112 | 113 | @Override 114 | public Object[] toArray() { throw new Error("Not implemented"); } 115 | 116 | @Override 117 | public T[] toArray(T[] a) { throw new Error("Not implemented"); } 118 | 119 | @Override 120 | public boolean remove(Object o) { throw new Error("Not implemented"); } 121 | 122 | @Override 123 | public boolean containsAll(Collection c) { throw new Error("Not implemented"); } 124 | 125 | @Override 126 | public boolean removeAll(Collection c) { throw new Error("Not implemented"); } 127 | 128 | @Override 129 | public boolean retainAll(Collection c) { throw new Error("Not implemented"); } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/FeatureSet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features; 19 | 20 | import java.util.Collection; 21 | 22 | /** 23 | * Interface for feature sets. For now just a collection of String. 24 | * Created by scheibcn on 6/1/16. 
25 | */ 26 | public interface FeatureSet extends Collection { 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/FeatureStringSet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features; 19 | 20 | import java.util.HashSet; 21 | import java.util.Iterator; 22 | 23 | /** 24 | * A feature set that stores features as strings internally. Essentially just a HashSet. 25 | */ 26 | public class FeatureStringSet extends HashSet implements FeatureSet { 27 | public FeatureStringSet(FeatureStringSet f) { 28 | super(f); 29 | } 30 | 31 | public FeatureStringSet() { 32 | super(); 33 | } 34 | 35 | public FeatureStringSet(int size) { 36 | super(size); 37 | } 38 | 39 | @Override 40 | public boolean add(String e) { return super.add(e); } 41 | 42 | @Override 43 | public Iterator iterator() { 44 | return super.iterator(); 45 | } 46 | 47 | 48 | } 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/DocumentOffsetConjunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import java.util.Arrays; 21 | import java.util.LinkedList; 22 | import java.util.List; 23 | 24 | import ims.cs.lingdata.Document; 25 | import ims.cs.lingdata.Token; 26 | import ims.cs.qsample.features.FeatureSet; 27 | import org.apache.commons.lang3.StringUtils; 28 | 29 | /** 30 | * Offset conjunction over a selection of features. 31 | * Idea here: enumerate all possible patterns of feature conjunctions. Then test for each feature set whether it 32 | * contains each of the conjunctions. If so, add the conjunction. 
33 | */ 34 | public class DocumentOffsetConjunction { 35 | 36 | // features subject to conjunction 37 | private static final String[] features = new String[] {"SENT:QUOT", "SENT:NE", "SENT:PRO", "SENT:HASCUE", "CUE-DEP", "IS-LEFTMOST", "SENT-BEGIN-WIN", "SENT-END-WIN"}; 38 | 39 | private List patternList; 40 | 41 | 42 | public DocumentOffsetConjunction() { 43 | patternList = new LinkedList<>(); 44 | 45 | // add empty entry to start 46 | patternList.add(new String[] {}); 47 | 48 | 49 | for (String s : features) { 50 | List newPatterns = new LinkedList(); 51 | for (String[] pattern : patternList) { 52 | String[] concat = append(pattern, s); 53 | newPatterns.add(concat); 54 | } 55 | 56 | patternList.addAll(newPatterns); 57 | } 58 | 59 | // remove the empty entry 60 | patternList.remove(0); 61 | } 62 | 63 | /** 64 | * Add feature conjunctions to all tokens in the document 65 | * @param document 66 | */ 67 | public void extract (Document document) { 68 | List tokenList = document.getTokenList(); 69 | 70 | for (Token token : tokenList) { 71 | FeatureSet fs = token.boundaryFeatureSet; 72 | 73 | // check for each pattern whether the feature set satisfies it 74 | for (String[] features : patternList) { 75 | boolean matches = true; 76 | for (String feature: features) { 77 | if (!fs.contains(feature)) { 78 | matches = false; 79 | break; 80 | } 81 | } 82 | 83 | // if the pattern is satisfied, add the conjunction 84 | if (matches) { 85 | fs.add("CONJUNCTION:" + StringUtils.join(",", features)); 86 | } 87 | } 88 | } 89 | } 90 | 91 | 92 | public static String[] append (String[] a1, String s) { 93 | String[] ret = new String[a1.length + 1]; 94 | System.arraycopy(a1, 0, ret, 0, a1.length); 95 | ret[ret.length-1] = s; 96 | return ret; 97 | } 98 | 99 | public void printPatterns() { 100 | for(String[] p : patternList) { 101 | System.out.println(Arrays.toString(p)); 102 | } 103 | } 104 | 105 | 106 | } 107 | 
-------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/DocumentQuotationFeature.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import java.util.List; 21 | 22 | import ims.cs.lingdata.Document; 23 | import ims.cs.lingdata.Token; 24 | import ims.cs.corenlp.Helper; 25 | 26 | /** 27 | * Check for each token whether it is enclosed in quotation marks 28 | */ 29 | public abstract class DocumentQuotationFeature { 30 | 31 | public static final String INQ_PREFIX = "DOC:INQ"; 32 | public static final String NOTINQ_PREFIX = "DOC:NOTINQ"; 33 | public static final String OPEN_PREFIX = "DOC:Q-OPENS"; 34 | public static final String CLOSE_PREFIX = "DOC:Q-CLOSES"; 35 | 36 | public static void extract(Document document) { 37 | boolean inQuote = false; 38 | 39 | List tokenList = document.getTokenList(); 40 | 41 | for (Token token : tokenList) { 42 | // check if token is a quotation mark and is not to be ignored 43 | // (paragraph-initial tokens may be marked to be ignored) 44 | if (Helper.isQuote(token) && !token.ignoreQuote) { 45 | 46 | // add respective feature ... 
47 | if (inQuote) 48 | token.boundaryFeatureSet.add(CLOSE_PREFIX); 49 | else 50 | token.boundaryFeatureSet.add(OPEN_PREFIX); 51 | 52 | // toggle in-quote state 53 | inQuote = !inQuote; 54 | token.boundaryFeatureSet.add(INQ_PREFIX); 55 | } else if (inQuote) { /* currently in quote */ 56 | token.boundaryFeatureSet.add(INQ_PREFIX); 57 | } else { /* currently not in quote */ 58 | token.boundaryFeatureSet.add(NOTINQ_PREFIX); 59 | } 60 | } 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/SentenceConstituentFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import java.util.LinkedList; 21 | import java.util.List; 22 | 23 | import ims.cs.lingdata.Sentence; 24 | import ims.cs.lingdata.Token; 25 | import ims.cs.qsample.features.FeatureSet; 26 | import edu.stanford.nlp.trees.Tree; 27 | import ims.cs.util.StaticConfig; 28 | 29 | /** 30 | * Token features derived from the constituency parse of a sentence 31 | */ 32 | public abstract class SentenceConstituentFeatures { 33 | 34 | // feature names 35 | private static final String LEVEL_FEATURE = "LVL"; 36 | private static final String LEFTMOST_FEATURE = "IS-LEFTMOST"; 37 | private static final String GOV_FEATURE = "GOV:"; 38 | private static final String AL_FEATURE = "AL:"; 39 | private static final String PARENT_FEATURE = "PARENT:"; 40 | 41 | public static void extract(Sentence s) { 42 | addTreeFeatures(s, s.tree); 43 | } 44 | 45 | /** 46 | * Class for keeping track of node-level pairs 47 | */ 48 | private static class NodeFeatures { 49 | String label; 50 | Integer level; 51 | 52 | public NodeFeatures(String label, int depth) { 53 | this.label = label; 54 | this.level = depth; 55 | } 56 | } 57 | 58 | /** 59 | * Add tree features recursively 60 | * @param s 61 | * @param t 62 | */ 63 | private static void addTreeFeatures(Sentence s, Tree t) { 64 | addTreeFeatures(s, t, 0, new LinkedList(), null, true, null); 65 | } 66 | 67 | /** 68 | * Recursion step for tree featues 69 | * @param sentence 70 | * @param t complete tree 71 | * @param level current level 72 | * @param governingLabels list of governing labels 73 | * @param parent information about direct parent 74 | * @param isLeftmost is the node the leftmost one in the constituent specified by ancestorWhereLeftmost 75 | * @param ancestorWhereLeftmost 76 | */ 77 | private static void addTreeFeatures(Sentence sentence, Tree t, int level, List governingLabels, NodeFeatures parent, boolean isLeftmost, NodeFeatures ancestorWhereLeftmost) { 78 | 79 | 80 | if 
(t.isLeaf()) { /* terminal nodes */ 81 | // get the current token represented by this subtree 82 | Token pToken = sentence.treeLookup.get(t); 83 | 84 | // check if token is null. this can happen if the token was unaligned previously (e.g., because of 85 | // a parser error) 86 | if (pToken == null) { 87 | if (StaticConfig.verbose) 88 | System.err.println(sentence.sentenceId + " Dropping tree without associated token: " + t + " "); 89 | return; 90 | } 91 | 92 | FeatureSet fs = pToken.boundaryFeatureSet; 93 | 94 | // leftmost feature (see Pareti paper for description) 95 | if (StaticConfig.constituentLeftmost && isLeftmost) 96 | fs.add(LEFTMOST_FEATURE); 97 | 98 | // level in tree 99 | if (StaticConfig.constituentLevel) { 100 | fs.add(LEVEL_FEATURE + level); 101 | addLevelBinHeuristic(pToken, LEVEL_FEATURE, level); 102 | } 103 | 104 | // leftmost feature label 105 | if (StaticConfig.constituentAncestorL) { 106 | fs.add(AL_FEATURE + "LBL:" + ancestorWhereLeftmost.label); 107 | fs.add(AL_FEATURE + "LVL:" + ancestorWhereLeftmost.level); 108 | 109 | addLevelBinHeuristic(pToken, AL_FEATURE + "LVL", ancestorWhereLeftmost.level); 110 | } 111 | 112 | // parent in constituent tree 113 | if (StaticConfig.constituentParent) { 114 | fs.add(PARENT_FEATURE + "LBL:" + parent.label); 115 | } 116 | 117 | // labels of all ancestors 118 | if (StaticConfig.constituentGoverning) { /* "Ancestor" features in the paper */ 119 | for (NodeFeatures nf: governingLabels) { 120 | // label with and without depth 121 | fs.add(GOV_FEATURE + nf.label + "@" + nf.level); /* ambiguous in paper */ 122 | fs.add(GOV_FEATURE + nf.label); 123 | fs.add(GOV_FEATURE + nf.label + "@-" + (level - nf.level)); /* ambiguous in paper */ 124 | 125 | addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@", nf.level); 126 | addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@-", (level - nf.level)); 127 | } 128 | } 129 | } else { // non-terminal node 130 | List childList = t.getChildrenAsList(); 131 | String label 
= t.label().toString(); 132 | 133 | // copy governing node features for next recursion step 134 | List governingLabelsUpdate = new LinkedList(governingLabels); 135 | governingLabelsUpdate.add(new NodeFeatures(label, level)); 136 | 137 | // set leftmost ancestor 138 | if (ancestorWhereLeftmost == null) { 139 | ancestorWhereLeftmost = new NodeFeatures(label, level); 140 | } 141 | 142 | // check for pre-terminals -- otherwise, set the leftmost flag for the first constituent 143 | if (childList.size() > 1) { 144 | isLeftmost = true; 145 | } 146 | 147 | // call function for all children 148 | for (Tree child : childList) { 149 | addTreeFeatures(sentence, child, level + 1, governingLabelsUpdate, new NodeFeatures(label, level), isLeftmost, ancestorWhereLeftmost); 150 | isLeftmost = false; 151 | ancestorWhereLeftmost = null; 152 | } 153 | } 154 | } 155 | 156 | /** 157 | * Binning for levels 158 | * @param mToken 159 | * @param feature 160 | * @param value 161 | */ 162 | private static void addLevelBinHeuristic(Token mToken, String feature, int value) { 163 | if (!StaticConfig.constituentBinning) return; 164 | 165 | FeatureSet fs = mToken.boundaryFeatureSet; 166 | 167 | int[] bins = new int[] {0, 1, 2, 3, 5, 7, 10, 13, 16, 20, 25, 40, 1000 }; 168 | 169 | for (int i=0; i < bins.length - 1; i++) { 170 | int threshLower = bins[i]; 171 | int threshUpper = bins[i + 1]; 172 | 173 | // threshold satisfied? add bin feature! 
174 | if (value <= threshUpper) { 175 | if (StaticConfig.constituentBinningStacked) { 176 | fs.add(feature + "(<=)" + threshLower); 177 | if (value >= threshLower) 178 | fs.add(feature + "(>=)" + threshLower); 179 | } else if (value > threshLower) { 180 | fs.add(feature + "(EXACT)" + threshLower); 181 | } 182 | } 183 | } 184 | } 185 | 186 | 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/SentenceDependencyFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import java.util.List; 21 | 22 | import ims.cs.lingdata.Sentence; 23 | import ims.cs.lingdata.Token; 24 | import ims.cs.corenlp.Helper; 25 | import ims.cs.qsample.features.FeatureSet; 26 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 27 | import ims.cs.util.StaticConfig; 28 | 29 | /** 30 | * Token features derived from the dependency parse of a sentence 31 | */ 32 | public abstract class SentenceDependencyFeatures { 33 | 34 | // feature names 35 | private static final String PARENT_REL_PREFIX = "PARENT-REL"; 36 | private static final String PARENT_RELHEAD_PREFIX = "PARENT-REL+HD"; 37 | private static final String CHILD_REL_PREFIX = "CHILD-REL"; 38 | private static final String CHILD_RELHEAD_PREFIX = "CHILD-REL+HD"; 39 | 40 | /** 41 | * Extract dependency features for all tokens in this sentence 42 | * @param sentence 43 | */ 44 | public static void extract (Sentence sentence) { 45 | for (Token pToken : sentence.tokenList) { 46 | if (StaticConfig.dependencyParentRel || StaticConfig.dependencyParentRelHead) addParentFeature(pToken); 47 | if (StaticConfig.dependencyChildRel || StaticConfig.dependencyChildRelHead) addChildFeatures(pToken); 48 | } 49 | } 50 | 51 | /** 52 | * Add features about the parent of the token 53 | * @param token 54 | */ 55 | private static void addParentFeature(Token token) { 56 | SemanticGraphEdge parentEdge = Helper.getDependencyParentRel(token); 57 | 58 | FeatureSet fs = token.boundaryFeatureSet; 59 | 60 | if (parentEdge != null) { 61 | // plain parent 62 | if (StaticConfig.dependencyParentRel) 63 | fs.add(PARENT_REL_PREFIX + "=" + parentEdge.getRelation()); 64 | 65 | // parent and relation label 66 | if (StaticConfig.dependencyParentRelHead) 67 | fs.add(PARENT_RELHEAD_PREFIX + "=" + parentEdge.getRelation() + "," + parentEdge.getGovernor().lemma()); 68 | } 69 | } 70 | 71 | /** 72 | * Add features about the child of a token 73 | * @param pcToken 74 | */ 75 | 
private static void addChildFeatures(Token pcToken) { 76 | List childEdgeList = Helper.getDependencyChildrenRels(pcToken); 77 | FeatureSet fs = pcToken.boundaryFeatureSet; 78 | 79 | if (childEdgeList != null) { 80 | for (SemanticGraphEdge childEdge : childEdgeList) { 81 | // plain child 82 | if (StaticConfig.dependencyChildRel) 83 | fs.add(CHILD_REL_PREFIX + "=" + childEdge.getRelation()); 84 | 85 | // child and relation label 86 | if (StaticConfig.dependencyChildRelHead) 87 | fs.add(CHILD_RELHEAD_PREFIX + "=" + childEdge.getRelation() + "," + childEdge.getDependent().lemma()); 88 | } 89 | } 90 | } 91 | 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/SentenceFeaturesDerivedFromListCue.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import ims.cs.lingdata.Sentence; 21 | import ims.cs.lingdata.Token; 22 | import ims.cs.corenlp.Helper; 23 | import ims.cs.util.StaticConfig; 24 | 25 | import java.util.LinkedList; 26 | import java.util.List; 27 | 28 | /** 29 | * Token features based on cue information from the noun cue list. 
30 | */ 31 | public abstract class SentenceFeaturesDerivedFromListCue { 32 | 33 | private static final String CUE_DEP_PREFIX = "CUE-DEP:NOUNCUE"; 34 | private static final String CUE_PREFIX = "SENT:HASCUE:NOUNCUE"; 35 | 36 | /** 37 | * Extract features for all tokens in the sentence 38 | * @param sentence 39 | */ 40 | public static void extract (Sentence sentence) { 41 | boolean sentenceHasCueFeature = sentenceHasCue(sentence.tokenList); 42 | 43 | // check each token for noun-cue-ness, push features to its dependents (transitively) 44 | for (Token pToken : sentence.tokenList) { 45 | if (StaticConfig.dependencyCueDependent) { 46 | // token is in noun cue list 47 | if (pToken.boundaryFeatureSet.contains("NOUNCUELIST")) 48 | addCueDependentFeature("LIST", pToken, sentence); 49 | 50 | // token is "according to" 51 | if (pToken.predText.toLowerCase().equals("according") 52 | && pToken.nextToken != null 53 | && pToken.nextToken.predText.equals("to")) 54 | addCueDependentFeature("ACCORDINGTO", pToken, sentence); 55 | } 56 | 57 | SentenceIndicatorFeatures.addFeaturePositiveAndNegative(CUE_PREFIX, sentenceHasCueFeature, pToken); 58 | } 59 | } 60 | 61 | /** 62 | * Push features to all dependents of a cue 63 | * @param type 64 | * @param token 65 | * @param sentence 66 | */ 67 | private static void addCueDependentFeature(String type, Token token, Sentence sentence) { 68 | List stack = new LinkedList(); 69 | stack.add(token); 70 | 71 | // recursively iterate over all children (and their children ...) 
72 | while (stack.size() > 0) { 73 | Token current = stack.remove(0); 74 | current.boundaryFeatureSet.add(CUE_DEP_PREFIX + "-" + type); 75 | 76 | List children = Helper.getDependencyChildren(current); 77 | 78 | if (children == null) continue; 79 | 80 | for (Token c : children) { 81 | if (c != null) stack.add(c); 82 | } 83 | } 84 | } 85 | 86 | /** 87 | * Check whether the sentence has any noun cues 88 | * @param data 89 | * @return 90 | */ 91 | private static boolean sentenceHasCue(List data) { 92 | for (Token token: data) { 93 | if (token.boundaryFeatureSet.contains("NOUNCUELIST")) { 94 | return true; 95 | } 96 | } 97 | return false; 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/SentenceIndicatorFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import ims.cs.lingdata.Sentence; 21 | import ims.cs.lingdata.Token; 22 | import ims.cs.corenlp.Helper; 23 | import ims.cs.qsample.features.FeatureSet; 24 | import ims.cs.util.StaticConfig; 25 | 26 | /** 27 | * Add sentence-level indicator features to each token 28 | */ 29 | public abstract class SentenceIndicatorFeatures { 30 | 31 | // feature names 32 | private static final String QUOT_PREFIX = "SENT:QUOT"; 33 | private static final String NE_PREFIX = "SENT:NE"; 34 | private static final String PRO_PREFIX = "SENT:PRO"; 35 | private static final String SL_PREFIX = "SL="; 36 | private static final String SL_LT_PREFIX = "SL<="; 37 | private static final String SL_GT_PREFIX = "SL>="; 38 | private static final String SL_EXACT_PREFIX = "SL-EXACT-BIN="; 39 | private static final String SENT_BEGIN_WINDOW = "SENT-BEGIN-WIN"; 40 | private static final String SENT_END_WINDOW = "SENT-END-WIN"; 41 | private static final String INVERT_PREFIX = "NOT:"; 42 | 43 | /** 44 | * Extract indicator features for all tokens in this sentence 45 | * @param sentence 46 | */ 47 | public static void extract (Sentence sentence) { 48 | // pre-compute features 49 | boolean sentenceHasQuotFeature = sentenceHasQuotationMark(sentence); 50 | boolean sentenceHasProFeature = sentenceHasPro(sentence); 51 | boolean sentenceHasNeFeature = sentenceHasNe(sentence); 52 | int sentenceLength = sentence.tokenList.size(); 53 | 54 | // distance to sentence boundaries 55 | sentenceBoundDistance(sentence); 56 | 57 | // now add pre-computed features to token list 58 | for (Token mToken : sentence.tokenList) { 59 | if (StaticConfig.sentenceHasQuote) addFeaturePositiveAndNegative(QUOT_PREFIX, sentenceHasQuotFeature, mToken); 60 | if (StaticConfig.sentenceHasPronoun) addFeaturePositiveAndNegative(PRO_PREFIX, sentenceHasProFeature, mToken); 61 | 62 | if (StaticConfig.sentenceHasNe) addFeaturePositiveAndNegative(NE_PREFIX, sentenceHasNeFeature, 
mToken); 63 | if (StaticConfig.sentenceLength) { 64 | addLengthLogBinHeuristic(mToken, sentenceLength); 65 | mToken.boundaryFeatureSet.add(SL_PREFIX + sentenceLength); 66 | } 67 | } 68 | } 69 | 70 | /** 71 | * Add positive or negative version of a feature (i.e., also explicitly mark the absence of a feature) 72 | * @param featureName 73 | * @param featureOn 74 | * @param token 75 | */ 76 | public static void addFeaturePositiveAndNegative(String featureName, boolean featureOn, Token token) { 77 | if (featureOn) 78 | token.boundaryFeatureSet.add(featureName); 79 | else 80 | token.boundaryFeatureSet.add(INVERT_PREFIX + featureName); 81 | } 82 | 83 | /** 84 | * Binning for lengths, exponential bin spacing 85 | * @param pToken 86 | * @param length 87 | */ 88 | private static void addLengthLogBinHeuristic(Token pToken, int length) { 89 | if (!StaticConfig.sentenceLengthBinning) return; 90 | 91 | FeatureSet fs = pToken.boundaryFeatureSet; 92 | 93 | int[] bins = new int[] {0, 2, 4, 8, 16, 32, 64, 1000}; 94 | 95 | for (int i=0; i < bins.length - 1; i++) { 96 | int threshLower = bins[i]; 97 | int threshUpper = bins[i+1]; 98 | 99 | if (length <= threshUpper) { 100 | if (StaticConfig.sentenceLengthBinningStacked) { 101 | fs.add(SL_LT_PREFIX + "STACKED-" + threshLower); 102 | } else if (length > threshLower) { 103 | fs.add(SL_EXACT_PREFIX + threshLower); 104 | } 105 | } 106 | 107 | if ((length >= threshLower) && StaticConfig.sentenceLengthBinningStacked) { 108 | fs.add(SL_GT_PREFIX + threshLower); 109 | } 110 | } 111 | 112 | } 113 | 114 | /** 115 | * Add features about the distance of each token to the sentence boundary 116 | * @param sentence 117 | */ 118 | private static void sentenceBoundDistance(Sentence sentence) { 119 | int pos = 0; 120 | int sl = sentence.tokenList.size(); 121 | 122 | for (Token token : sentence.tokenList) { 123 | // compute distance to end 124 | int endDist = sl - pos - 1; 125 | 126 | // if distance to either boundary is within a window of 5, add 
respective feature 127 | if (pos < 5) token.boundaryFeatureSet.add(SENT_BEGIN_WINDOW); 128 | if (endDist < 5) token.boundaryFeatureSet.add(SENT_END_WINDOW); 129 | 130 | pos++; 131 | } 132 | } 133 | 134 | /** 135 | * Determines whether a sentence contains a quotation mark 136 | * @param sentence 137 | * @return 138 | */ 139 | private static boolean sentenceHasQuotationMark(Sentence sentence) { 140 | for (Token token: sentence.tokenList) { 141 | if (Helper.isQuote(token)) { 142 | return true; 143 | } 144 | } 145 | return false; 146 | } 147 | 148 | /** 149 | * Determines whether a sentence contains a pronoun 150 | * @param sentence 151 | * @return 152 | */ 153 | private static boolean sentenceHasPro(Sentence sentence) { 154 | for (Token token: sentence.tokenList) { 155 | if (token.predPosTag.startsWith("PR")) { 156 | return true; 157 | } 158 | } 159 | return false; 160 | } 161 | 162 | /** 163 | * Determines whether a sentence contains a named entity 164 | * @param sentence 165 | * @return 166 | */ 167 | private static boolean sentenceHasNe(Sentence sentence) { 168 | for (Token token: sentence.tokenList) { 169 | if ((token.predNer.startsWith("PERSON")) || (token.predNer.startsWith("ORGANIZATION"))) { 170 | return true; 171 | } 172 | } 173 | return false; 174 | } 175 | 176 | 177 | 178 | 179 | } 180 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/TokenDictFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 
7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import ims.cs.lingdata.Token; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.FileReader; 24 | import java.io.IOException; 25 | import java.util.HashMap; 26 | import java.util.HashSet; 27 | import java.util.Map; 28 | import java.util.Set; 29 | 30 | /** 31 | * Feature extractor that extracts information about a token from a dictionary (read from a tab-separated file) 32 | */ 33 | public class TokenDictFeatures { 34 | 35 | private String featureName = "VERBDICT"; 36 | private String listFileName; 37 | private Map> wordMap; 38 | public String posStart = null; 39 | 40 | 41 | /** 42 | * Set up the feature extractor 43 | * @param listFileName name of the dictionary file (tab-separated) 44 | * @param featureName name of the feature that will be extracted 45 | * @throws IOException 46 | */ 47 | public TokenDictFeatures(String listFileName, String featureName) throws IOException { 48 | this.listFileName = listFileName; 49 | this.featureName = featureName; 50 | loadDictionary(); 51 | } 52 | 53 | /** 54 | * Extract dictionary information for the token t 55 | * @param t 56 | */ 57 | public void extract(Token t) { 58 | // check if the token's lemma is in the dictionary 59 | if (wordMap.containsKey(t.predLemma)) { 60 | // check for POS restriction if necessary 61 | if (posStart == null || t.predPosTag.startsWith(posStart)) { 62 | for (String vclass : wordMap.get(t.predLemma)) 63 | t.boundaryFeatureSet.add(featureName + "=" + vclass); 64 | } 65 | } 66 | } 67 | 68 | /** 69 | * Load dictionary from a tab-separated file 
70 | * @throws IOException 71 | */ 72 | private void loadDictionary() throws IOException { 73 | wordMap = new HashMap<>(); 74 | 75 | BufferedReader br = new BufferedReader(new FileReader(listFileName)); 76 | String line; 77 | 78 | while ((line = br.readLine()) != null) { 79 | line = line.trim(); 80 | String[] tokens = line.split("\\s+"); 81 | String word = tokens[0]; 82 | String wordClass = tokens[1]; 83 | 84 | if (!wordMap.containsKey(word)) { 85 | wordMap.put(word, new HashSet()); 86 | } 87 | 88 | wordMap.get(word).add(wordClass); 89 | } 90 | 91 | br.close(); 92 | 93 | } 94 | 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/TokenLexicalFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import ims.cs.lingdata.Token; 21 | import ims.cs.qsample.features.FeatureSet; 22 | import ims.cs.util.StaticConfig; 23 | 24 | /** 25 | * Extracts lexical information about a token (e.g., word, lemma, POS) 26 | */ 27 | public abstract class TokenLexicalFeatures { 28 | 29 | private static final String TOK_PREFIX = "TOK"; 30 | private static final String LEMMA_PREFIX = "LEMMA"; 31 | private static final String POS_PREFIX = "POS"; 32 | private static final String BG_PREFIX = "BG"; 33 | private static final String NE_PREFIX = "NE"; 34 | private static final String PARBEGIN_PREFIX = "PAR-BEGINS"; 35 | private static final String PAREND_PREFIX = "PAR-ENDS"; 36 | 37 | 38 | /** 39 | * Extract lexical features about a single token t 40 | * @param t 41 | */ 42 | public static void extract(Token t) { 43 | 44 | if (StaticConfig.lexicalPos || 45 | StaticConfig.lexicalLemma || 46 | StaticConfig.lexicalToken) 47 | addWindowFeatures(t); 48 | 49 | if (StaticConfig.lexicalBigram) addBigramFeature(t); 50 | addNeFeature(t); 51 | addDocStructureFeature(t); 52 | } 53 | 54 | /** 55 | * Adds paragraph begin and end features 56 | * @param token 57 | */ 58 | private static void addDocStructureFeature(Token token) { 59 | if (token.paragraphBegins) token.boundaryFeatureSet.add(PARBEGIN_PREFIX); 60 | if (token.nextToken == null || token.nextToken.paragraphBegins) token.boundaryFeatureSet.add(PAREND_PREFIX); 61 | } 62 | 63 | /** 64 | * Adds features about whether the token is part of a named entity 65 | * @param token 66 | */ 67 | private static void addNeFeature(Token token) { 68 | if (!token.predNer.equals("?") && !token.predNer.equals("O")) { 69 | token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE"); 70 | token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE-" + token.predNer); 71 | } 72 | } 73 | 74 | /** 75 | * Adds bigram features with the previous and next token 76 | * @param token 77 | */ 78 | private static void 
addBigramFeature(Token token) { 79 | String prevWordForm; 80 | String prevLemma; 81 | 82 | String nextWordForm; 83 | String nextLemma; 84 | 85 | // find previous token 86 | if (token.previousToken == null) { 87 | prevWordForm = "null"; 88 | prevLemma = "null"; 89 | } else { 90 | Token prevToken = token.previousToken; 91 | prevWordForm = prevToken.predText; 92 | prevLemma = prevToken.predLemma; 93 | } 94 | 95 | // find next token 96 | if (token.nextToken == null) { 97 | nextWordForm = "null"; 98 | nextLemma = "null"; 99 | } else { 100 | Token nextToken = token.nextToken; 101 | nextWordForm = nextToken.predText; 102 | nextLemma = nextToken.predLemma; 103 | } 104 | 105 | // add features of word and lemma bigrams 106 | FeatureSet fs = token.boundaryFeatureSet; 107 | 108 | fs.add(BG_PREFIX + prevWordForm + "<--" + token.predText); 109 | fs.add(BG_PREFIX + "(LEMMA)" + prevLemma + "<--" + token.predLemma); 110 | 111 | fs.add(BG_PREFIX + nextWordForm + "-->" + token.predText); 112 | fs.add(BG_PREFIX + "(LEMMA)" + nextLemma + "-->" + token.predLemma); 113 | } 114 | 115 | 116 | /** 117 | * Adds features from other tokens within a window 118 | * @param pToken 119 | */ 120 | private static void addWindowFeatures(Token pToken) { 121 | // current POS tag 122 | FeatureSet fs = pToken.boundaryFeatureSet; 123 | 124 | if (StaticConfig.lexicalPos) fs.add(POS_PREFIX + "-0=" + pToken.predPosTag); 125 | if (StaticConfig.lexicalToken) fs.add(TOK_PREFIX + "-0=" + pToken.predText); 126 | if (StaticConfig.lexicalLemma) fs.add(LEMMA_PREFIX + "-0=" + pToken.predLemma); 127 | 128 | 129 | // previous tokens 130 | Token currentToken = pToken; 131 | for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) { 132 | String leftPos; 133 | String leftTok; 134 | String leftLemma; 135 | 136 | Token prevToken = currentToken.previousToken; 137 | if (prevToken != null) { 138 | leftPos = prevToken.predPosTag; 139 | leftTok = prevToken.predText; 140 | leftLemma = prevToken.predLemma; 141 | currentToken = 
prevToken; 142 | } else { 143 | leftPos = "NONE"; 144 | leftLemma = "NONE"; 145 | leftTok = "NONE"; 146 | } 147 | 148 | if (StaticConfig.lexicalPos) fs.add("WIN_" + POS_PREFIX + "-" + i + "=" + leftPos); 149 | if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "-" + i + "=" + leftTok); 150 | if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "-" + i + "=" + leftLemma); 151 | } 152 | 153 | // subsequent tokens 154 | currentToken = pToken; 155 | for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) { 156 | String rightPos; 157 | String rightTok; 158 | String rightLemma; 159 | 160 | Token nextToken = currentToken.nextToken; 161 | if (nextToken != null) { 162 | rightPos = nextToken.predPosTag; 163 | rightTok = nextToken.predText; 164 | rightLemma = nextToken.predLemma; 165 | currentToken = nextToken; 166 | 167 | } else { 168 | rightPos = "NONE"; 169 | rightLemma = "NONE"; 170 | rightTok = "NONE"; 171 | 172 | } 173 | 174 | if (StaticConfig.lexicalPos) fs.add("WIN_" + POS_PREFIX + "+" + i + "=" + rightPos); 175 | if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "+" + i + "=" + rightTok); 176 | if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "+" + i + "=" + rightLemma); 177 | 178 | } 179 | } 180 | 181 | 182 | 183 | } 184 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/features/components/TokenListFeatures.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 
7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.features.components; 19 | 20 | import java.io.BufferedReader; 21 | import java.io.FileReader; 22 | import java.io.IOException; 23 | import java.util.HashSet; 24 | import java.util.Set; 25 | 26 | import ims.cs.lingdata.Token; 27 | 28 | 29 | /** 30 | * Feature extractor that check whether a token is in a list (specified in a file) 31 | */ 32 | public class TokenListFeatures { 33 | 34 | private String featureName = "VERBLIST"; 35 | private String listFileName; 36 | private Set wordSet; 37 | private int window = 5; 38 | public String posStart = null; 39 | 40 | 41 | /** 42 | * Set up the feature extractor 43 | * @param listFileName list of words (one word per line) 44 | * @param featureName 45 | * @throws IOException 46 | */ 47 | public TokenListFeatures(String listFileName, String featureName) throws IOException { 48 | this.listFileName = listFileName; 49 | this.featureName = featureName; 50 | loadWordList(); 51 | } 52 | 53 | /** 54 | * Extract list feature for the token t 55 | * @param t 56 | */ 57 | public void extract(Token t) { 58 | // current token 59 | if ((posStart == null || t.predPosTag.startsWith(posStart)) && wordSet.contains(t.predLemma)) { 60 | t.boundaryFeatureSet.add(featureName); 61 | } 62 | 63 | // window before the token 64 | Token prevToken = t; 65 | for (int i = 0; i < window; i++) { 66 | prevToken = prevToken.previousToken; 67 | if (prevToken == null) break; 68 | if (wordSet.contains(prevToken.predLemma)) { 69 | t.boundaryFeatureSet.add("WIN_-" + (i+1) + "-" + featureName); 70 | } 71 | } 72 | 73 | // window after the token 74 | 
Token nextToken = t; 75 | for (int i = 0; i < window; i++) { 76 | nextToken = nextToken.nextToken; 77 | if (nextToken == null) break; 78 | if (wordSet.contains(nextToken.predLemma)) { 79 | t.boundaryFeatureSet.add("WIN_+" + (i+1) + "-" + featureName); 80 | } 81 | } 82 | } 83 | 84 | /** 85 | * Loads the word list (one word per line) 86 | * @throws IOException 87 | */ 88 | private void loadWordList() throws IOException { 89 | wordSet = new HashSet<>(); 90 | 91 | BufferedReader br = new BufferedReader(new FileReader(listFileName)); 92 | String line; 93 | 94 | while ((line = br.readLine()) != null) { 95 | line = line.trim(); 96 | wordSet.add(line); 97 | } 98 | 99 | br.close(); 100 | 101 | } 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/greedysample/HasScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.greedysample; 19 | 20 | /** 21 | * An interface for things that have a score. 22 | * Created by scheibcn on 11/5/15. 
/**
 * Contract for anything that can report a numeric score.
 * Created by scheibcn on 11/5/15.
 */
public interface HasScore {

    /**
     * @return the score of this object
     */
    double getScore();

}
25 | */ 26 | public class Sampling { 27 | 28 | 29 | Random random; 30 | public boolean doExp = true; 31 | 32 | public Sampling(Random random) { 33 | this.random = random; 34 | } 35 | 36 | 37 | /** 38 | * Sample an element proportionally to sigmoid-transformed scores 39 | * @param items 40 | */ 41 | public int sampleOne(List items, double temperature, double bias) { 42 | double[] values = new double[items.size()]; 43 | double sum = 0; 44 | 45 | // first compute scores and normalize 46 | for (int i = 0; i < values.length; i++) { 47 | double score = items.get(i).getScore(); 48 | values[i] = (score + bias) / temperature; 49 | 50 | if (doExp) { 51 | values[i] = 1/(1+Math.exp(-values[i])); 52 | } 53 | sum += values[i]; 54 | } 55 | 56 | // then sample proportionally 57 | double sumNorm = 0; 58 | double r = random.nextDouble(); 59 | int resultPosition = 0; 60 | 61 | for (int i = 0; i < values.length; i++) { 62 | values[i] /= sum; 63 | sumNorm += values[i]; 64 | if (sumNorm > r) { 65 | resultPosition = i; 66 | break; 67 | } 68 | } 69 | 70 | return resultPosition; 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/models/HigherSpanModel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 
15 | */ 16 | 17 | 18 | package ims.cs.qsample.models; 19 | 20 | import ims.cs.qsample.features.FeatureSet; 21 | import ims.cs.qsample.perceptron.Perceptron; 22 | import ims.cs.qsample.spans.Span; 23 | 24 | import java.io.FileNotFoundException; 25 | import java.io.FileOutputStream; 26 | import java.io.PrintStream; 27 | import java.io.Serializable; 28 | 29 | /** 30 | * A model for scoring a whole span (rather than just begin and end information) 31 | * Created by scheibcn on 3/5/16. 32 | */ 33 | public class HigherSpanModel implements Serializable { 34 | 35 | private static final long serialVersionUID = 3509778136938744648L; 36 | 37 | // We actually make separate models for begin, end, and span-level information. 38 | // This makes feature management easier, among other things. 39 | Perceptron beginPerceptron; 40 | Perceptron endPerceptron; 41 | Perceptron higherOrderPerceptron; 42 | 43 | public HigherSpanModel() { 44 | this.beginPerceptron = new Perceptron(); 45 | this.endPerceptron = new Perceptron(); 46 | this.higherOrderPerceptron = new Perceptron(); 47 | } 48 | 49 | /** 50 | * Computes the current score of a span according to the model 51 | * @param span 52 | * @param average use averaged perceptron? 53 | * @return 54 | */ 55 | public double score(Span span, boolean average) { 56 | // we handle the begin, end, and span features separately 57 | FeatureSet beginFeatures = span.first().boundaryFeatureSet; 58 | FeatureSet endFeatures = span.last().boundaryFeatureSet; 59 | FeatureSet spanFeatures = span.featureSet; 60 | 61 | // ... 
then, we can compute three individual scores 62 | double score = 0; 63 | score += beginPerceptron.score(beginFeatures, average); 64 | score += endPerceptron.score(endFeatures, average); 65 | score += higherOrderPerceptron.score(spanFeatures, average); 66 | 67 | return score; 68 | } 69 | 70 | /** 71 | * Train the model using a given span, updating with a specified learning rate 72 | * @param span 73 | * @param isPositive Has the example been correctly classified? 74 | * @param rate learning rate 75 | */ 76 | public void train(Span span, boolean isPositive, double rate) { 77 | FeatureSet leftFeatures = span.first().boundaryFeatureSet; 78 | FeatureSet rightFeatures = span.last().boundaryFeatureSet; 79 | FeatureSet spanFeatures = span.featureSet; 80 | 81 | // negate the learning rate if the example was wrong 82 | double effectiveRate = rate; 83 | if (!isPositive) effectiveRate = -effectiveRate; 84 | 85 | // update the three models separately 86 | // (use the update function directly as the train function would first check the score, which is nonsensical 87 | // for the individual models) 88 | beginPerceptron.update(leftFeatures, effectiveRate); 89 | endPerceptron.update(rightFeatures, effectiveRate); 90 | higherOrderPerceptron.update(spanFeatures, effectiveRate); 91 | } 92 | 93 | 94 | /** 95 | * Writes the current feature weights to a file 96 | * @param fileName 97 | * @throws FileNotFoundException 98 | */ 99 | public void printWeights(String fileName) throws FileNotFoundException { 100 | FileOutputStream fos = new FileOutputStream(fileName); 101 | PrintStream ps = new PrintStream(fos); 102 | beginPerceptron.printWeights(ps, "BEGIN"); 103 | endPerceptron.printWeights(ps, "END"); 104 | higherOrderPerceptron.printWeights(ps, "HIGHER"); 105 | ps.close(); 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/perceptron/Perceptron.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.perceptron; 19 | 20 | import ims.cs.qsample.features.FeatureSet; 21 | import ims.cs.qsample.spans.Span; 22 | 23 | import java.io.FileNotFoundException; 24 | import java.io.FileOutputStream; 25 | import java.io.PrintStream; 26 | import java.io.Serializable; 27 | import java.util.Map; 28 | 29 | /** 30 | * Implementation of perceptron model 31 | * Created by scheibcn on 11/5/15. 32 | */ 33 | public class Perceptron implements Serializable { 34 | private static final long serialVersionUID = 3436601656314837271L; 35 | 36 | // this model can actually also do logistic regression 37 | public enum UpdateType {PERCEPTRON, LR}; 38 | 39 | // .. 
default is perceptron 40 | public UpdateType updateType = UpdateType.PERCEPTRON; 41 | 42 | public Weights weights = new Weights(); 43 | 44 | // parameters 45 | public double fixedBias = 0; /* optional bias that can be manually adjusted */ 46 | public double marginPositive = 1; /* margin for positive class */ 47 | public double marginNegative = 1; /* margin for negative class */ 48 | 49 | // some debugging data 50 | public int numUpdates = 0; 51 | 52 | 53 | public Perceptron() { 54 | weights.weightMap.put("BIAS", 0.0); 55 | } 56 | 57 | 58 | /** 59 | * Score a feature set 60 | * @param featureSet 61 | * @return 62 | */ 63 | public double score(FeatureSet featureSet, boolean average) { 64 | double score = 0; 65 | 66 | // first, add bias 67 | if (average) { 68 | score += weights.getAvg("BIAS"); 69 | score += fixedBias; 70 | } else { 71 | score += weights.get("BIAS"); 72 | } 73 | 74 | // then, score all features in the data 75 | for (String feature: featureSet) { 76 | if (average) { 77 | score += weights.getAvg(feature); 78 | } else { 79 | score += weights.get(feature); 80 | } 81 | } 82 | 83 | return score; 84 | } 85 | 86 | /** 87 | * Perform an update with a given training example 88 | * @param featureSet 89 | * @param isPositive is this example a positive one? 
90 | * @param rate 91 | */ 92 | public void train(FeatureSet featureSet, boolean isPositive, double rate) { 93 | if (updateType == UpdateType.PERCEPTRON) 94 | trainPerceptron(featureSet, isPositive, rate); /* perceptron update */ 95 | else if (updateType == UpdateType.LR) 96 | trainLr(featureSet, isPositive, rate); /* logistic regression update */ 97 | } 98 | 99 | /** 100 | * Perform a perceptron-style update 101 | * @param featureSet 102 | * @param isPositive 103 | * @param rate 104 | */ 105 | public void trainPerceptron(FeatureSet featureSet, boolean isPositive, double rate) { 106 | double predScore = score(featureSet, false); 107 | 108 | if (isPositive && predScore - marginPositive <= 0) { /* positive example and negative margin violation */ 109 | update(featureSet, rate); 110 | } else if (!isPositive && predScore + marginNegative > 0) { /* negative example and positive margin violation */ 111 | update(featureSet, -rate); 112 | } 113 | } 114 | 115 | /** 116 | * Perform a logistic regression update 117 | * @param featureSet 118 | * @param isPositive 119 | * @param rate 120 | */ 121 | public void trainLr(FeatureSet featureSet, boolean isPositive, double rate) { 122 | double predScore = score(featureSet, false); 123 | 124 | // true probability of the example? 
125 | int trueProb; 126 | if (isPositive) trueProb = 1; 127 | else trueProb = 0; 128 | 129 | // learning rate times LR gradient 130 | double step = rate * (trueProb - sigmoid(predScore)); 131 | 132 | update(featureSet, step); 133 | } 134 | 135 | 136 | /** 137 | * Update the weights for each feature by the given rate 138 | * @param featureSet 139 | * @param rate 140 | */ 141 | public void update(FeatureSet featureSet, double rate) { 142 | // bias 143 | weights.update("BIAS", rate); 144 | 145 | // features 146 | for (String feature : featureSet) { 147 | weights.update(feature, rate); 148 | } 149 | 150 | numUpdates++; 151 | } 152 | 153 | /** 154 | * Print the weights for the features of the span to debug 155 | * @param span 156 | * @param prefix 157 | */ 158 | public void printInfo(Span span, String prefix) { 159 | for (String feature: span.featureSet) { 160 | double weight = weights.get(feature); 161 | System.out.println(prefix + feature + " " + weight); 162 | } 163 | } 164 | 165 | /** 166 | * Write the current feature weights to a file 167 | * @param fileName 168 | * @throws FileNotFoundException 169 | */ 170 | public void printWeights(String fileName) throws FileNotFoundException { 171 | FileOutputStream fos = new FileOutputStream(fileName); 172 | PrintStream ps = new PrintStream(fos); 173 | printWeights(ps, ""); 174 | } 175 | 176 | /** 177 | * Print the current feature weights to stdout 178 | */ 179 | public void printWeights() { 180 | printWeights(System.out, ""); 181 | } 182 | 183 | /** 184 | * Write the current feature weights to a stream, prepend each line with the specified prefix 185 | * @param out 186 | * @param prefix 187 | */ 188 | public void printWeights(PrintStream out, String prefix) { 189 | for (Map.Entry entry : weights.weightMap.entrySet()) { 190 | out.println(prefix + "-->" + entry.getKey() + "\t" + entry.getValue()); 191 | } 192 | } 193 | 194 | 195 | /** 196 | * Calculate the sigmoid of x 197 | * @param x 198 | * @return 199 | */ 200 | public 
static double sigmoid(double x) { 201 | return 1/(1+Math.exp(-x)); 202 | } 203 | 204 | } 205 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/perceptron/Weights.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.perceptron; 19 | 20 | import java.io.Serializable; 21 | import java.util.HashMap; 22 | import java.util.Map; 23 | 24 | /** 25 | * Store a set of weights associated to strings 26 | * Created by scheibcn on 11/5/15. 
/**
 * Store a set of weights associated to strings, together with the bookkeeping
 * needed to compute averaged weights.
 * Created by scheibcn on 11/5/15.
 */
public class Weights implements Serializable {
    private static final long serialVersionUID = 2945274488514737545L;

    // map for holding weights
    Map<String, Double> weightMap;

    // map for storing the weight history for averaging
    // for a clean description of the algorithm,
    // see for example Chapter 3 in Hal Daume's "A Course in Machine Learning"
    Map<String, Double> weightCacheMap;

    /** When true, every update also accumulates into the averaging cache. */
    public boolean doAveraging = true;

    // number of individual feature updates seen so far while averaging was on
    int averagingCoefficient = 0;

    public Weights() {
        // allocate some large maps
        weightMap = new HashMap<>(100000);
        weightCacheMap = new HashMap<>(100000);
    }

    /**
     * Resets all weights to 0
     */
    public void resetWeights() {
        averagingCoefficient = 0;
        weightMap.clear();
        weightCacheMap.clear();
    }

    /**
     * Get the most recent weight of a feature. Returns 0 if the feature is unknown.
     * @param feature feature name
     * @return current weight, or 0 for an unknown feature
     */
    public double get(String feature) {
        Double weight = weightMap.get(feature);
        return weight == null ? 0 : weight;
    }

    /**
     * Get the averaged weight of a feature. Returns 0 if the feature is unknown.
     * If no averaged update has been recorded yet (the averaging coefficient is still 0,
     * e.g. before training or with averaging switched off), the most recent weight is
     * returned instead of dividing by zero.
     * @param feature feature name
     * @return averaged weight, or 0 for an unknown feature
     */
    public double getAvg(String feature) {
        Double weight = weightMap.get(feature);
        if (weight == null) {
            return 0;
        }

        // nothing has been accumulated: cache / 0 would yield NaN
        if (averagingCoefficient == 0) {
            return weight;
        }

        Double cache = weightCacheMap.get(feature);
        if (cache == null) {
            cache = 0.0;
        }
        return weight - (cache / averagingCoefficient);
    }

    /**
     * Update the weight of a feature by value
     * @param feature feature name
     * @param value amount added to the feature's weight
     */
    public void update(String feature, double value) {
        // update the weight of the feature
        Double current = weightMap.get(feature);
        weightMap.put(feature, current == null ? value : current + value);

        // also add to averaging map if averaging is on
        if (doAveraging) {
            double weighted = value * averagingCoefficient;
            Double cached = weightCacheMap.get(feature);
            weightCacheMap.put(feature, cached == null ? weighted : cached + weighted);

            averagingCoefficient++;
        }
    }
}
15 | */ 16 | 17 | 18 | package ims.cs.qsample.run; 19 | 20 | import ims.cs.lingdata.Document; 21 | import ims.cs.parc.ProcessedCorpus; 22 | import ims.cs.qsample.features.SpanFeatures; 23 | import ims.cs.qsample.models.QuotationPerceptrons; 24 | import ims.cs.qsample.spans.Span; 25 | import ims.cs.util.StaticConfig; 26 | 27 | import java.io.*; 28 | import java.util.List; 29 | import java.util.zip.GZIPInputStream; 30 | import java.util.zip.GZIPOutputStream; 31 | 32 | /** 33 | * Some common functions 34 | * Created by scheibcn on 3/5/16. 35 | */ 36 | public abstract class Common { 37 | /** 38 | * Writes the predictions to a file in BIO format 39 | * @param trainDocs 40 | * @param testDocs 41 | * @param valDocs 42 | * @param resDocs 43 | */ 44 | public static void writePredictionsToFile(List trainDocs, List testDocs, List valDocs, List resDocs) { 45 | // if in text mode, write empty line after sentence ends and write cues 46 | boolean writeNewLineAfterSentence = StaticConfig.cliMode == StaticConfig.CliMode.TEXT; 47 | boolean writeCues = StaticConfig.cliMode == StaticConfig.CliMode.TEXT; 48 | 49 | // try to write predictions 50 | try { 51 | if (trainDocs != null) ProcessedCorpus.savePredictionsToFile(trainDocs, "train-final", writeNewLineAfterSentence, writeCues); 52 | if (testDocs != null) ProcessedCorpus.savePredictionsToFile(testDocs, "test-final", writeNewLineAfterSentence, writeCues); 53 | if (valDocs != null) ProcessedCorpus.savePredictionsToFile(valDocs, "val-final", writeNewLineAfterSentence, writeCues); 54 | if (resDocs != null) ProcessedCorpus.savePredictionsToFile(resDocs, "res-final", writeNewLineAfterSentence, writeCues); 55 | } catch (IOException e) { 56 | e.printStackTrace(); 57 | System.out.println("Unable to write results to file"); 58 | } 59 | 60 | 61 | } 62 | 63 | /** 64 | * Writes out all perceptron models 65 | * @param perceptrons 66 | * @param fileName 67 | * @throws IOException 68 | */ 69 | public static void serializeModels(QuotationPerceptrons 
perceptrons, String fileName) throws IOException { 70 | System.out.println("Writing perceptron model to " + fileName); 71 | ObjectOutputStream outputStream = new ObjectOutputStream (new GZIPOutputStream(new FileOutputStream(fileName))); 72 | outputStream.writeObject(perceptrons); 73 | } 74 | 75 | /** 76 | * Reads all perceptron models from a file 77 | * @param fileName 78 | * @return 79 | * @throws IOException 80 | * @throws ClassNotFoundException 81 | */ 82 | public static QuotationPerceptrons deserializeModels(String fileName) throws IOException, ClassNotFoundException { 83 | System.out.println("Loading perceptron model from " + fileName); 84 | ObjectInputStream inputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(fileName))); 85 | return (QuotationPerceptrons) inputStream.readObject(); 86 | } 87 | 88 | /** 89 | * Adds features to gold spans 90 | * @param documents 91 | */ 92 | public static void addFeaturesToGoldSpans(List documents) { 93 | for (Document document : documents) { 94 | for (Span goldSpan : document.goldSpanSet) { 95 | SpanFeatures.addAllSpanFeatures(goldSpan); 96 | } 97 | } 98 | } 99 | 100 | public static String pathConcat (String path, String subDir) { 101 | return new File(new File(path), subDir).toString(); 102 | } 103 | 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/run/PlainTextCorpusReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 
7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.run; 19 | 20 | import ims.cs.lingdata.*; 21 | import ims.cs.parc.ProcessedCorpus; 22 | import ims.cs.util.StaticConfig; 23 | import org.xml.sax.SAXException; 24 | 25 | import javax.xml.parsers.ParserConfigurationException; 26 | import java.io.*; 27 | import java.util.ArrayList; 28 | import java.util.Arrays; 29 | import java.util.List; 30 | 31 | /** 32 | * Created by scheibcn on 6/1/16. 33 | */ 34 | public class PlainTextCorpusReader { 35 | 36 | /** 37 | * Read document, one sentence per line 38 | * @param file 39 | * @return 40 | */ 41 | public static Document readDocument(File file) throws IOException { 42 | StringBuilder sb = new StringBuilder(); 43 | BufferedReader reader = new BufferedReader(new FileReader(file)); 44 | 45 | // read all text from file 46 | String line; 47 | while ((line = reader.readLine()) != null) { 48 | sb.append(line); 49 | sb.append('\n'); 50 | } 51 | 52 | // build a document with some bogus structure 53 | String text = sb.toString(); 54 | 55 | Document d = new Document(); 56 | Sentence s = new Sentence(); 57 | Token t = new Token(); 58 | 59 | // add text and set byte count 60 | t.goldText = text; 61 | t.goldByteCount = new ByteCount(0, t.goldText.length()); 62 | 63 | // bookkeeping 64 | s.tokenList = new ArrayList<>(); 65 | s.tokenList.add(t); 66 | 67 | d.sentenceList = new ArrayList<>(); 68 | d.sentenceList.add(s); 69 | 70 | d.tokenList = new ArrayList<>(); 71 | d.tokenList.add(t); 72 | d.text = text; 73 | 74 | // build a document id from the file and directory names 75 | d.docId = new 
PlainTextDocId(file.getParentFile().getName(), file.getName()); 76 | 77 | reader.close(); 78 | 79 | return d; 80 | } 81 | 82 | public static ProcessedCorpus readDocuments(String directory) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException { 83 | List documentList = new ArrayList<>(); 84 | 85 | // import all files in the directory 86 | File dir = new File(directory); 87 | File[] files = dir.listFiles(); 88 | Arrays.sort(files); 89 | 90 | for (File file : files) { 91 | if (StaticConfig.verbose) System.out.println(file); 92 | Document document = readDocument(file); 93 | documentList.add(document); 94 | } 95 | 96 | PlainTextCorpus corpus = new PlainTextCorpus(documentList); 97 | 98 | return new ProcessedCorpus(corpus); 99 | } 100 | 101 | 102 | public static void pipeline() { 103 | 104 | } 105 | 106 | public static Document dummyDocument () { 107 | Document d = new Document(); 108 | Sentence s = new Sentence(); 109 | Token t = new Token(); 110 | 111 | t.goldText = "\"I am very disappointed,\" said Dr. Miller.\n Futher, he reported that everything was fine."; 112 | t.goldByteCount = new ByteCount(0, t.goldText.length()); 113 | 114 | s.tokenList = new ArrayList<>(); 115 | s.tokenList.add(t); 116 | 117 | d.sentenceList = new ArrayList<>(); 118 | d.sentenceList.add(s); 119 | 120 | d.tokenList = new ArrayList<>(); 121 | d.tokenList.add(t); 122 | d.text = t.goldText; 123 | 124 | d.docId = new PlainTextDocId("dummyTestDirectory1", "dummyTestFile1"); 125 | 126 | return d; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/run/RunCrf.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 
3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.run; 19 | 20 | import ims.cs.lingdata.Document; 21 | import ims.cs.parc.PARCCorpus; 22 | import ims.cs.parc.ProcessedCorpus; 23 | import ims.cs.qsample.evaluate.EvaluateSpan; 24 | import ims.cs.qsample.models.CrfClassifier; 25 | import ims.cs.qsample.models.QuotationPerceptrons; 26 | import ims.cs.qsample.perceptron.PerceptronTrainer; 27 | import ims.cs.util.MultiOutputStream; 28 | import ims.cs.util.NewStaticPrinter; 29 | import ims.cs.util.StaticConfig; 30 | import org.xml.sax.SAXException; 31 | 32 | import javax.xml.parsers.ParserConfigurationException; 33 | import java.io.IOException; 34 | import java.util.List; 35 | 36 | /** 37 | * Run an experiment with a CRF model 38 | * Created by scheibcn on 3/3/16. 
39 | */ 40 | public class RunCrf { 41 | 42 | 43 | /** 44 | * Run the full CRF training and testing pipeline 45 | * @param trainDocs training documents 46 | * @param testDocs test documents (may be null) 47 | * @param valDocs validation documents (may be null) 48 | * @param resDocs resubstitution documents (may be null) 49 | * @param beginMargin positive margin for begin perceptron 50 | * @param endMargin positive margin for end perceptron 51 | * @param cueMargin positive margin for cue perceptron 52 | * @param numIter number of epochs for training 53 | * @param perceptrons optionally: specify some pre-trained perceptrons 54 | * @param crfClassifier optionally: specify a pre-trained CRF 55 | * @return final CRF model 56 | * @throws IOException 57 | * @throws ClassNotFoundException 58 | */ 59 | public static CrfClassifier runCrfPipeline(List trainDocs, List testDocs, List valDocs, List resDocs, 60 | double beginMargin, double endMargin, double cueMargin, 61 | int numIter, QuotationPerceptrons perceptrons, CrfClassifier crfClassifier) throws IOException, ClassNotFoundException { 62 | 63 | // train a cue model if necessary, then predict 64 | if (perceptrons == null) { 65 | PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, true, 10, 10); 66 | } else { 67 | perceptrons.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs); 68 | perceptrons.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs); 69 | } 70 | 71 | // train CRF 72 | if (crfClassifier == null) { 73 | crfClassifier = new CrfClassifier(); 74 | crfClassifier.numIter = numIter; 75 | crfClassifier.train(trainDocs, testDocs, valDocs, resDocs); 76 | } 77 | 78 | // apply CRF 79 | System.out.println("Applying CRF to test data"); 80 | crfClassifier.test(trainDocs, testDocs, valDocs, resDocs); 81 | 82 | // evaluate 83 | EvaluateSpan.evaluateAndPrint("", "|", trainDocs, testDocs, valDocs, resDocs); 84 | 85 | // save predictions 86 | 
Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs); 87 | 88 | // output feature weights 89 | // this takes a lot of time, so it's deactivated right now 90 | if (false) crfClassifier.print(); 91 | 92 | return crfClassifier; 93 | } 94 | 95 | /** 96 | * This runs the full experimental pipeline w/ training and testing 97 | * @return 98 | * @throws ClassNotFoundException 99 | * @throws SAXException 100 | * @throws ParserConfigurationException 101 | * @throws IOException 102 | */ 103 | public static CrfClassifier fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException { 104 | ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance()); 105 | List trainDocs = pc.getTrain(); 106 | List testDocs = pc.getTest(); 107 | List valDocs = pc.getDev(); 108 | List resDocs = pc.getTrainSample(10); 109 | 110 | return runCrfPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, 500, null, null); 111 | } 112 | 113 | /** 114 | * Running this program will train the CRF model as described in the paper 115 | * @param args 116 | * @throws ClassNotFoundException 117 | * @throws SAXException 118 | * @throws ParserConfigurationException 119 | * @throws IOException 120 | */ 121 | public static void main(String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException { 122 | String logFileName = NewStaticPrinter.getLogFileName(Common.pathConcat(StaticConfig.outputDirectory, "crf-")); 123 | NewStaticPrinter.init(logFileName); 124 | MultiOutputStream.init(logFileName); 125 | 126 | CrfClassifier crf = fullExperiment(); 127 | crf.saveCrf(logFileName + ".crfmodel"); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/run/RunHeuristicTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file 
is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.run; 19 | 20 | import ims.cs.lingdata.Document; 21 | import ims.cs.parc.PARCCorpus; 22 | import ims.cs.parc.ProcessedCorpus; 23 | import ims.cs.qsample.evaluate.EvaluateSpan; 24 | import ims.cs.qsample.greedysample.HeuristicSampler; 25 | import ims.cs.qsample.models.QuotationPerceptrons; 26 | import ims.cs.qsample.perceptron.PerceptronTrainer; 27 | import ims.cs.util.MultiOutputStream; 28 | import ims.cs.util.NewStaticPrinter; 29 | import ims.cs.util.StaticConfig; 30 | import org.xml.sax.SAXException; 31 | 32 | import javax.xml.parsers.ParserConfigurationException; 33 | import java.io.IOException; 34 | import java.util.List; 35 | 36 | /** 37 | * Run an experiment with the greedy heuristic model 38 | * Created by scheibcn on 11/5/15. 
39 | */ 40 | public class RunHeuristicTest { 41 | 42 | // whether to shuffle tokens during prediction 43 | static boolean doShuffleTokens = false; 44 | static boolean incrementalPrediction = false; 45 | 46 | /** 47 | * Run the full greedy heuristic training and testing pipeline 48 | * @param trainDocs training documents 49 | * @param testDocs test documents (may be null) 50 | * @param valDocs validation documents (may be null) 51 | * @param resDocs resubstitution documents (may be null) 52 | * @param beginMargin positive margin for begin perceptron 53 | * @param endMargin positive margin for end perceptron 54 | * @param cueMargin positive margin for cue perceptron 55 | * @param model optionally: specify some pre-trained perceptrons 56 | * @return final perceptron models 57 | */ 58 | public static QuotationPerceptrons runHeuristicPipeline(List trainDocs, List testDocs, List valDocs, List resDocs, 59 | double beginMargin, double endMargin, double cueMargin, QuotationPerceptrons model) { 60 | 61 | // train model or predict 62 | if (model == null) { 63 | model = PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, false, 10, 10); 64 | } else { 65 | model.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs); 66 | model.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs); 67 | } 68 | 69 | 70 | // debug output 71 | for (Document document: testDocs) NewStaticPrinter.printPerceptronPrediction(document, "PP"); 72 | NewStaticPrinter.printN("-", 80); 73 | 74 | // SAMPLING 75 | HeuristicSampler sampler = new HeuristicSampler(); 76 | sampler.doShuffleTokens = doShuffleTokens; 77 | 78 | 79 | int[] maxDistances; 80 | int[] maxLengths; 81 | 82 | if (incrementalPrediction) { /* version 1: incremental prediction -- performs slightly worse */ 83 | maxDistances = new int[]{5, 10, 20, 30}; 84 | maxLengths = new int[]{50, 50, 50, 50}; 85 | } else { /* version 2: full prediction immediately */ 86 | 
maxDistances = new int[]{30}; 87 | maxLengths = new int[]{50}; 88 | } 89 | 90 | for (int i = 0; i < maxDistances.length; i++) { 91 | // sample 92 | int maxDistance = maxDistances[i]; 93 | int maxLength = maxLengths[i]; 94 | 95 | if (trainDocs != null) sampler.sampleGreedy(trainDocs, maxDistance, maxLength); 96 | if (testDocs != null) sampler.sampleGreedy(testDocs, maxDistance, maxLength); 97 | if (valDocs != null) sampler.sampleGreedy(valDocs, maxDistance, maxLength); 98 | if (resDocs != null) sampler.sampleGreedy(resDocs, maxDistance, maxLength); 99 | 100 | // evaluate 101 | EvaluateSpan.evaluateAndPrint("" + maxDistance + " ", "|", trainDocs, testDocs, valDocs, resDocs); 102 | } 103 | 104 | // save predictions 105 | Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs); 106 | 107 | return model; 108 | } 109 | 110 | /** 111 | * This runs the full experimental pipeline w/ training and testing 112 | * @throws ClassNotFoundException 113 | * @throws SAXException 114 | * @throws ParserConfigurationException 115 | * @throws IOException 116 | */ 117 | public static void fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException { 118 | ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance()); 119 | List trainDocs = pc.getTrain(); 120 | List testDocs = pc.getTest(); 121 | List valDocs = pc.getDev(); 122 | List resDocs = pc.getTrainSample(10); 123 | 124 | runHeuristicPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, null); 125 | } 126 | 127 | /** 128 | * Run this to train a model without going through QSample.main() 129 | * @param args 130 | * @throws ClassNotFoundException 131 | * @throws SAXException 132 | * @throws ParserConfigurationException 133 | * @throws IOException 134 | */ 135 | public static void main (String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException { 136 | String 
logFileName = NewStaticPrinter.getLogFileName("/home/users1/scheibcn/quotations/results/txt/joint-first-run/heuristic-"); 137 | MultiOutputStream.init(logFileName); 138 | NewStaticPrinter.init(logFileName); 139 | 140 | fullExperiment(); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/spans/SpanBegin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.spans; 19 | 20 | import ims.cs.qsample.greedysample.HasScore; 21 | 22 | /** 23 | * Representation of a span begin. It is useful to have this as a separate class since this makes sampling easier. 24 | * Created by scheibcn on 11/5/15. 
25 | */ 26 | public class SpanBegin implements HasScore { 27 | // each begin has a position and a score 28 | public int position; 29 | public Double score = null; 30 | 31 | public SpanBegin(int position, double score) { 32 | this.position = position; 33 | this.score = score; 34 | } 35 | 36 | public SpanBegin(int position) { 37 | this.position = position; 38 | } 39 | 40 | public double getScore() { 41 | return score; 42 | } 43 | 44 | @Override 45 | public String toString() { 46 | return "SpanBegin(pos=" + position + ",score=" + score + ")"; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/qsample/spans/SpanEnd.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.qsample.spans; 19 | 20 | import ims.cs.qsample.greedysample.HasScore; 21 | 22 | /** 23 | * Representation of a span end. It is useful to have this as a separate class since this makes sampling easier. 24 | * Created by scheibcn on 11/5/15. 
25 | */ 26 | public class SpanEnd implements HasScore { 27 | // each end has a position and a score 28 | public int position; 29 | public Double score = null; 30 | 31 | public SpanEnd(int position, double score) { 32 | this.position = position; 33 | this.score = score; 34 | } 35 | 36 | public SpanEnd(int position) { 37 | this.position = position; 38 | } 39 | 40 | public double getScore() { 41 | return score; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/ims/cs/util/MultiOutputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of QSample. 3 | * QSample is free software: you can redistribute it and/or modify 4 | * it under the terms of the GNU General Public License as published by 5 | * the Free Software Foundation, either version 3 of the License, or 6 | * (at your option) any later version. 7 | * 8 | * QSample is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with QSample. If not, see . 15 | */ 16 | 17 | 18 | package ims.cs.util; 19 | 20 | import java.io.*; 21 | import java.text.DateFormat; 22 | import java.text.SimpleDateFormat; 23 | import java.util.Date; 24 | 25 | /** 26 | * An extension of the default output stream that provides functionality to write to multiple streams at once. 27 | * We can use this to "tee" standard out and standard error into a file, which makes for a cheap and somewhat dirty 28 | * logging alternative. 
/**
 * An extension of the default output stream that provides functionality to write to multiple streams at once.
 * We can use this to "tee" standard out and standard error into a file, which makes for a cheap and somewhat dirty
 * logging alternative.
 *
 * Adapted from http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to
 */
public class MultiOutputStream extends OutputStream
{

    // all streams that receive a copy of the written data, in the order they were passed in
    OutputStream[] outputStreams;

    /**
     * @param outputStreams the streams every write, flush and close is forwarded to
     */
    public MultiOutputStream(OutputStream... outputStreams)
    {
        this.outputStreams = outputStreams;
    }

    @Override
    public void write(int b) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b);
    }

    @Override
    public void write(byte[] b) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b);
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b, off, len);
    }

    @Override
    public void flush() throws IOException
    {
        for (OutputStream out : outputStreams)
            out.flush();
    }

    /**
     * Closes every underlying stream. A failure while closing one stream does not prevent the
     * remaining streams from being closed; the first failure is rethrown afterwards, with any
     * later failures attached as suppressed exceptions.
     * @throws IOException if at least one underlying stream failed to close
     */
    @Override
    public void close() throws IOException
    {
        IOException firstFailure = null;

        for (OutputStream out : outputStreams) {
            try {
                out.close();
            } catch (IOException e) {
                if (firstFailure == null) firstFailure = e;
                else firstFailure.addSuppressed(e);
            }
        }

        if (firstFailure != null) throw firstFailure;
    }

    /**
     * Write stdout and stderr to two separate files (in addition to the console).
     * If a file cannot be opened, the stack trace is printed and both streams are left unchanged.
     * @param fnOut file receiving a copy of standard out
     * @param fnErr file receiving a copy of standard error
     */
    public static void init(String fnOut, String fnErr) {
        System.out.println("Logging stdout to: " + fnOut);
        System.out.println("Logging stderr to: " + fnErr);

        try
        {
            FileOutputStream fout = new FileOutputStream(fnOut);
            FileOutputStream ferr;
            try {
                ferr = new FileOutputStream(fnErr);
            } catch (FileNotFoundException e) {
                // do not leak the first file if the second one cannot be opened
                try {
                    fout.close();
                } catch (IOException ignored) {
                    // best effort; the original failure is reported below
                }
                throw e;
            }

            MultiOutputStream multiOut = new MultiOutputStream(System.out, fout);
            MultiOutputStream multiErr = new MultiOutputStream(System.err, ferr);

            PrintStream stdout = new PrintStream(multiOut);
            PrintStream stderr = new PrintStream(multiErr);

            System.setOut(stdout);
            System.setErr(stderr);
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }

    }

    /**
     * Write stdout and stderr into the same file (in addition to the console).
     * After this call both System.out and System.err refer to the same print stream, which
     * forwards to the previous standard out and to the file.
     * If the file cannot be opened, the stack trace is printed and both streams are left unchanged.
     * @param fnOutAndErr file receiving a copy of standard out and standard error
     */
    public static void init(String fnOutAndErr) {
        System.out.println("Logging all output to: " + fnOutAndErr);

        try
        {
            FileOutputStream fout = new FileOutputStream(fnOutAndErr);

            MultiOutputStream multiOut = new MultiOutputStream(System.out, fout);

            PrintStream stdout = new PrintStream(multiOut);

            System.setOut(stdout);
            System.setErr(stdout);
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }

    }

}
33 | */ 34 | public class NewStaticPrinter { 35 | // printer may be turned off 36 | public static boolean isOn = true; 37 | 38 | public static String fileRoot; 39 | public static String fileName; 40 | static PrintWriter writer; 41 | 42 | /** 43 | * Pass function to do nothing. 44 | */ 45 | public static void pass() {} 46 | 47 | /** 48 | * Sets a log file name from the specified log file root 49 | * @param logFileName 50 | * @throws FileNotFoundException 51 | */ 52 | public static void init(String logFileName) throws FileNotFoundException { 53 | fileRoot = logFileName; 54 | fileName = logFileName + ".debug"; 55 | if (isOn) writer = new PrintWriter(fileName); 56 | } 57 | 58 | 59 | /** 60 | * Generates a log file name from the specified log file root 61 | * @param prefix 62 | * @return 63 | */ 64 | public static String getLogFileName (String prefix) { 65 | DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd-HH:mm:ss"); 66 | Date date = new Date(); 67 | 68 | return prefix + dateFormat.format(date) + ".log"; 69 | } 70 | 71 | /** 72 | * Print line to log file 73 | * @param s 74 | */ 75 | public static void println(String s) { 76 | if (isOn) { 77 | writer.write(s); 78 | writer.write("\n"); 79 | } 80 | } 81 | 82 | /** 83 | * Print to log file 84 | * @param s 85 | */ 86 | public static void print(String s) { 87 | if (isOn) { 88 | writer.write(s); 89 | } 90 | } 91 | 92 | /** 93 | * Print n copies of s to the log file 94 | * @param s 95 | * @param n 96 | */ 97 | public static void printN(String s, int n) { 98 | for (int i = 0; i < n; i++) print(s); 99 | println(""); 100 | } 101 | 102 | 103 | /** 104 | * Print the perceptron predictions for the given document to the log file 105 | * @param document 106 | * @param prefix string to prepend for each line 107 | */ 108 | public static void printPerceptronPrediction (Document document, String prefix) { 109 | for (Token token : document.getTokenList()) { 110 | StringBuilder line = new StringBuilder(); 111 | 112 | // prefix 113 | 
line.append(prefix); 114 | line.append("\t"); 115 | 116 | // add token information 117 | line.append(token.predText); 118 | line.append("\t"); 119 | 120 | 121 | // gold information 122 | boolean goldBegin = token.startsGoldContentSpan(); 123 | boolean goldEnd = token.endsGoldContentSpan(); 124 | boolean goldCue = token.isGoldCue(); 125 | 126 | if (goldBegin) line.append('B'); 127 | else line.append('_'); 128 | 129 | if (goldEnd) line.append('E'); 130 | else line.append('_'); 131 | 132 | if (goldCue) line.append('C'); 133 | else line.append('_'); 134 | 135 | line.append('\t'); 136 | 137 | // predicted information 138 | if (token.perceptronBeginScore > 0) line.append('B'); 139 | else line.append('_'); 140 | 141 | if (token.perceptronEndScore > 0) line.append('E'); 142 | else line.append('_'); 143 | 144 | if (token.isPredictedCue) line.append('C'); 145 | else line.append('_'); 146 | 147 | line.append('\t'); 148 | 149 | // scores 150 | line.append(token.perceptronBeginScore); line.append('\t'); 151 | line.append(token.perceptronEndScore); line.append('\t'); 152 | line.append(token.perceptronCueScore); line.append('\t'); 153 | line.append('\t'); 154 | 155 | // scores 156 | line.append(token.numTimesSampledBegin); line.append('\t'); 157 | line.append(token.numTimesSampledEnd); line.append('\t'); 158 | line.append(token.numTimesSampledCue); line.append('\t'); 159 | 160 | 161 | println(line.toString()); 162 | } 163 | } 164 | 165 | /** 166 | * Print document predictions and gold information using SGML-style tags 167 | * @param doc 168 | */ 169 | public static void printAnnotatedDocument(Document doc) { 170 | StringBuilder sb = new StringBuilder(); 171 | for (int i = 0; i < doc.tokenList.size(); i++) { 172 | if (Span.anyBeginsAt(doc.goldSpanSet, i)) sb.append(""); 173 | if (Span.anyBeginsAt(doc.predictedSpanSet, i)) sb.append(""); 174 | sb.append(doc.tokenList.get(i).predText); 175 | if (Span.anyEndsAt(doc.predictedSpanSet, i)) sb.append(""); 176 | if 
(Span.anyEndsAt(doc.goldSpanSet, i)) sb.append(""); 177 | sb.append(" "); 178 | } 179 | 180 | println(sb.toString()); 181 | } 182 | 183 | } 184 | --------------------------------------------------------------------------------