├── .gitignore
├── README.md
├── example
└── documents
│ ├── wikinews-20070223-train.txt
│ ├── wikinews-20070306-drain.txt
│ ├── wikinews-20160706-telescope.txt
│ └── wikinews-20160714-UN.txt
├── licenses
├── CC-SA-3.0
│ └── LICENSE.txt
├── LICENSE.md
├── gpl-3.0
│ └── LICENSE.txt
├── verbnet-license-3.0
│ └── LICENSE.txt
└── wordnet-license
│ └── LICENSE.txt
├── pom.xml
├── resources
└── PARC
│ ├── configs
│ ├── acl2016.crf.prop
│ ├── acl2016.greedy.prop
│ ├── acl2016.sampling.prop
│ ├── predpipeline.crf.prop
│ ├── predpipeline.greedy.prop
│ └── predpipeline.sampling.prop
│ ├── listfeatures
│ ├── attribution_nouns.txt
│ ├── krestel_verbs.txt
│ ├── organization.hyponyms.txt
│ ├── person.hyponyms.txt
│ ├── titles.txt
│ └── verbnet.txt
│ └── news.txt
└── src
└── main
└── java
└── ims
└── cs
├── bbn
├── BbnNeHandler.java
└── BbnNeParser.java
├── corenlp
├── DocumentAligner.java
├── Helper.java
├── IndexedWordIterator.java
├── PARCCoreNlpPipeline.java
└── TokenAligner.java
├── lingdata
├── ByteCount.java
├── Corpus.java
├── Document.java
├── DocumentId.java
├── GornAddressList.java
├── Partition.java
├── PlainTextCorpus.java
├── PlainTextDocId.java
├── Sentence.java
├── SentenceId.java
├── Token.java
├── Types.java
└── WSJId.java
├── mallet
├── DocumentFeatureSet2TokenSequence.java
└── PARCDocumentInstance.java
├── parc
├── PARCAttribution.java
├── PARCCorpus.java
├── ParcUtils.java
├── ProcessedCorpus.java
├── SpanLabelExtractor.java
└── xml
│ ├── PARCHandler.java
│ └── PARCParser.java
├── qsample
├── evaluate
│ ├── EvaluateClassifier.java
│ ├── EvaluateSpan.java
│ └── F1.java
├── features
│ ├── Binning.java
│ ├── BoundaryFeatures.java
│ ├── FeatureExtraction.java
│ ├── FeatureIndexMap.java
│ ├── FeatureIntSet.java
│ ├── FeatureSet.java
│ ├── FeatureStringSet.java
│ ├── SpanFeatures.java
│ └── components
│ │ ├── DocumentOffsetConjunction.java
│ │ ├── DocumentQuotationFeature.java
│ │ ├── SentenceConstituentFeatures.java
│ │ ├── SentenceDependencyFeatures.java
│ │ ├── SentenceFeaturesDerivedFromListCue.java
│ │ ├── SentenceIndicatorFeatures.java
│ │ ├── TokenDictFeatures.java
│ │ ├── TokenLexicalFeatures.java
│ │ └── TokenListFeatures.java
├── greedysample
│ ├── HasScore.java
│ ├── HeuristicSampler.java
│ ├── PerceptronSampler.java
│ └── Sampling.java
├── models
│ ├── CrfClassifier.java
│ ├── HigherSpanModel.java
│ └── QuotationPerceptrons.java
├── perceptron
│ ├── Perceptron.java
│ ├── PerceptronTrainer.java
│ └── Weights.java
├── run
│ ├── Common.java
│ ├── PlainTextCorpusReader.java
│ ├── QSample.java
│ ├── RunCrf.java
│ ├── RunHeuristicTest.java
│ └── RunPerceptronSampler.java
└── spans
│ ├── Span.java
│ ├── SpanBegin.java
│ └── SpanEnd.java
└── util
├── MultiOutputStream.java
├── NewStaticPrinter.java
└── StaticConfig.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
14 | # model binaries
15 | resources/PARC/models/*
16 | models.tar.gz
17 |
18 | # local configuration files
19 | resources/PARC/configs/local/
20 |
21 | # build output
22 | target/*
23 |
24 | # tool output
25 | output
26 |
27 | # intellij project
28 | *.iml
29 | *.idea
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | QSample
2 | =======
3 |
4 | QSample is a natural language processing tool for automatically
5 | detecting quotations in text.
6 |
7 |
8 | **Example:** In the sentence
9 |
10 | > Witnesses said that several passengers have broken bones.
11 |
12 | the span
13 |
14 | > *that several passengers have broken bones*
15 |
16 | is a quotation.
17 |
18 |
19 | Requirements
20 | ------------
21 |
22 | Java JVM (>= 1.7) and Maven (>= 3.0.0) need to be installed. All other
23 | dependencies will be downloaded automatically. The dependencies all
24 | together will amount to ~250 MB. The trained model files take up another
25 | ~80 MB.
26 |
27 |
28 | Setup
29 | --------
30 |
31 | Install the tool by running the following commands (NOTE: this will trigger a
32 | **~250 MB** Maven dependency download and will produce a .jar file of
33 | comparable size):
34 |
35 | git clone https://github.com/christianscheible/qsample.git
36 | cd qsample
37 | mvn compile
38 | mvn package
39 |
40 | If the build was successful, you will find two .jar files in `target/`
41 | (with and without dependencies, respectively).
42 |
43 | Next, download and unpack the pre-trained models (**~80 MB**):
44 |
45 | wget https://github.com/christianscheible/qsample/releases/download/0.1/models.tar.gz
46 | tar xzfv models.tar.gz
47 |
48 |
49 | Usage
50 | -----
51 |
52 | Now we are ready to detect quotations. As a first step, you can run the
53 | tool on the example documents we provide in `example/documents`. The
54 | expected format is a directory of plain text files, each containing a
55 | single document. To process the documents, run the following command:
56 |
57 | java -jar target/qsample-0.1-jar-with-dependencies.jar --sample example/documents/ output
58 |
59 | QSample will produce several files in the output directory:
60 |
61 | * `.log` file storing the messages that were also output to the command line
62 | * `.conf` file documenting the configuration used by the tool for this run
63 | * one `.quotations.gz` file for each document in the input directory
64 | containing the detected quotations
65 |
66 | The `.quotations.gz` files contain the predictions made by the model. As
67 | an example, take the following snippet:
68 |
69 | Witnesses 230 239 O O
70 | said 240 244 O C
71 | that 245 249 O B
72 | several 250 257 O I
73 | passengers 258 268 O I
74 | have 269 273 O I
75 | broken 274 280 O I
76 | bones 281 286 O E
77 | . 286 287 O O
78 |
79 | The output format consists of five columns. The first column contains
80 | the tokens; the second and third columns contain the byte begin and end
81 | positions of the tokens in the original input file; the fourth column
82 | contains the gold labels (if there are any); the fifth column contains
83 | the predicted quotes. The predictions are encoded using BIOE-style
84 | labels. The label `C` marks the occurrence of a *cue*, and all words
85 | between the `B` (begin) and `E` (end) tag are the *content* of the
86 | quotation.
87 |
88 |
89 | Data
90 | ----
91 |
92 | This repository includes the following data:
93 |
94 | * `example/documents`: Four news articles from WikiNews for
95 | testing. QSample expects one plain text document per file. You can
96 | mark paragraph boundaries in the text by adding an empty line after
97 | each paragraph. Knowledge about paragraphs is useful for detecting
98 | quotations. Linguistic pre-processing is performed by Stanford
99 | CoreNLP.
100 | * `resources/PARC/configs`: Configuration files for running experiments
101 | (see below). The `acl2016*` configurations use gold pre-processing,
102 | whereas the `predpipeline*` configurations use CoreNLP processing. For
103 | each setup, we supply one file for each of the methods used in the
104 | paper.
105 | * `resources/PARC/listfeatures`: Word lists for extracting features. We
106 | supply lists of attribution nouns and verbs, organizations and
107 | persons, titles, as well as a mapping of verbs to VerbNet
108 | classes. These lists were generated from third-party resources, see
109 | `licenses/LICENSE.md`.
110 | * `resources/PARC/news.txt`: A list of the IDs of WSJ documents that are news articles.
111 |
112 |
113 | Running an experiment
114 | ---------------------
115 |
116 | To run an experiment on annotated data, you need to obtain several
117 | resources:
118 |
119 | * Penn Attribution Relations Corpus (PARC3, http://homepages.inf.ed.ac.uk/s1052974/resources.php)
120 | * Penn Treebank 2 (https://catalog.ldc.upenn.edu/LDC95T7)
121 | * BBN Pronoun Coreference and Entity Type Corpus (https://catalog.ldc.upenn.edu/LDC2005T33)
122 |
123 | Afterwards, you can run experiments based on the configuration files in
124 | `resources/PARC/configs/`. To test the pre-trained models, you need to
125 | adapt the paths in the configuration files. To train a model, you can
126 | simply switch from `TEST` to `TRAIN` mode in the configuration.
127 |
128 |
129 | More information
130 | ----------------
131 |
132 | For more information, refer to our paper (available at
133 | http://www.aclweb.org/anthology/P/P16/P16-1164.pdf):
134 |
135 | @InProceedings{scheibleklingerpado2016,
136 | author = {Scheible, Christian and Klinger, Roman and Pad\'{o}, Sebastian},
137 | title = {Model Architectures for Quotation Detection},
138 | booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics},
139 | pages = {1736-1745},
140 | year = {2016}
141 | }
142 |
143 |
144 | or check the tool's website at
145 | http://www.ims.uni-stuttgart.de/data/qsample for news.
146 |
147 |
148 | License
149 | -------
150 |
151 | Please see `licenses/LICENSE.md`.
152 |
--------------------------------------------------------------------------------
/example/documents/wikinews-20070223-train.txt:
--------------------------------------------------------------------------------
1 | A 9 Carriage Class 390 Pendolino train, with as many as 180 people onboard, operated by Virgin Trains has derailed and crashed in Cumbria, England.
2 |
3 | The train was the 17:15 service from London's Euston Station to Glasgow Central. Witnesses said that several passengers have broken bones. All but one carriage have slid down the embankment and all of the carriages were derailed. 5 people are still seriously injured in hospital, including the driver Ian Black, and 11 people altogether are still in hospital. Hospital reports early on Saturday morning indicate 1 death, 84 year old lady Margaret Masson from Glasgow.
4 |
5 | "It's our understanding there are a number of people injured on the train. We think there are numerous injuries," said a spokesman for the Cumbria Fire and Rescue squad, Brian Mitchelhill.
6 |
7 | "A train has crashed between Oxenholme and Tebay, but that is all we know at the moment. We have got two cars travelling there now and local police are attending," said a spokeswoman for the British Transport Police.
8 |
9 | At least twelve ambulances and at least five fire trucks are at the scene, along with 3 Royal Air Force Sea King helicopters, two mountain rescue teams and one police helicopter. The rescue effort was hampered by limited access to the remote site and poor weather conditions.
10 |
11 | "At the moment, we have reports of various injuries, from leg injuries to back injuries and head injuries -- ranging from minor to quite serious," said an unnamed ambulance official.
12 |
13 | Virgin West Coast Class 390 Pendolino EMU 390009 at platform 1 of Carlisle railway station.
14 | "The carriage I am in is completely on its side, it did a sort of bump - and I was thinking don't worry this fine - but then the swaying became very dramatic," said a BBC executive, Caroline Thomson. "Where I am there are some injuries - a woman with a very badly hurt back but I suspect further up the train it's a bit more serious."
15 |
16 | First reports suggested that the train hit something on the track at 20:15 near Grayrigg, between Oxenholme and Tebay, derailed and slid down an embankment. Early investigators reports say a set of points will be the primary focus of the investigation. All but one of the train's carriages are said to have totally come off the tracks, a spokeswoman said.
17 |
18 | So far 12 passengers have been taken to The Royal Preston Hospital, where three casualties are said to be in a "critical condition" and one serious. 3 passengers with limb injuries are at the Royal Lancaster Infirmary. People with minor injuries were taken to a nearby farm before being transferred to a Kendal hotel.
19 |
20 | As of 22:16, Sleeper services over the affected line have been suspended. Other major services are terminating at Preston or Carlisle according to (UK) National Rail Enquires. A five mile cordon has been set up to seal the crash site as investigators from the Rail Accident Investigation Branch attempt to determine the cause of the crash.
21 |
22 | Virgin owner Richard Branson came to the site from his holiday in Switzerland and held a press conference at a safe distance, roughly 200 meters, from the site. Branson stated that the Virgin Train Pendolino was "built like a tank", and believed the track was to blame. All of the carriages retained their integrity and none of the windows broke. Mr Branson also praised the driver of the train for attempting to stop the train and not leaving the cockpit.
23 |
24 | Network Rail has checked 700 sets of rail points in response to the accident, though no speed restrictions have been imposed.
25 |
--------------------------------------------------------------------------------
/example/documents/wikinews-20070306-drain.txt:
--------------------------------------------------------------------------------
1 | A rupture in the underground stormwater drain system opened a huge sinkhole on February 23, killing three people and bringing down twelve houses in Guatemala City.
2 |
3 | Teenagers Irma and David Soyos and their father, 53-year old Domingo Soyos were killed when their house collapsed into the sinkhole. Nearly a thousand people were evacuated from the San Antonio neighborhood after the collapse.
4 |
5 | Wikinews interviewed Eric Haddox, a civil engineer who has visited the site of the sinkhole and spoken to the engineers working on fixing the drain. Mr. Haddox, who specialises in the building of earthworks, roads, water supply and sewage systems, and is working as a missionary in Guatemala, visited the site following the collapse to help in the recovery effort.
6 |
7 | Mr. Haddox told us that the size of the hole is much smaller than the 330 feet depth originally reported and that the erosion causing the collapse is believed to have happened over a long time, and not just during the recent rains as initially suspected.
8 |
9 | There are also concerns that a four-story building less than a metre from the edge of the hole may collapse as the earth under the building continues to be eroded.
10 |
11 | Before the collapse, a junction box linked two collector pipes to a 3.5m main pipe leading to a nearby canyon in a system believed to be 20 to 50 years old. The surrounding earth had been filled in artificially to level the ground, but the fill was not well compacted before being built upon. Such leveling of the ground is widespread in Guatemala city.
12 |
13 | It is thought that, at some point in the last 20 years, either one of the collector pipes ruptured or was detached from the junction box, possibly because of seismic activity. Water gushing out of the break following rainstorms gradually eroded the loosely compacted soil, creating an expanding cavern around the junction box. On February 23, the roof of this cavern collapsed, creating the sinkhole, 20m wide at the top and tapering out towards the bottom, which is about 60m (204 feet) deep, not 330 feet as originally reported.
14 |
15 | "Things like this don't happen often and there are many interesting engineering lessons to be learned with them", Mr. Haddox said.
16 |
17 | The sinkhole has continued to expand even after the collapse, since the collector pipes continue to carry water, which cascades 15m down the sinkhole to the main pipe, further eroding the sides of the sinkhole. The hole was about 25m wide at the top and 40m wide at the bottom a week ago.
18 |
19 | A bypass pipe is being laid to divert the water away from the junction to arrest further erosion. The sinkhole will then have to be drained before repair work can begin.
20 |
21 | Authorities are also concerned that similar breakages and undermining may be happening at other locations, Mr. Haddox said. Muddy water has been seen coming out of the main collector pipes, but it is not certain whether this is due to ruptures elsewhere or simply mud from the surface that has been washed into the drainage system.
22 |
23 |
--------------------------------------------------------------------------------
/example/documents/wikinews-20160706-telescope.txt:
--------------------------------------------------------------------------------
1 | On Sunday, China announced the attachment of the final panel to its telescope named Five hundred meter Aperture Spherical Telescope (FAST). This piece marks the end of a five-year-long US$180 million (CNY¥1.2 billion) construction project.
2 |
3 | FAST comprises about 4,500 panels and spans a diameter of 500 meters (about 1640 feet). The telescope is part of a series of ventures into space exploration by China, including planning another robotic Moon mission and creating a Chinese space station, with its core module set to be launched into space in 2018. With the country's founding centenary coming in 2049, Chinese President Xi Jinping said during a Beijing conference, "great scientific and technological capacity is a must for China to be strong".
4 |
5 | In order to achieve optimal electromagnetic performance for FAST with minimal signal interference, it was built in the South China Karst. This ultimately forced the relocation of about 9,100 inhabitants within a 3.1-mile (5km) radius of the telescope. The residents received about US$1,800 (CNY¥12,000) in reimbursement, with those experiencing difficulties with housing receiving about US$1,500 (CNY¥10,000) in extra compensation. The Chinese government supports the resettlement, with senior party official Li Yuecheng saying the relocation would provide a "sound electromagnetic wave environment".
6 |
7 | The telescope is now the largest-diameter single-dish radio telescope. It took the spot from the 305-meter diameter Arecibo Observatory telescope in Puerto Rico. Russia's RATAN-600 multi-element radio telescope has a diameter of 576 meters. This adds to China's record-defying achievements; it contains the world's largest bridge and the world's longest wall, the Great Wall of China.
8 |
9 | The telescope is set to be ready for use in September. Its possible uses include exploration for pulsars, a special type of neutron stars detected through their emission of radio pulses. Scientists have also described the telescope's potential to explore alien civilization, with NAO Radio Astronomy Technology Laboratory director Peng Bo saying FAST's "potential to discover an alien civilization will be 5 to 10 times that of current equipment, as it can see farther and darker planets".
10 |
11 |
--------------------------------------------------------------------------------
/example/documents/wikinews-20160714-UN.txt:
--------------------------------------------------------------------------------
1 | On Tuesday, a United Nations (UN) tribunal in The Hague dismissed China's sovereignty claims to the South China Sea, a body of water connecting to the Pacific Ocean which is also bordered by the Philippines, Vietnam, Brunei, Malaysia, Taiwan, and Indonesia. Court battles over the claims between China and the Philippines go back to 2013.
2 |
3 | These claims were established by China during the reign of its Nationalist government in the 1940s, marked by a demarcation line nicknamed its Nine-dash line. Its line stretched hundreds of miles from the Chinese mainland, including about nine tenths of the entire sea. The South China Sea is a valuable property, providing passage for about US$5 trillion in trade by planes and boats every year. China is not the only country to claim large parts of the sea; notably, Taiwan and Vietnam have also done so, but other large-scale claimants have been less militarily active about their claims than China.
4 |
5 | China has built several artificial islands and military bases in the South China Sea. The tribunal scolded the impeding of fishing and exploration in the sea by China, which it deemed against the United Nations Convention on the Law of the Sea (UNCLOS), signed by China in 1982. The tribunal also concluded China knowingly permitted the poaching of endangered turtles and clams as well as destroyed coral reefs to construct artificial islands.
6 |
7 | UNCLOS permits countries to claim a 200-nautical mile area from their mainland, referred to as an exclusive economic zone. It also permits freedom of navigation, allowing unimpeded exploration through "high seas": international waters also available for the use of fishing and trade passages.
8 |
9 | There is no process to enforce the decision. UNCLOS allows countries to exclude themselves from "compulsory binding procedures for the settlement of disputes" as defined in Part XV, Section 3 - Article 298. China exercised this right to exclude themselves from compulsory binding procedures on August 25, 2006. They reject the jurisdiction or authority of the tribunal's findings. Various other countries have also exercised Article 298 partially or fully, such as Australia, Canada, the UK, Russia, and France.
10 |
11 | Many nations made statements after the decision. The Chinese government opposed the decision, calling it "ill-founded". It said "China neither accepts nor recognizes" the decision. The Philippine government referred to the decision as a "milestone decision". The US, a key ally with many of the countries claiming parts of the sea, said it was an "important contribution to the shared goal of a peaceful resolution to disputes in the South China Sea".
12 |
--------------------------------------------------------------------------------
/licenses/LICENSE.md:
--------------------------------------------------------------------------------
1 | Code
2 | ====
3 |
4 | Our code is, unless otherwise specified below, subject to the GPL 3.0 license (`gpl-3.0/`).
5 |
6 | MultiOutputStream based on code by
7 | http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to
8 | (CodeProject license, http://www.codeproject.com/info/cpol10.aspx)
9 |
10 |
11 | Resources
12 | =========
13 |
14 | `resources/PARC/listfeatures/`: Lists and dictionaries for feature extraction
15 |
16 | * `attribution_nouns.txt`: List of attribution nouns published by Pareti (2015).
17 | * `krestel_verbs.txt`: List of attribution verbs published by Krestel et al. (2008).
18 | * `organization.hyponyms.txt`, `person.hyponyms.txt`: Lists of persons and organizations, extracted from WordNet (WordNet license, `wordnet-license/LICENSE.txt`)
19 | * `titles.txt`: List of titles collected from Wikipedia page https://en.wikipedia.org/wiki/Title (CC-SA license, http://creativecommons.org/licenses/by-sa/3.0/)
20 | * `verbnet.txt`: VerbNet category mappings (VerbNet license, `verbnet-license-3.0/LICENSE.txt`)
21 |
22 | `resources/PARC/news.txt`: List of WSJ news articles by http://www.let.rug.nl/~bplank/metadata/genre_files_updated.html
23 |
24 | `example/documents`: Four news documents from WikiNews (CC-SA license)
25 |
--------------------------------------------------------------------------------
/licenses/verbnet-license-3.0/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | VerbNet 3.0 License (also applies to VerbNet 3.X versions)
3 |
4 | This software and database is being provided to you, the LICENSEE, by
5 | the University of Colorado under the following license. By obtaining, using
6 | and/or copying this software and database, you agree that you have
7 | read, understood, and will comply with these terms and conditions.:
8 |
9 | Permission to use, copy, modify and distribute this software and
10 | database and its documentation for any purpose and without fee or
11 | royalty is hereby granted, provided that you agree to comply with
12 | the following copyright notice and statements, including the disclaimer,
13 | and that the same appear on ALL copies of the software, database and
14 | documentation, including modifications that you make for internal
15 | use or for distribution.
16 |
17 | VerbNet 3.0 (or 3.X) Copyright 2009 by University of Colorado. All rights reserved.
18 |
19 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND THE UNIVERSITY
20 | OF COLORADO MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
21 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, UNIVERSITY
22 | OF COLORADO MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
23 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
24 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
25 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
26 | OTHER RIGHTS.
27 |
28 | The name of University of Colorado or CU may not be used in
29 | advertising or publicity pertaining to distribution of the software
30 | and/or database. Title to copyright in this software, database and
31 | any associated documentation shall at all times remain with
32 | University of Colorado and LICENSEE agrees to preserve same.
33 |
34 | Please reference the following document(s) in any description of
35 | applications based on VerbNet 3.0 or 3.X:
36 |
37 | Karin Kipper, Anna Korhonen, Neville Ryant, Martha Palmer,
38 | A Large-scale Classification of English Verbs,
39 | Language Resources and Evaluation Journal, 42(1), pp. 21-40,
40 | Springer Netherland, 2008.
41 |
42 | and/or
43 |
44 | Karin Kipper Schuler, Anna Korhonen, Susan W. Brown, VerbNet overview,
45 | extensions, mappings and apps, Tutorial, NAACL-HLT 2009, Boulder,
46 | Colorado.
47 |
--------------------------------------------------------------------------------
/licenses/wordnet-license/LICENSE.txt:
--------------------------------------------------------------------------------
1 | WordNet Release 3.0
2 |
3 | This software and database is being provided to you, the LICENSEE, by Princeton University under the following license. By obtaining, using and/or copying this software and database, you agree that you have read, understood, and will comply with these terms and conditions.: Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty is hereby granted, provided that you agree to comply with the following copyright notice and statements, including the disclaimer, and that the same appear on ALL copies of the software, database and documentation, including modifications that you make for internal use or for distribution. WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton University or Princeton may not be used in advertising or publicity pertaining to distribution of the software and/or database. Title to copyright in this software, database and any associated documentation shall at all times remain with Princeton University and LICENSEE agrees to preserve same.
4 |
5 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | QSample
6 | ims.cs
7 | qsample
8 | 0.1
9 | 2016
10 |
11 | 1.7
12 | 4.10
13 | UTF-8
14 |
15 |
16 | IMS, University of Stuttgart, Germany
17 | http://www.ims.uni-stuttgart.de/~scheibcn
18 |
19 |
20 |
21 |
22 |
23 |
24 | junit
25 | junit
26 | ${junit.version}
27 |
28 |
29 |
30 | edu.stanford.nlp
31 | stanford-corenlp
32 | 3.9.2
33 |
34 |
35 |
36 | edu.stanford.nlp
37 | stanford-corenlp
38 | 3.9.2
39 | models
40 |
41 |
42 |
43 | cc.mallet
44 | mallet
45 | 2.0.7
46 |
47 |
48 |
49 |
50 | org.apache.commons
51 | commons-lang3
52 | 3.0
53 |
54 |
55 |
56 |
57 | net.sf.jgrapht
58 | jgrapht
59 | 0.8.3
60 |
61 |
62 |
63 |
64 |
65 | src/main/java
66 |
67 |
68 |
69 | org.apache.maven.plugins
70 | maven-compiler-plugin
71 | 3.5.1
72 |
73 | 1.7
74 | 1.7
75 |
76 |
77 |
78 |
79 |
80 |
81 | maven-assembly-plugin
82 | 2.6
83 |
84 |
85 | make-assembly
86 | package
87 |
88 | attached
89 |
90 |
91 |
92 |
93 |
94 |
95 | ims.cs.qsample.run.QSample
96 |
97 |
98 |
99 | jar-with-dependencies
100 |
101 |
102 |
103 |
104 |
105 | org.apache.maven.plugins
106 | maven-jar-plugin
107 | 3.0.0
108 |
109 |
110 | make-assembly
111 | package
112 |
113 |
114 |
115 |
116 | **/log4j.properties
117 |
118 |
119 |
120 | ims.cs.qsample.run.QSample
121 |
122 |
123 |
124 |
125 |
126 |
127 | maven-release-plugin
128 | 2.1
129 |
130 |
131 | release
132 | deploy package
133 |
134 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.crf.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 CRF results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=CRF
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.greedy.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 greedy results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=GREEDY
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.sampling.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 sampling results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=SAMPLE
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.crf.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 CRF results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=CRF
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.greedy.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 greedy results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=GREEDY
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.sampling.prop:
--------------------------------------------------------------------------------
1 | # Properties file to replicate the ACL 2016 sampling results
2 | # To run this, please set the following paths first:
3 | # path for writing output
4 | outputDirectory=/path/to/output
5 | # path to PARC3
6 | parcRoot=/path/to/PARC3_complete
7 | # path to PTB raw data
8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
9 | # path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | # path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 |
14 | # you may optionally change the following settings
15 | # switch this to true if you want a lot of debug output
16 | verbose=false
17 | # switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | # switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | # switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 |
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=SAMPLE
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 |
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 |
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 |
85 |
--------------------------------------------------------------------------------
/resources/PARC/listfeatures/attribution_nouns.txt:
--------------------------------------------------------------------------------
1 | accord
2 | bill
3 | counterclaim
4 | document
5 | formulation
6 | according
7 | call
8 | criticism
9 | doubt
10 | guess
11 | accusation
12 | challenge
13 | critic
14 | effort
15 | highlight
16 | acknowledgement
17 | charge
18 | cry
19 | elaboration
20 | hint
21 | ad
22 | chart
23 | data
24 | encouragement
25 | hope
26 | admission
27 | citation
28 | decision
29 | eruption
30 | idea
31 | advice
32 | claim
33 | declaration
34 | estimate
35 | illustration
36 | agreement
37 | command
38 | deduction
39 | eulogy
40 | implication
41 | allegation
42 | comment
43 | defence
44 | evidence
45 | imposition
46 | amendment
47 | commercial
48 | definition
49 | exclamation
50 | indication
51 | announcement
52 | complaint
53 | deliberation
54 | expectation
55 | information
56 | answer
57 | concern
58 | demand
59 | explanation
60 | insinuation
61 | anticipation
62 | concession
63 | denial
64 | expression
65 | inspiration
66 | argument
67 | conclusion
68 | depiction
69 | fear
70 | instruction
71 | article
72 | condition
73 | description
74 | feeling
75 | intention
76 | assertion
77 | confession
78 | dictate
79 | file
80 | interjection
81 | assumption
82 | confidence
83 | disappointment
84 | filing
85 | interpretation
86 | assurance
87 | confirmation
88 | disapproval
89 | find
90 | issue
91 | belief
92 | consideration
93 | disclosure
94 | finding
95 | joke
96 | bet
97 | contention
98 | discovery
99 | figure
100 | knowledge
101 | book
102 | convinction
103 | dispute
104 | forecast
105 | lament
106 | laugh
107 | offer
108 | question
109 | response
110 | support
111 | law
112 | opinion
113 | quotation
114 | revelation
115 | supposition
116 | lawsuit
117 | order
118 | realization
119 | rule
120 | survey
121 | lecture
122 | pact
123 | reason
124 | rumor
125 | suspicion
126 | legislation
127 | paper
128 | recognition
129 | saying
130 | talk
131 | lesson
132 | permission
133 | recollection
134 | scream
135 | temptation
136 | letter
137 | plan
138 | recommendation
139 | shout
140 | testimony
141 | list
142 | pledge
143 | recount
144 | sigh
145 | theory
146 | menace
147 | point
148 | reflection
149 | sign
150 | thought
151 | mention
152 | policy
153 | reform
154 | signal
155 | threat
156 | message
157 | poll
158 | refusal
159 | snort
160 | understandment
161 | mind
162 | praise
163 | rejection
164 | specification
165 | urge
166 | moan
167 | prediction
168 | remark
169 | speculation
170 | view
171 | need
172 | press
173 | repetition
174 | spell
175 | voice
176 | news
177 | proclamation
178 | reply
179 | statement
180 | want
181 | note
182 | project
183 | report
184 | statistic
185 | warning
186 | notice
187 | promise
188 | reproach
189 | story
190 | wisdom
191 | notification
192 | proposal
193 | request
194 | strategy
195 | worry
196 | oath
197 | protest
198 | requirement
199 | study
200 | yell
201 | objection
202 | prove
203 | research
204 | suggestion
205 | observation
206 | provision
207 | resentment
208 | suit
--------------------------------------------------------------------------------
/resources/PARC/listfeatures/krestel_verbs.txt:
--------------------------------------------------------------------------------
1 | according
2 | accuse
3 | acknowledge
4 | add
5 | admit
6 | agree
7 | allege
8 | announce
9 | argue
10 | assert
11 | believe
12 | blame
13 | charge
14 | cite
15 | claim
16 | complain
17 | concede
18 | conclude
19 | confirm
20 | contend
21 | criticize
22 | declare
23 | decline
24 | deny
25 | describe
26 | disagree
27 | disclose
28 | estimate
29 | explain
30 | fear
31 | hope
32 | insist
33 | maintain
34 | mention
35 | note
36 | order
37 | predict
38 | promise
39 | recall
40 | recommend
41 | reply
42 | report
43 | say
44 | state
45 | stress
46 | suggest
47 | tell
48 | testify
49 | think
50 | urge
51 | warn
52 | worry
53 | write
54 | observe
--------------------------------------------------------------------------------
/resources/PARC/listfeatures/titles.txt:
--------------------------------------------------------------------------------
1 | Mr
2 | Mrs
3 | Ms
4 | Mr.
5 | Mrs.
6 | Ms.
7 | Miss
8 | Mister
9 | Madam
10 | Hon.
11 | MP
12 | MYP
13 | Representative
14 | Senator
15 | Speaker
16 | President
17 | Councillor
18 | Alderman
19 | Selectman
20 | Delegate
21 | Mayor
22 | Lady
23 | Mayoress
24 | Lord
25 | Governor
26 | Lieutenant
27 | Prefect
28 | Prelate
29 | Premier
30 | Burgess
31 | Ambassador
32 | Envoy
33 | Secretary
34 | Cardinal
35 | Attaché
36 | Chargé
37 | Provost
38 | Prince
39 | Princess
40 | Archduke
41 | Archduchess
42 | Duke
43 | Duchess
44 | Marquis
45 | Marquess
46 | Marquise
47 | Marchioness
48 | Count
49 | Countess
50 | Earl
51 | Viscount
52 | Viscountess
53 | Baron
54 | Baroness
55 | Emperor
56 | Empress
57 | King
58 | Queen
59 | Tsar
60 | Tsarina
61 | Leader
62 | Pope
63 | Sir
64 | Dame
65 | Advocate
66 | Attorney
67 | Bailiff
68 | Barrister
69 | Chancellor
70 | Judge
71 | Justice
72 | Clerk
73 | Magistrate
74 | Promagistrate
75 | Mufti
76 | Grand Mufti
77 | Privy
78 | Counsellor
79 | Majesty
80 | Solicitor
81 | Abbess
82 | Abbot
83 | Brother
84 | Sister
85 | Mother
86 | Superior
87 | Friar
88 | Bishop
89 | Archbishop
90 | Metropolitan
91 | Presbyter
92 | Priest
93 | Priestess
94 | Father
95 | Fr.
96 | Patriarch
97 | Pope
98 | Catholicos
99 | Vicar
100 | Chaplain
101 | Canon
102 | Pastor
103 | Prelate
104 | Primate
105 | Dom
106 | Cardinal
107 | Venerable
108 | Blessed
109 | Saint
110 | Christ
111 | Deacon
112 | Archdeacon
113 | Acolyte
114 | Dean
115 | Elder
116 | Minister
117 | Monsignor
118 | Reader
119 | Almoner
120 | Dr.
121 | Dr
122 | MD
123 | PhD
124 | EdD
125 | PharmD
126 | LLD
127 | JD
128 | Prof
129 | Prof.
130 | Professor
131 | Colonel
132 | General
133 | Commodore
134 | Corporal
135 | Mate
136 | Sergeant
137 | Admiral
138 | Brigadier
139 | Captain
140 | Commander
141 | General
142 | Officer
143 | Lieutenant
144 | Major
145 | Private
146 | Constable
147 | Agent
148 | Principal
149 | Nurse
150 | Nanny
151 | Coach
152 | Wizard
153 | Chief
154 | Scout
155 | Lama
156 | Dalai
157 | Panchen
158 | Druid
159 | Archdruid
160 | Rabbi
161 | Rebbe
162 | Hakham
163 | Buddha
164 | Ayatollah
165 | Imam
166 | Bodhisattva
167 | Mullah
168 | Kohen
169 | Nat
170 | Mahdi
171 | Rosh
172 | HaYeshiva
173 | Saoshyant
174 | Tirthankar
175 | Vardapet
176 | Mahatma
177 | Pandit
178 | Swami
179 | Ustad
180 | Sheikh
181 | Emir
182 | Emira
183 | Sultan
184 | Sultana
185 | Maharajah
186 | Maharani
187 | Eze
188 | Mwami
189 | Nizam
190 | Dato
191 | Oba
192 | Tor
193 | Tiv
194 | Obi
195 | Elder
196 | Vizier
197 | Grand
--------------------------------------------------------------------------------
/src/main/java/ims/cs/bbn/BbnNeHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.bbn;
19 |
20 | import org.xml.sax.Attributes;
21 | import org.xml.sax.SAXException;
22 | import org.xml.sax.helpers.DefaultHandler;
23 |
24 | import java.util.ArrayList;
25 | import java.util.HashMap;
26 | import java.util.List;
27 | import java.util.Map;
28 |
/**
 * SAX handler that turns the inline named entity markup of the BBN dataset into a flat list of
 * per-word tags, one list per document.
 *
 * <p>Text between elements is accumulated and split into whitespace-separated words. Words outside
 * of an entity element receive the tag {@code "O"}; words inside an element whose name ends in
 * {@code "EX"} receive the value of that element's {@code TYPE} attribute. When a {@code DOC}
 * element closes, the collected tags are stored under the document's file number.
 *
 * <p>The handler keeps mutable parsing state in its fields; nothing in this class synchronizes
 * access to them.
 */
public class BbnNeHandler extends DefaultHandler {

    /** Tag given to every word that is not covered by a named entity element. */
    private static final String OUTSIDE_TAG = "O";

    /** Accumulates character data that has been read but not yet converted into tags. */
    StringBuffer accumulator = new StringBuffer();
    /** Tags collected so far for the document currently being parsed. */
    List<String> tags;
    /** Finished tag lists, keyed by the file number derived from the DOCNO element. */
    Map<String, List<String>> tagMap = new HashMap<>();
    /** TYPE attribute of the most recently opened entity element. */
    String currentTag;
    /** File number of the current document (the trimmed DOCNO text without its first 5 chars). */
    String fileNo;
    /** True if the last element that closed within the current document was an entity element. */
    boolean tagPreceded = false;
    /** When set, the next closing entity element is ignored (see {@link #startElement}). */
    boolean disableNextTag = false;
    private String docNo;


    /**
     * Appends the reported character data to the accumulator.
     */
    @Override
    public void characters(char[] buffer, int start, int length) {
        accumulator.append(buffer, start, length);
    }


    @Override
    public void startDocument() throws SAXException {
    }

    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Returns all text accumulated since the last call and clears the accumulator.
     * @return the pending, not yet processed text
     */
    public String popText() {
        String text = accumulator.toString();
        accumulator.setLength(0);
        return text;
    }

    /**
     * Counts runs of whitespace; consecutive whitespace characters count only once.
     * @param s text to inspect
     * @return number of whitespace runs in {@code s}
     */
    public int numSpaces(String s) {
        int numSpaces = 0;
        boolean prevIsWhitespace = false;

        for (int i = 0; i < s.length(); i++) {
            boolean isWhitespace = Character.isWhitespace(s.charAt(i));
            if (isWhitespace && !prevIsWhitespace) {
                numSpaces++;
            }
            prevIsWhitespace = isWhitespace;
        }

        return numSpaces;
    }

    /**
     * Counts words as "whitespace runs plus one"; the empty string has zero words.
     * Callers in this class pass trimmed text, so leading/trailing whitespace does not
     * inflate the count.
     * @param s text to inspect
     * @return number of words in {@code s}
     */
    public int numWords(String s) {
        if (s.equals("")) {
            return 0;
        }
        return numSpaces(s) + 1;
    }

    /**
     * Handles an opening element.
     * <ul>
     *   <li>{@code DOC}: clears the accumulator and starts a fresh tag list.</li>
     *   <li>names ending in {@code EX}: converts the text preceding the entity into outside tags
     *       and remembers the entity's {@code TYPE}.</li>
     *   <li>{@code DOCNO}: clears the accumulator.</li>
     * </ul>
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {

        if (qName.equals("DOC")) { /* document starts, reset accumulator */
            accumulator.setLength(0);
            tags = new ArrayList<>();
        } else if (qName.endsWith("EX")) { /* NE tag starts */
            String text = popText();
            String trimText = text.trim();

            // count words to align with the tokenized text
            int numWords = numWords(trimText);

            // adjust word counters in case of mid-word tags:
            // no whitespace right after a preceding entity -> one word fewer is counted here
            if (tagPreceded && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))) {
                numWords--;
            }
            // no whitespace right before this entity -> one word fewer is counted here
            if (text.length() == 0 || !Character.isWhitespace(text.charAt(text.length() - 1))) {
                numWords--;
            }

            // non-empty text that ends up with a negative count: skip the upcoming closing tag,
            // which leaves the entity's text in the accumulator for the following text segment
            if (trimText.length() > 0 && numWords < 0) {
                disableNextTag = true;
            }

            // pad with outside tags
            addTags(OUTSIDE_TAG, numWords);
            currentTag = atts.getValue("TYPE");
        } else if (qName.equals("DOCNO")) { /* new document, reset accumulator (to be sure) */
            accumulator.setLength(0);
        }
    }

    /**
     * Handles a closing element.
     * <ul>
     *   <li>{@code DOC}: tags the trailing text as outside and stores the tag list under
     *       {@link #fileNo}.</li>
     *   <li>names ending in {@code EX}: tags every word of the entity text with the current
     *       entity type, unless this closing tag was disabled.</li>
     *   <li>{@code DOCNO}: derives the file number from the element text.</li>
     * </ul>
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (qName.equals("DOC")) { /* document ends */
            String text = popText();
            String trimText = text.trim();
            int numWords = numWords(trimText);

            // adjust word counters in case of mid-word tags
            if (tagPreceded && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))) {
                numWords--;
            }

            // pad with outside tags
            addTags(OUTSIDE_TAG, numWords);

            // store annotation
            tagMap.put(fileNo, tags);
            tagPreceded = false;
        } else if (qName.endsWith("EX")) { /* NE tag ends */
            if (disableNextTag) {
                // skip this entity; its text is intentionally left in the accumulator
                disableNextTag = false;
                return;
            }

            String text = popText();
            String trimText = text.trim();
            addTags(currentTag, numWords(trimText));
            tagPreceded = true;
        } else if (qName.equals("DOCNO")) { /* document number ends, parse document number */
            docNo = popText();
            // drops the first five characters of the trimmed DOCNO text
            // NOTE(review): presumably a corpus prefix plus section digits -- confirm against the data
            fileNo = docNo.trim().substring(5);
            tagPreceded = false;
        }
    }

    /**
     * Appends {@code tag} to the current tag list {@code count} times; does nothing when
     * {@code count} is zero or negative.
     */
    private void addTags(String tag, int count) {
        for (int i = 0; i < count; i++) {
            tags.add(tag);
        }
    }

    /**
     * Returns the NE annotations stored for a given file ID.
     * @param fileId file number as derived from the DOCNO element
     * @return the stored tag list, or {@code null} if nothing was stored under {@code fileId}
     */
    public List<String> getTags(String fileId) {
        return tagMap.get(fileId);
    }


}
179 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/bbn/BbnNeParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 | package ims.cs.bbn;
18 |
19 | import ims.cs.lingdata.Document;
20 | import ims.cs.lingdata.DocumentId;
21 | import ims.cs.lingdata.Token;
22 | import ims.cs.util.StaticConfig;
23 | import org.xml.sax.InputSource;
24 | import org.xml.sax.SAXException;
25 | import org.xml.sax.XMLReader;
26 |
27 | import javax.xml.parsers.ParserConfigurationException;
28 | import javax.xml.parsers.SAXParser;
29 | import javax.xml.parsers.SAXParserFactory;
30 | import java.io.File;
31 | import java.io.IOException;
32 | import java.util.List;
33 |
34 | /**
35 | * XML parser for BBN named entity dataset
36 | */
37 | public class BbnNeParser {
38 |
39 |
40 | private static BbnNeParser instance;
41 | private static SAXParser saxParser;
42 | private static XMLReader xmlReader;
43 | private static BbnNeHandler handler;
44 |
45 | public String currentBbnFile;
46 |
47 |
48 | private BbnNeParser() throws ParserConfigurationException, SAXException {
49 | SAXParserFactory spf = SAXParserFactory.newInstance();
50 | saxParser = spf.newSAXParser();
51 | xmlReader = saxParser.getXMLReader();
52 | handler = new BbnNeHandler();
53 | xmlReader.setContentHandler(handler);
54 | }
55 |
56 | /**
57 | * BBN splits each section in up to 4 files. This function determines which one contains the document in question.
58 | * @param document
59 | * @return
60 | */
61 | public String getBbnFileName(Document document) {
62 | DocumentId id = document.docId;
63 | String sectionStr = id.getSectionStr();
64 | String fileStr = id.getFileStr();
65 | int num = Integer.parseInt(fileStr);
66 | char partitionChar;
67 |
68 | // BBN partition rule
69 | if (num < 25) {
70 | partitionChar = 'a';
71 | } else if (num < 50) {
72 | partitionChar = 'b';
73 | } else if (num < 75) {
74 | partitionChar = 'c';
75 | } else {
76 | partitionChar = 'd';
77 | }
78 |
79 | String fileName = "wsj" + sectionStr + partitionChar + ".qa";
80 |
81 | return fileName;
82 | }
83 |
84 |
85 | /**
86 | * Takes a previously loaded WSJ document and adds BBN named entities.
87 | * This function does some rudimentary caching, which requires the WSJ documents to be parsed in order to stay fast.
88 | * @param document
89 | * @return
90 | * @throws IOException
91 | * @throws SAXException
92 | */
93 | public Document augmentDocumentXml(Document document) throws IOException, SAXException {
94 | String fileName = getBbnFileName(document);
95 |
96 | // move to the next BBN file if necessary
97 | // this will be efficient if the documents are passed in WSJ order as it avoids reloading the same file
98 | if (!fileName.equals(currentBbnFile)) {
99 | File xmlFile = new File(StaticConfig.bbnPath + fileName);
100 | xmlReader.parse(new InputSource(xmlFile.getPath()));
101 | currentBbnFile = fileName;
102 | }
103 |
104 | List tags = handler.getTags(document.docId.getFileStr());
105 | List tokenList = document.tokenList;
106 |
107 | // sanity check: same number of tokens?
108 | if (tags.size() != tokenList.size()) {
109 | throw new Error("Tag and token counts differ");
110 | }
111 |
112 | // align tags and tokens
113 | for (int i = 0; i < tokenList.size(); i++) {
114 | Token token = tokenList.get(i);
115 | String neTag = tags.get(i);
116 | token.goldNer = neTag;
117 | }
118 |
119 | return document;
120 | }
121 |
122 |
123 | public static BbnNeParser getInstance() throws ParserConfigurationException, SAXException {
124 | if (instance == null) {
125 | instance = new BbnNeParser();
126 | }
127 | return instance;
128 | }
129 |
130 | }
131 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/corenlp/DocumentAligner.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.corenlp;
19 |
20 | import java.util.ArrayList;
21 | import java.util.Iterator;
22 | import java.util.List;
23 |
24 | import ims.cs.lingdata.Document;
25 | import ims.cs.lingdata.Sentence;
26 | import ims.cs.lingdata.Token;
27 | import edu.stanford.nlp.ling.CoreAnnotations;
28 | import edu.stanford.nlp.ling.CoreLabel;
29 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
30 | import edu.stanford.nlp.util.CoreMap;
31 | import ims.cs.util.StaticConfig;
32 |
33 | /**
34 | * Aligns CoreNLP parser output with the original document. This is necessary since CoreNLP may produce a
35 | * tokenization that deviates from the input.
36 | */
37 | public class DocumentAligner {
38 |
39 | private Document pDocument;
40 | private List pcSentenceList;
41 | private boolean useCoreNlpQuoteCompletion = true;
42 |
43 |
44 | public DocumentAligner(Document pDocument, CoreMap cDocument) {
45 | this.pDocument = pDocument;
46 | alignSentences(pDocument, cDocument);
47 | }
48 |
49 | /**
50 | * Aligns original document and CoreNLP processed document.
51 | * @param pDocument
52 | * @param cDocument
53 | */
54 | private void alignSentences(Document pDocument, CoreMap cDocument) {
55 |
56 | // get sentences
57 | List cSentenceList = cDocument.get(SentencesAnnotation.class);
58 |
59 | // state variables
60 | pcSentenceList = new ArrayList<>();
61 | Iterator cSentenceIter = cSentenceList.iterator();
62 | Iterator pTokenIter = pDocument.tokenList.iterator();
63 | Token nextPToken = pTokenIter.next();
64 |
65 | // now iterate over CoreNLP sentences
66 | while (cSentenceIter.hasNext()) {
67 | // get sentence tokens
68 | CoreMap cSentence = cSentenceIter.next();
69 | List cTokens = cSentence.get(CoreAnnotations.TokensAnnotation.class);
70 | List currentSentencePTokens = new ArrayList<>(cTokens.size());
71 |
72 |
73 | // identify last token
74 | CoreLabel finalToken = cTokens.get(cTokens.size()-1);
75 | int endPosition = finalToken.endPosition();
76 |
77 | // align tokens by byte count until the end of the sentence
78 | while (nextPToken.goldByteCount.getBegin() <= endPosition) {
79 | currentSentencePTokens.add(nextPToken);
80 | if (nextPToken.goldByteCount.getEnd() <= endPosition) {
81 | if (pTokenIter.hasNext()) {
82 | nextPToken = pTokenIter.next();
83 | } else {
84 | break;
85 | }
86 | } else {
87 | break;
88 | }
89 | }
90 |
91 |
92 | // check if any tokens need to be aligned at all
93 | if (currentSentencePTokens.size() > 0) {
94 | TokenAligner ta = new TokenAligner(currentSentencePTokens, cSentence);
95 | ta.setUseCoreNlpQuoteCompletion(useCoreNlpQuoteCompletion);
96 | Sentence combinedSentence = ta.getCombinedSentence();
97 |
98 | if (combinedSentence == null) {
99 | if (StaticConfig.verbose)
100 | System.out.println("Discarding empty combined sentence: " +
101 | cSentence.toString() + currentSentencePTokens.toString());
102 | } else {
103 | pcSentenceList.add(combinedSentence);
104 | }
105 | } else { /* sentence may be empty if CoreNLP produced spurious tokens */
106 | if (StaticConfig.verbose)
107 | System.out.println("Discarding empty PARC sentence: " +
108 | cSentence.toString() + currentSentencePTokens.toString());
109 | }
110 |
111 | }
112 |
113 | }
114 |
115 | /**
116 | * Returns the aligned document
117 | * @return
118 | */
119 | public Document getDocument() {
120 | Document combinedDocument = new Document(pDocument);
121 |
122 | combinedDocument.sentenceList = pcSentenceList;
123 |
124 | List documentTokenList = new ArrayList(pcSentenceList.size() * 5);
125 |
126 | for (Sentence sentence: pcSentenceList) {
127 | sentence.document = combinedDocument;
128 | documentTokenList.addAll(sentence.tokenList);
129 | }
130 |
131 | combinedDocument.tokenList = documentTokenList;
132 |
133 | // set token positions in the new document
134 | for (int i = 0; i < combinedDocument.tokenList.size(); i++) {
135 | combinedDocument.tokenList.get(i).predPosition = i;
136 | }
137 |
138 | return combinedDocument;
139 |
140 | }
141 |
142 | }
143 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/corenlp/IndexedWordIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.corenlp;
19 |
20 | import java.util.Iterator;
21 |
22 | import edu.stanford.nlp.ling.IndexedWord;
23 |
24 | /**
25 | * Iterates over all indexed words safely -- this is useful as punctuation may not have an associated indexed word
26 | */
27 | public class IndexedWordIterator implements Iterator {
28 |
29 | Iterator iter;
30 | IndexedWord currentWord;
31 | int index = 1;
32 |
33 | private void fetch() {
34 | if (iter.hasNext()) {
35 | currentWord = iter.next();
36 | } else {
37 | currentWord = null;
38 | }
39 | }
40 |
41 | public IndexedWordIterator(Iterator iter) {
42 | this.iter = iter;
43 | fetch();
44 | }
45 |
46 | public boolean hasNext() {
47 | return true;
48 | }
49 |
50 | public IndexedWord next() {
51 | IndexedWord returnVal;
52 |
53 | if (currentWord == null) {
54 | returnVal = null;
55 | } else if (currentWord.index() == index) {
56 | returnVal = currentWord;
57 | fetch();
58 | } else {
59 | returnVal = null;
60 | }
61 |
62 | index++;
63 | return returnVal;
64 | }
65 |
66 | public void remove() {
67 | throw new UnsupportedOperationException("no remove allowed");
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/ByteCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | /**
21 | * Byte offset information
22 | */
23 | public class ByteCount {
24 | public int begin;
25 | public int end;
26 |
27 | public ByteCount (int begin, int end) {
28 | this.begin = begin;
29 | this.end = end;
30 | }
31 |
32 | public ByteCount(String value) {
33 | String[] tokens = value.split(",");
34 | begin = Integer.parseInt(tokens[0]);
35 | end = Integer.parseInt(tokens[1]);
36 | }
37 |
38 | public int getBegin() {
39 | return begin;
40 | }
41 |
42 |
43 | public int getEnd() {
44 | return end;
45 | }
46 |
47 | @Override
48 | public String toString() {
49 | return "" + begin + "," + end;
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Corpus.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.util.List;
21 | import java.util.Map;
22 |
23 | import ims.cs.lingdata.Types.PartitionName;
24 |
25 | /**
26 | * Abstract Corpus class.
27 | * A corpus has training, dev, and test partitions as well as a document list
28 | */
29 | public abstract class Corpus {
30 |
31 | List docList;
32 | private Map partitionMap;
33 |
34 | public abstract Partition getTrain();
35 | public abstract Partition getDev();
36 | public abstract Partition getTest();
37 |
38 | public List getDocumentList() {
39 | return docList;
40 | }
41 |
42 | public void setDocumentList(List docList) {
43 | this.docList = docList;
44 | }
45 | public Map getPartitionMap() {
46 | return partitionMap;
47 | }
48 | public void setPartitionMap(Map partitionMap) {
49 | this.partitionMap = partitionMap;
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Document.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.util.HashSet;
21 | import java.util.List;
22 | import java.util.Set;
23 |
24 | import ims.cs.lingdata.Types.Genre;
25 | import ims.cs.qsample.spans.Span;
26 |
27 | /**
28 | * Representation of a document.
29 | * Has a list of sentences and a list of tokens; holds span predictions.
30 | */
31 | public class Document {
32 |
33 | public List sentenceList;
34 | public List tokenList;
35 | public DocumentId docId;
36 | public Genre genre;
37 | public String text;
38 | public String sourceCorpusName;
39 |
40 | // span predictions
41 | public Set predictedSpanSet;
42 | public Set goldSpanSet;
43 |
44 |
45 | // CoreNLP flag to avoid multiple processing
46 | public boolean isCoreNlpProcessed;
47 |
48 | public Document(Document pDocument) {
49 | this.docId = pDocument.docId;
50 | this.genre = pDocument.genre;
51 | this.text = pDocument.text;
52 | this.sourceCorpusName = pDocument.sourceCorpusName;
53 |
54 | this.predictedSpanSet = new HashSet();
55 | this.goldSpanSet = new HashSet();
56 | }
57 |
58 |
59 | public Document() { }
60 |
61 |
62 | public List getTokenList() {
63 | return tokenList;
64 | }
65 |
66 | public Set goldSpansOfLabel(String label) {
67 | Set selectedGoldSpans = new HashSet<>();
68 | for (Span gs : goldSpanSet) {
69 | if (gs.label.equals(label)) {
70 | selectedGoldSpans.add(gs);
71 | }
72 | }
73 | return selectedGoldSpans;
74 | }
75 |
76 | public Set predictedSpansOfLabel(String label) {
77 | Set predGoldSpans = new HashSet<>();
78 | for (Span ps : predictedSpanSet) {
79 | if (ps.label.equals(label)) {
80 | predGoldSpans.add(ps);
81 | }
82 | }
83 | return predGoldSpans;
84 | }
85 |
86 | public Token getPrevToken(Token t) {
87 | return getPrevToken(t, 1);
88 | }
89 |
90 | public Token getNextToken(Token t) {
91 | return getNextToken(t, 1);
92 | }
93 |
94 | public Token getPrevToken(Token t, int dist) {
95 | if (t.predPosition - dist >= 0) return tokenList.get(t.predPosition-dist);
96 | else return null;
97 | }
98 |
99 | public Token getNextToken(Token t, int dist) {
100 | if (t.predPosition < tokenList.size()-dist) return tokenList.get(t.predPosition+dist);
101 | else return null;
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/DocumentId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
/**
 * Interface to represent a document ID.
 * In our world, all documents have WSJ behavior, so they are part of a section and have a file number.
 * Both parts of the ID are exposed in string form.
 */
public interface DocumentId {
/** @return the section part of the ID as a string */
String getSectionStr();
/** @return the file part of the ID as a string */
String getFileStr();
}
28 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/GornAddressList.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
/**
 * Placeholder class for Gorn addresses, which turned out not to be needed.
 * It declares no members and remains only for compatibility.
 */
public class GornAddressList {
}
25 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Partition.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.util.List;
21 | import java.util.Map;
22 |
/**
 * A partition is a list of documents which may belong to different sections.
 * Both fields are public and can be assigned directly; the no-argument constructor leaves them unset.
 */
public class Partition {

// the documents of this partition
public List docList;
// NOTE(review): the type arguments of this declaration look truncated ("Map>"); presumably it maps
// a section to the documents of that section -- restore the original declaration and confirm
public Map> sectionMap;


/** Creates a partition without a document list. */
public Partition() {}

/**
 * Creates a partition over the given documents.
 * @param docList the document list to store (kept by reference, not copied)
 */
public Partition(List docList) {
this.docList = docList;
}


/** @return the document list of this partition */
public List getDocumentList() {
return docList;
}

/** @return the number of documents; fails with a NullPointerException if no document list is set */
public int size() {
return docList.size();
}


}
49 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/PlainTextCorpus.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.util.List;
21 |
22 | /**
23 | * A corpus to hold documents read from plain text files.
24 | * Has only one partition and consists only of test data.
25 | */
26 | public class PlainTextCorpus extends Corpus {
27 |
28 | Partition partition;
29 |
30 | public PlainTextCorpus(List documentList) {
31 | setDocumentList(documentList);
32 | partition = new Partition();
33 | partition.docList = documentList;
34 | }
35 |
36 | @Override
37 | public Partition getTrain() {
38 | return null;
39 | }
40 |
41 | @Override
42 | public Partition getDev() {
43 | return null;
44 | }
45 |
46 | @Override
47 | public Partition getTest() {
48 | return partition;
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/PlainTextDocId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 |
21 | /**
22 | * A document ID for plain text documents.
23 | * Since we require a WSJ-like directory structure, we can generate IDs from that.
24 | */
25 | public class PlainTextDocId implements DocumentId {
26 |
27 | String sectionStr;
28 | String fileStr;
29 |
30 | public PlainTextDocId (String section, String file) {
31 | sectionStr = section;
32 | fileStr = file;
33 | }
34 |
35 | @Override
36 | public String getSectionStr() {
37 | return sectionStr;
38 | }
39 |
40 | @Override
41 | public String getFileStr() {
42 | return fileStr;
43 | }
44 |
45 | @Override
46 | public String toString() {
47 | return sectionStr + "," + fileStr;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Sentence.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.util.HashMap;
21 | import java.util.List;
22 | import java.util.Map;
23 |
24 | import edu.stanford.nlp.ling.IndexedWord;
25 | import edu.stanford.nlp.semgraph.SemanticGraph;
26 | import edu.stanford.nlp.trees.Tree;
27 | import org.jgrapht.alg.FloydWarshallShortestPaths;
28 |
29 | /**
30 | * Representation of a sentence.
31 | * Is part of a document; contains a list of tokens; may have a constituency and a dependency tree.
32 | */
33 | public class Sentence {
34 |
35 | public List tokenList;
36 | public GornAddressList gorn;
37 | public SentenceId sentenceId;
38 | public int positionInDocument;
39 | public Document document;
40 |
41 | // CoreLabel backwards lookup
42 | public Map indexedWordLookup;
43 | public HashMap treeLookup;
44 |
45 | // CoreNLP output
46 | public Tree tree;
47 | public SemanticGraph dependencyGraph;
48 | public FloydWarshallShortestPaths fw;
49 |
50 |
51 | public Sentence () {}
52 | public Sentence (Document d) {
53 | document = d;
54 | }
55 |
56 | public List getTokenList() {
57 | return tokenList;
58 | }
59 |
60 | public Token first() { return tokenList.get(0); }
61 | public Token last() { return tokenList.get(tokenList.size()-1); }
62 |
63 | @Override
64 | public String toString() {
65 | return tokenList.toString();
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/SentenceId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | /**
21 | * Holds an ID for a sentence.
22 | * Can be calculated from the document's ID together with the Gorn address of the sentence.
23 | */
24 | public class SentenceId {
25 |
26 | private WSJId wsjId;
27 | private GornAddressList gorn;
28 |
29 | public SentenceId(WSJId wsdId, GornAddressList gorn) {
30 | this.gorn = gorn;
31 | this.wsjId = wsdId;
32 | }
33 |
34 | public WSJId getWsjId () {
35 | return wsjId;
36 | }
37 |
38 | public GornAddressList getGorn() {
39 | return gorn;
40 | }
41 |
42 | @Override
43 | public String toString() {
44 | return "" + wsjId + ":" + gorn;
45 | }
46 |
47 | @Override
48 | public boolean equals(Object obj) {
49 | if (obj instanceof SentenceId) {
50 | SentenceId objId = (SentenceId) obj;
51 | return this.wsjId.equals(objId.wsjId) && this.gorn.equals(objId.gorn);
52 | //FIXME: maybe the gorn thing doesn't work
53 | } else {
54 | return false;
55 | }
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Types.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
/**
 * Container for the enumeration types shared across the corpus data classes.
 */
public abstract class Types {

    /** Names of the corpus partitions. */
    public enum PartitionName {
        TRAIN,
        DEV,
        TEST
    }

    /** Document genres. */
    public enum Genre {
        FICTION,
        NEWS,
        BIOGRAPHY
    }
}
24 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/WSJId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.lingdata;
19 |
20 | import java.io.Serializable;
21 |
22 | /**
23 | * Document ID as used in the Wall Street Journal corpus.
24 | * Each document is part of a section and is stored in a file, each of which have an integral ID.
25 | */
26 | public class WSJId implements Serializable, DocumentId {
27 |
28 | private static final long serialVersionUID = 4443044961863001270L;
29 |
30 | private Integer section;
31 | private Integer file;
32 |
33 |
34 | public WSJId (Integer section) {
35 | this(section, null);
36 | }
37 |
38 | public WSJId (String section, String file) {
39 | this(Integer.parseInt(section), Integer.parseInt(file));
40 | }
41 |
42 | public WSJId (String section) {
43 | this(Integer.parseInt(section));
44 | }
45 |
46 | public WSJId (Integer section, Integer file) {
47 | this.section = section;
48 | this.file = file;
49 | }
50 |
51 | public int getSectionInt() {
52 | return section;
53 | }
54 |
55 | public int getFileInt() {
56 | return file;
57 | }
58 |
59 | private static String addOffset(int i) {
60 | if (i < 10) {
61 | return "0" + i;
62 | } else {
63 | return "" + i;
64 | }
65 | }
66 |
67 | public String getSectionStr() {
68 | return addOffset(section);
69 | }
70 |
71 | public String getFileStr() {
72 | return addOffset(file);
73 | }
74 |
75 | @Override
76 | public boolean equals(Object other) {
77 | if (other instanceof WSJId) {
78 | WSJId otherId = (WSJId) other;
79 | return (this.section == otherId.section) && (this.file == otherId.file);
80 | } else {
81 | return false;
82 | }
83 | }
84 |
85 | public boolean sectionEquals(WSJId other) {
86 | return this.section == other.section;
87 | }
88 |
89 | @Override
90 | public String toString() {
91 | return getSectionStr() + getFileStr();
92 | }
93 | }
94 |
95 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/mallet/DocumentFeatureSet2TokenSequence.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.mallet;
19 |
20 | import java.util.List;
21 |
22 | import cc.mallet.pipe.Pipe;
23 | import cc.mallet.types.Instance;
24 | import cc.mallet.types.Token;
25 | import cc.mallet.types.TokenSequence;
26 | import ims.cs.qsample.features.FeatureSet;
27 |
28 | /**
29 | * Pipe to convert our internal feature set to mallet token feature entries
30 | * @author scheibcn
31 | */
32 | public class DocumentFeatureSet2TokenSequence extends Pipe {
33 |
34 | private static final long serialVersionUID = 3218174517742238232L;
35 |
36 | @Override
37 | public Instance pipe(Instance inst) {
38 |
39 | // ensure that the instance is of the right type
40 | if (!(inst instanceof PARCDocumentInstance)) {
41 | throw new UnsupportedOperationException("Expected CoreMap, got " + inst.getClass());
42 | }
43 |
44 |
45 | List tokenList = ((PARCDocumentInstance) inst).document.getTokenList();
46 | TokenSequence ts = new TokenSequence();
47 |
48 | // iterate over tokens and convert their internal feature sets into Mallet feature sets
49 | for (ims.cs.lingdata.Token cToken : tokenList) {
50 | FeatureSet fs = cToken.boundaryFeatureSet;
51 | Token mToken = new Token(cToken.predText);
52 |
53 | // copy each feature
54 | for (Object entry : fs) {
55 | mToken.setFeatureValue(entry.toString(), 1);
56 | }
57 |
58 | ts.add(mToken);
59 | }
60 |
61 | inst.setData(ts);
62 |
63 | return inst;
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/mallet/PARCDocumentInstance.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.mallet;
19 |
20 |
21 | import cc.mallet.types.Instance;
22 | import ims.cs.lingdata.Document;
23 |
24 | /**
25 | * Mallet "Instance" wrapper class for documents
26 | */
27 | public class PARCDocumentInstance extends Instance {
28 |
29 | private static final long serialVersionUID = -6933321582801583924L;
30 |
31 | public transient Document document;
32 |
33 | private PARCDocumentInstance() {
34 | super(null, null, null, null);
35 | };
36 |
37 | public PARCDocumentInstance(Document document) {
38 | super(document, null, document.docId, document);
39 | this.document = document;
40 | }
41 |
42 |
43 | public Document getDocument() {
44 | return document;
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/PARCAttribution.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.parc;
19 |
/**
 * A single attribution: a role paired with an identifier.
 */
public class PARCAttribution {

    /** Attribution roles as annotated in the PARC corpus. */
    public enum Role {
        SOURCE,
        CONTENT,
        CUE,
        SUPPLEMENT
    }

    /** Attribution types by Pareti et al. */
    public enum Type {
        DIRECT,
        INDIRECT,
        MIXED
    }

    // role of this attribution
    public Role role;
    // identifier of this attribution
    public String id;
}
34 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/ParcUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.parc;
19 |
20 | import edu.stanford.nlp.ling.IndexedWord;
21 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
22 | import edu.stanford.nlp.trees.GrammaticalRelation;
23 | import ims.cs.corenlp.Helper;
24 | import ims.cs.lingdata.ByteCount;
25 | import ims.cs.lingdata.Document;
26 | import ims.cs.lingdata.Sentence;
27 | import ims.cs.lingdata.Token;
28 | import edu.stanford.nlp.trees.Tree;
29 | import org.jgrapht.alg.FloydWarshallShortestPaths;
30 | import org.jgrapht.graph.SimpleDirectedGraph;
31 |
32 | import java.util.Iterator;
33 | import java.util.List;
34 |
35 | /**
36 | * Collection of Utility functions
37 | */
38 | public abstract class ParcUtils {
39 |
40 | /**
41 | * Find all head verbs in the corpus. The algorithm is taken from Pareti (2015).
42 | * @param sentence
43 | */
44 | public static void markHeadVerbs (Sentence sentence) {
45 |
46 | for (Tree tree : sentence.tree.preOrderNodeList()) {
47 | if (tree.label().value().equals("VP")) {
48 | boolean valid = true;
49 | for (Tree child : tree.children()) {
50 | if (child.label().value().equals("VP")) {
51 | valid = false;
52 | break;
53 | }
54 | }
55 |
56 | if (valid) {
57 | for (Tree child : tree.children()) {
58 | if (child.firstChild().isLeaf() && child.label().value().startsWith("V")) {
59 | Token token = sentence.treeLookup.get(child.firstChild());
60 | if (token != null)
61 | token.isHeadVerb = true;
62 | }
63 | }
64 | }
65 | }
66 | }
67 | }
68 |
69 | /**
70 | * Annotates paragraph-continuing quotation marks. doParagraphAnnotation() needs to be called before this.
71 | * @param document
72 | */
73 | public static void markParagraphQuotes(Document document) {
74 | int quoteIndex = 1;
75 |
76 | for (Token token: document.tokenList) {
77 | if (Helper.isQuote(token)) {
78 | // ignore even quotes at paragraph begins
79 | if (token.paragraphBegins && quoteIndex % 2 == 0)
80 | token.ignoreQuote = true;
81 | else
82 | quoteIndex++;
83 | }
84 | }
85 | }
86 |
87 | /**
88 | * Annotates for each token whether it starts a paragraph by its raw text
89 | * @param document
90 | */
91 | public static void doParagraphAnnotation (Document document) {
92 | String documentText = document.text;
93 | Iterator tokenIter = document.tokenList.iterator();
94 |
95 | if (!tokenIter.hasNext()) {
96 | System.err.println("Skipping paragraph annotation empty document: " + document.docId);
97 | return;
98 | }
99 |
100 | Token token = tokenIter.next();
101 | ByteCount bc = token.goldByteCount;
102 |
103 | // iterate over all character positions in the text
104 | char prevC = 0;
105 |
106 | for (int i = 0; i < documentText.length(); i++) {
107 | if (i > bc.getEnd()) {
108 | if (!tokenIter.hasNext()) break; /* reached the last token */
109 |
110 | token = tokenIter.next();
111 | bc = token.goldByteCount;
112 | }
113 |
114 | char c = documentText.charAt(i);
115 |
116 | // two consecutive newlines indicate a paragraph
117 | if (prevC == '\n' && c == '\n') {
118 | token.paragraphBegins = true;
119 | }
120 |
121 | prevC = c;
122 | }
123 | }
124 |
125 | /**
126 | * Anonymizes certain named entities in the text
127 | * @param document
128 | */
129 | public static void anonymizeNamedEntities (Document document) {
130 | for (Token token: document.getTokenList()) {
131 | if (token.predNer.startsWith("ORGANIZATION") || token.predNer.startsWith("PERSON")) {
132 | String substText = "[NE]";
133 | token.originalPredText = token.predText;
134 |
135 | token.predLemma = substText;
136 | token.predText = substText;
137 | token.goldLemma = substText;
138 | token.goldText = substText;
139 | }
140 | }
141 | }
142 |
143 | /**
144 | * CoreNLP tries to predict opening and closing quotation marks.
145 | * This method maps the variation back to one symbol.
146 | * @param document
147 | */
148 | public static void sanitizeQuotationMarks (Document document) {
149 | for (Token token : document.getTokenList()) {
150 | // double quotes
151 | if (token.predLemma.equals("``") || token.predLemma.equals("\"") || token.predLemma.equals("''")) {
152 | token.predLemma = "\"";
153 | token.predPosTag = "\"";
154 | token.predText = "\"";
155 | token.goldPosTag = "\"";
156 | token.goldLemma = "\"";
157 | token.goldText = "\"";
158 | }
159 |
160 | // single quotes
161 | if (token.predLemma.equals("`") || token.predLemma.equals("''")) {
162 | token.predLemma = "'";
163 | token.predPosTag = "'";
164 | token.predText = "'";
165 | token.goldLemma = "'";
166 | token.goldPosTag = "'";
167 | token.goldText = "'";
168 | }
169 |
170 | }
171 | }
172 |
173 | /**
174 | * The FW implementation needs distinct objects as edges, which this class accomplishes.
175 | * CoreNLP seems to optimize storage by caching strings, so different edges have identical label strings.
176 | */
177 | public static class IndexedEdge {
178 | public GrammaticalRelation rel;
179 | public int index;
180 |
181 | public IndexedEdge(GrammaticalRelation rel, int index) {
182 | this.rel = rel;
183 | this.index = index;
184 | }
185 | }
186 |
187 | /**
188 | * Compute cached dependency paths using Floyd Warshall
189 | * @param dependencies
190 | * @return
191 | */
192 | public static FloydWarshallShortestPaths computeFloydWarshallSGE(List dependencies) {
193 | SimpleDirectedGraph graph = new SimpleDirectedGraph(IndexedEdge.class);
194 | int edgeId = 0;
195 | for (SemanticGraphEdge dep : dependencies) {
196 | graph.addVertex(dep.getGovernor());
197 | graph.addVertex(dep.getDependent());
198 | graph.addEdge(dep.getGovernor(), dep.getDependent(), new IndexedEdge(dep.getRelation(), edgeId));
199 | }
200 | return new FloydWarshallShortestPaths(graph);
201 | }
202 |
203 | }
204 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/xml/PARCParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.parc.xml;
19 |
20 | import java.io.File;
21 | import java.io.IOException;
22 |
23 | import javax.xml.parsers.ParserConfigurationException;
24 | import javax.xml.parsers.SAXParser;
25 | import javax.xml.parsers.SAXParserFactory;
26 |
27 | import org.xml.sax.InputSource;
28 | import org.xml.sax.SAXException;
29 | import org.xml.sax.XMLReader;
30 |
31 | import ims.cs.lingdata.Document;
32 |
33 | /**
34 | * XML parser for the PARC corpus
35 | */
36 | public class PARCParser {
37 |
38 | private static PARCParser instance;
39 | private static SAXParser saxParser;
40 | private static XMLReader xmlReader;
41 | private static PARCHandler handler;
42 |
43 | private PARCParser () throws ParserConfigurationException, SAXException {
44 | SAXParserFactory spf = SAXParserFactory.newInstance();
45 | saxParser = spf.newSAXParser();
46 | xmlReader = saxParser.getXMLReader();
47 | handler = new PARCHandler();
48 | xmlReader.setContentHandler(handler);
49 | }
50 |
51 | public Document parseFile(File xmlFile) throws IOException, SAXException {
52 | xmlReader.parse(new InputSource(xmlFile.getPath()));
53 | return handler.getDocument();
54 |
55 | }
56 |
57 | public static PARCParser getInstance() throws ParserConfigurationException, SAXException {
58 | if (instance == null) {
59 | instance = new PARCParser();
60 | }
61 | return instance;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/evaluate/EvaluateClassifier.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.evaluate;
19 |
20 | import ims.cs.lingdata.Document;
21 |
22 | import java.util.List;
23 |
24 | /**
25 | * Evaluation functions for single-token classifiers
26 | * Created by scheibcn on 3/2/16.
27 | */
28 | public class EvaluateClassifier {
29 |
30 | /**
31 | * Container class for quotation classifier results, i.e., begin, end, and cue F1
32 | */
33 | public static class ClassifierResults {
34 | F1.Stats beginStats;
35 | F1.Stats endStats;
36 | F1.Stats cueStats;
37 |
38 | public String toString() {
39 | return String.format("Pb=%1.3f Rb=%1.3f Fb=%1.3f Pe=%1.3f Re=%1.3f Fe=%1.3f Pc=%1.3f Rc=%1.3f Fc=%1.3f",
40 | beginStats.precision, beginStats.recall, beginStats.f1,
41 | endStats.precision, endStats.recall, endStats.f1,
42 | cueStats.precision, cueStats.recall, cueStats.f1);
43 |
44 | }
45 | }
46 |
47 | /**
48 | * Evaluate begin, end, and cue classifier output over all tokens in the specified documents
49 | * @param trainDocs
50 | * @return
51 | */
52 | public static ClassifierResults evaluateClassifier (List trainDocs) {
53 | if (trainDocs == null) return null;
54 | ClassifierResults results = new ClassifierResults();
55 |
56 | results.beginStats = F1.evalPerceptron(trainDocs, "begin");
57 | results.endStats = F1.evalPerceptron(trainDocs, "end");
58 | results.cueStats = F1.evalPerceptron(trainDocs, "cue");
59 |
60 | return results;
61 | }
62 |
63 |
64 | /**
65 | * Print begin, end, and cue classifier evaluations over all tokens in the specified training, test, val, and
66 | * resubstitution documents
67 | * @param trainDocs
68 | * @param testDocs
69 | * @param valDocs
70 | * @param resDocs
71 | * @param prefix
72 | */
73 | public static void evaluateAndPrint(List trainDocs, List testDocs, List valDocs, List resDocs, String prefix) {
74 | ClassifierResults trainResults = evaluateClassifier(trainDocs);
75 | ClassifierResults testResults = evaluateClassifier(testDocs);
76 | ClassifierResults valResults = evaluateClassifier(valDocs);
77 | ClassifierResults resResults = evaluateClassifier(resDocs);
78 |
79 | if (trainResults != null) System.out.println(prefix + " TRAIN " + trainResults.toString());
80 | if (testResults != null) System.out.println(prefix + " TEST " + testResults.toString());
81 | if (valResults != null) System.out.println(prefix + " VAL " + valResults.toString());
82 | if (resResults != null) System.out.println(prefix + " RES " + resResults.toString());
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/evaluate/EvaluateSpan.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.evaluate;
19 |
20 | import ims.cs.lingdata.Document;
21 | import ims.cs.parc.PARCAttribution;
22 |
23 | import java.util.List;
24 |
25 | /**
26 | * Evaluation functions for span prediction models.
27 | * Created by scheibcn on 3/2/16.
28 | */
29 | public class EvaluateSpan {
30 |
31 | /**
32 | * Container class for all necessary F1 statistics to do Pareti-style quotation evaluation
33 | */
34 | public static class SpanResults {
35 | public F1.Stats strictCue;
36 | public F1.Stats strictContent;
37 | public F1.Stats partialContent;
38 | public F1.Stats strictContentDirect;
39 | public F1.Stats partialContentDirect;
40 | public F1.Stats strictContentIndirect;
41 | public F1.Stats partialContentIndirect;
42 | public F1.Stats strictContentMixed;
43 | public F1.Stats partialContentMixed;
44 |
45 | public String toString(String sep) {
46 | return strictContent.toString() + sep
47 | + strictContentDirect + sep
48 | + strictContentIndirect + sep
49 | + strictContentMixed + sep
50 | + strictCue + sep + sep
51 | + partialContent + sep
52 | + partialContentDirect + sep
53 | + partialContentIndirect + sep
54 | + partialContentMixed;
55 | }
56 | }
57 |
58 | /**
59 | * SpanResults for training, test, validation, and resubstitution data
60 | */
61 | public static class ResultSet {
62 | public SpanResults trainResults;
63 | public SpanResults testResults;
64 | public SpanResults valResults;
65 | public SpanResults resResults;
66 | }
67 |
68 |
69 | /**
70 | * Evaluate cue and content span models
71 | * @param documentList
72 | * @return
73 | */
74 | public static SpanResults cueContentEvaluation (List documentList) {
75 | SpanResults evaluation = new SpanResults();
76 | evaluation.strictCue = F1.evalSpans(documentList, "cue", false, null);
77 | evaluation.strictContent = F1.evalSpans(documentList, "content", false, null);
78 | evaluation.partialContent = F1.evalSpans(documentList, "content", true, null);
79 | evaluation.strictContentDirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.DIRECT);
80 | evaluation.partialContentDirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.DIRECT);
81 | evaluation.strictContentIndirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.INDIRECT);
82 | evaluation.partialContentIndirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.INDIRECT);
83 | evaluation.strictContentMixed = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.MIXED);
84 | evaluation.partialContentMixed = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.MIXED);
85 |
86 | return evaluation;
87 | }
88 |
89 | /**
90 | * Returns a string where the input s is repeated n times
91 | * @param s
92 | * @param n
93 | * @return
94 | */
95 | private static String generateN(String s, int n) {
96 | StringBuilder sb = new StringBuilder();
97 | for (int i = 0; i < n; i++) {
98 | sb.append(s);
99 | }
100 |
101 | return sb.toString();
102 | }
103 |
104 | private static void printHeader(String sep, int offset) {
105 | System.out.println(generateN("-", offset) + "--------------------------------------------------------------------------------------------------------------------------------------------------------");
106 | System.out.println(generateN(" ", offset) + " exact "+sep+""+sep+" partial");
107 | System.out.println(generateN(" ", offset) + " ALL "+sep+" DIRECT "+sep+" INDIRECT "+sep+" MIXED "+sep+" cue "+sep+""+sep+" ALL "+sep+" DIRECT "+sep+" INDIRECT "+sep+" MIXED ");
108 | System.out.println(generateN(" ", offset) + " P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+""+sep+" P R F "+sep+" P R F "+sep+" P R F "+sep+" P R F ");
109 |
110 | }
111 |
112 | private static void printFooter(int offset) {
113 | System.out.println(generateN("-", offset) + "--------------------------------------------------------------------------------------------------------------------------------------------------------");
114 | }
115 |
116 |
117 | private static void printResults(String prefix, String sep, SpanResults trainingEval, SpanResults testEval, SpanResults valEval, SpanResults resEval) {
118 | printHeader(sep, prefix.length() + 1);
119 |
120 | if (trainingEval != null) System.out.println(prefix + " TRAIN " + trainingEval.toString(sep));
121 | if (testEval != null) System.out.println(prefix + " TEST " + testEval.toString(sep));
122 | if (valEval != null) System.out.println(prefix + " VAL " + valEval.toString(sep));
123 | if (resEval != null) System.out.println(prefix + " RES " + resEval.toString(sep));
124 |
125 | printFooter(prefix.length() + 1);
126 | }
127 |
128 | public static ResultSet evaluateAndPrint(String prefix, String sep, List trainingDocuments, List testDocuments, List valDocuments, List resDocuments) {
129 | ResultSet resultSet = new ResultSet();
130 |
131 | if (trainingDocuments != null) resultSet.trainResults = cueContentEvaluation(trainingDocuments);
132 | if (testDocuments != null) resultSet.testResults = cueContentEvaluation(testDocuments);
133 | if (valDocuments != null) resultSet.valResults = cueContentEvaluation(valDocuments);
134 | if (resDocuments != null) resultSet.resResults = cueContentEvaluation(resDocuments);
135 |
136 | printResults(prefix, sep, resultSet.trainResults, resultSet.testResults,
137 | resultSet.valResults, resultSet.resResults);
138 |
139 | return resultSet;
140 | }
141 |
142 |
143 | }
144 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/Binning.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.util.ArrayList;
21 | import java.util.List;
22 |
/**
 * Turns an integer distance into lists of bin feature strings.
 * Created by scheibcn on 3/4/16.
 */
public class Binning {

    /** Thresholds used by both stacked binnings, in the order the features are emitted. */
    private static final int[] STACK_THRESHOLDS =
            {1, 2, 3, 4, 5, 6, 7, 8, 11, 16, 21, 26, 31, 41, 51, 61, 71, 81, 91, 101};

    /** Inclusive lower bounds of the interval bins. */
    private static final int[] INTERVAL_LOW = {1, 5, 10, 20, 40, 60, 80};

    /** Inclusive upper bounds of the interval bins. */
    private static final int[] INTERVAL_HIGH = {4, 9, 19, 39, 59, 79, 100};

    /**
     * Suffixes of the interval bins. They are emitted verbatim as feature names, so they are
     * kept exactly as they are even where the text does not mirror the bounds (the first bin
     * starts at 1, the last one at 80).
     */
    private static final String[] INTERVAL_SUFFIX = {
            "_in_[0,5)", "_in_[5,10)", "_in_[10,20)", "_in_[20,40)",
            "_in_[40,60)", "_in_[60,80)", "_in_[60,100]"
    };

    /**
     * Stacked "at least" bins: one feature {@code prefix + ">=" + t} for every threshold t
     * (1 to 101) that the distance reaches.
     * @param distance distance to bin
     * @param prefix text put in front of every feature
     * @return the features in ascending threshold order; empty for distances below 1
     */
    public static List<String> distanceBinsStackUp (int distance, String prefix) {
        List<String> bins = new ArrayList<>();
        for (int threshold : STACK_THRESHOLDS) {
            if (distance >= threshold) {
                bins.add(prefix + ">=" + threshold);
            }
        }
        return bins;
    }

    /**
     * Stacked "at most" bins: one feature {@code prefix + "<=" + t} for every threshold t
     * (1 to 101) that the distance does not exceed.
     * @param distance distance to bin
     * @param prefix text put in front of every feature
     * @return the features in ascending threshold order; empty for distances above 101
     */
    public static List<String> distanceBinsStackDown (int distance, String prefix) {
        List<String> bins = new ArrayList<>();
        for (int threshold : STACK_THRESHOLDS) {
            if (distance <= threshold) {
                bins.add(prefix + "<=" + threshold);
            }
        }
        return bins;
    }

    /**
     * Interval bins for distances from 1 to 100; the intervals do not overlap, so at most
     * one feature is produced.
     * @param distance distance to bin
     * @param prefix text put in front of the feature
     * @return a list with the matching interval feature, or an empty list outside 1..100
     */
    public static List<String> distanceBins1to100(int distance, String prefix) {
        List<String> bins = new ArrayList<>();
        for (int i = 0; i < INTERVAL_SUFFIX.length; i++) {
            boolean inside = INTERVAL_LOW[i] <= distance && distance <= INTERVAL_HIGH[i];
            if (inside) {
                bins.add(prefix + INTERVAL_SUFFIX[i]);
            }
        }
        return bins;
    }


    /**
     * All binnings combined: interval bins first, then the "at most" bins, then the
     * "at least" bins.
     * @param distance distance to bin
     * @param prefix text put in front of every feature
     * @return the concatenated feature list
     */
    public static List<String> distanceBinsAll (int distance, String prefix) {
        List<String> combined = new ArrayList<>(distanceBins1to100(distance, prefix));
        combined.addAll(distanceBinsStackDown(distance, prefix));
        combined.addAll(distanceBinsStackUp(distance, prefix));
        return combined;
    }
}
129 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureExtraction.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.io.IOException;
21 |
22 |
23 | import ims.cs.qsample.features.components.SentenceConstituentFeatures;
24 | import ims.cs.qsample.features.components.SentenceDependencyFeatures;
25 | import ims.cs.qsample.features.components.SentenceFeaturesDerivedFromListCue;
26 | import ims.cs.qsample.features.components.SentenceIndicatorFeatures;
27 | import ims.cs.qsample.features.components.TokenDictFeatures;
28 | import ims.cs.qsample.features.components.TokenLexicalFeatures;
29 | import ims.cs.qsample.features.components.TokenListFeatures;
30 | import ims.cs.lingdata.Document;
31 | import ims.cs.lingdata.Sentence;
32 | import ims.cs.lingdata.Token;
33 | import ims.cs.qsample.features.components.DocumentOffsetConjunction;
34 | import ims.cs.qsample.features.components.DocumentQuotationFeature;
35 | import ims.cs.util.StaticConfig;
36 |
37 | /**
38 | * Feature extractor class for (mostly) those features that require non-static code.
39 | */
40 | public class FeatureExtraction {
41 |
42 | private TokenListFeatures tokenPersonFeatures;
43 | private TokenListFeatures tokenOrganizationFeatures;
44 | private TokenListFeatures tokenTitleFeatures;
45 | private TokenListFeatures tokenListFeatures;
46 | private TokenListFeatures tokenNounListFeatures;
47 | private TokenDictFeatures verbNetFeatures;
48 | private DocumentOffsetConjunction documentOffsetConjunction;
49 |
50 |
51 | public FeatureExtraction () throws IOException, ClassNotFoundException {
52 | // non-static extractors
53 | tokenPersonFeatures = new TokenListFeatures("resources/PARC/listfeatures/person.hyponyms.txt", "EK:PER");
54 | tokenOrganizationFeatures = new TokenListFeatures("resources/PARC/listfeatures/organization.hyponyms.txt", "EK:ORG");
55 | tokenTitleFeatures = new TokenListFeatures("resources/PARC/listfeatures/titles.txt", "EK:TITLE");
56 | tokenListFeatures = new TokenListFeatures("resources/PARC/listfeatures/krestel_verbs.txt", "CUELIST");
57 | tokenNounListFeatures = new TokenListFeatures("resources/PARC/listfeatures/attribution_nouns.txt", "NOUNCUELIST");
58 | verbNetFeatures = new TokenDictFeatures("resources/PARC/listfeatures/verbnet.txt", "VERBNET");
59 |
60 | // restrict extractors to certain pos tags
61 | tokenNounListFeatures.posStart = "N";
62 | tokenListFeatures.posStart = "V";
63 | verbNetFeatures.posStart = "V";
64 |
65 | // Offset conjunction on non-static features
66 | documentOffsetConjunction = new DocumentOffsetConjunction();
67 | }
68 |
69 |
70 | /**
71 | * Runs token-level feature extraction on the tokens in the document
72 | * @param document
73 | */
74 | public void extractTokenFeatures(Document document) {
75 | for (Token token : document.tokenList) {
76 | tokenPersonFeatures.extract(token);
77 | tokenOrganizationFeatures.extract(token);
78 | tokenTitleFeatures.extract(token);
79 | TokenLexicalFeatures.extract(token);
80 |
81 | tokenListFeatures.extract(token);
82 | tokenNounListFeatures.extract(token);
83 | verbNetFeatures.extract(token);
84 | }
85 | }
86 |
87 |
88 | /**
89 | * Runs sentence-level feature extraction on the sentences in the document
90 | * @param document
91 | */
92 | public void extractSentenceFeatures (Document document) {
93 | for (Sentence sentence : document.sentenceList) {
94 | SentenceIndicatorFeatures.extract(sentence);
95 | if (StaticConfig.dependencyFeatures) SentenceDependencyFeatures.extract(sentence);
96 | if (StaticConfig.constituentFeatures) SentenceConstituentFeatures.extract(sentence);
97 | SentenceFeaturesDerivedFromListCue.extract(sentence);
98 | }
99 | }
100 |
101 |
102 | public void setUpFeatureSets(Document doc) {
103 | for (Token token : doc.tokenList)
104 | if (token.boundaryFeatureSet == null)
105 | token.boundaryFeatureSet = new FeatureIntSet();
106 | }
107 |
108 |
109 | /**
110 | * Runs feature extraction on a single document
111 | * @param document
112 | */
113 | public void extractAllFeatures (Document document) {
114 | // initialize empty feature sets
115 | setUpFeatureSets(document);
116 |
117 | // Token features & sentence features
118 | extractTokenFeatures(document);
119 | extractSentenceFeatures(document);
120 |
121 | // quotation mark features
122 | if (StaticConfig.documentQuotationFeature)
123 | DocumentQuotationFeature.extract(document);
124 |
125 | // offset conjunction
126 | if (StaticConfig.documentOffsetConjunction)
127 | documentOffsetConjunction.extract(document);
128 |
129 | // additional features
130 | BoundaryFeatures.additionalBoundaryFeatures(document);
131 | }
132 |
133 |
134 | }
135 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureIndexMap.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.util.ArrayList;
21 | import java.util.HashMap;
22 | import java.util.List;
23 | import java.util.Map;
24 |
/**
 * Automatically counting string to int mapping for feature sets.
 * Indices are handed out consecutively, starting at 0, in the order in which unknown
 * feature strings are first requested.
 * Created by scheibcn on 6/1/16.
 */
public class FeatureIndexMap {
    /** Feature string to index. */
    Map<String, Integer> f2i;
    /** Index to feature string; position i holds the feature with index i. */
    List<String> i2f;

    /** Highest index handed out so far, -1 while the map is empty. */
    int maxIndex = -1;

    FeatureIndexMap () {
        f2i = new HashMap<>();
        i2f = new ArrayList<>();
    }

    /**
     * Translate string to index. If the string is unknown, it is assigned a new index.
     * @param feature feature string to look up
     * @return the index of the feature
     */
    public int getIndex(String feature) {
        // stored values are never null, so a single lookup tells known from unknown
        Integer known = f2i.get(feature);
        if (known != null) {
            return known;
        }

        maxIndex++;
        f2i.put(feature, maxIndex);
        i2f.add(feature);
        return maxIndex;
    }

    /**
     * Translate index to string.
     * @param index index previously returned by {@link #getIndex(String)}
     * @return the feature string stored under the index
     * @throws Error if the index is negative or has not been handed out yet
     */
    public String getFeature(int index) {
        // negative indices used to slip through to the list and fail with an
        // IndexOutOfBoundsException; they now take the same path as too-large ones
        if (index < 0 || index > maxIndex) {
            throw new Error("Lookup error");
        }
        return i2f.get(index);
    }
}
69 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureIntSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.util.Collection;
21 | import java.util.HashSet;
22 | import java.util.Iterator;
23 | import java.util.Set;
24 |
25 | /**
26 | * A feature set storing features as integers.
27 | * Created by scheibcn on 6/1/16.
28 | */
29 | public class FeatureIntSet implements FeatureSet {
30 |
31 | // internal mapping from feature strings to integers
32 | static FeatureIndexMap featureIndexMap = new FeatureIndexMap(); // a static map across all feature sets
33 | Set featureIndices = new HashSet<>();
34 |
35 | @Override
36 | public int size() {
37 | return featureIndices.size();
38 | }
39 |
40 | @Override
41 | public boolean isEmpty() {
42 | return featureIndices.isEmpty();
43 | }
44 |
45 |
46 |
47 | @Override
48 | public boolean add(String s) {
49 | int index = featureIndexMap.getIndex(s);
50 | featureIndices.add(index);
51 | return true;
52 | }
53 |
54 | @Override
55 | public Iterator iterator() { return new StringIterator(); }
56 |
57 | @Override
58 | public boolean addAll(Collection extends String> c) {
59 | if (c instanceof FeatureIntSet) {
60 | // just call addAll on the index sets
61 | featureIndices.addAll(((FeatureIntSet) c).featureIndices);
62 | } else if (c instanceof Collection) {
63 | for (String s : c) this.add(s);
64 | } else {
65 | throw new Error("Incompatible types");
66 | }
67 |
68 | return true;
69 | }
70 |
71 |
72 | @Override
73 | public boolean contains(Object o) {
74 | int targetIndex = featureIndexMap.getIndex((String) o);
75 | return featureIndices.contains(targetIndex);
76 | }
77 |
78 | @Override
79 | public void clear() { featureIndices.clear(); }
80 |
81 |
82 | /**
83 | * Iterator that automatically maps the stored indices to strings
84 | */
85 | class StringIterator implements Iterator {
86 |
87 | Iterator featureIndexIter;
88 |
89 | StringIterator () { featureIndexIter = featureIndices.iterator(); }
90 |
91 | @Override
92 | public boolean hasNext() {
93 | return featureIndexIter.hasNext();
94 | }
95 |
96 | @Override
97 | public String next() {
98 | int index = featureIndexIter.next();
99 | return featureIndexMap.getFeature(index);
100 | }
101 |
102 | @Override
103 | public void remove() {
104 | featureIndexIter.remove();
105 | }
106 | }
107 |
108 |
109 |
110 | // NOTE: for compatibility, FeatureSets are collections
111 | // BELOW: interfaces inherited from collection that we do not need to implement
112 |
113 | @Override
114 | public Object[] toArray() { throw new Error("Not implemented"); }
115 |
116 | @Override
117 | public T[] toArray(T[] a) { throw new Error("Not implemented"); }
118 |
119 | @Override
120 | public boolean remove(Object o) { throw new Error("Not implemented"); }
121 |
122 | @Override
123 | public boolean containsAll(Collection> c) { throw new Error("Not implemented"); }
124 |
125 | @Override
126 | public boolean removeAll(Collection> c) { throw new Error("Not implemented"); }
127 |
128 | @Override
129 | public boolean retainAll(Collection> c) { throw new Error("Not implemented"); }
130 |
131 | }
132 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.util.Collection;
21 |
/**
 * Interface for feature sets. For now just a collection of String.
 * Implementations decide how the feature strings are stored internally.
 * Created by scheibcn on 6/1/16.
 */
public interface FeatureSet extends Collection<String> {
}
28 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureStringSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features;
19 |
20 | import java.util.HashSet;
21 | import java.util.Iterator;
22 |
23 | /**
24 | * A feature set that stores features as strings internally. Essentially just a HashSet.
25 | */
26 | public class FeatureStringSet extends HashSet implements FeatureSet {
27 | public FeatureStringSet(FeatureStringSet f) {
28 | super(f);
29 | }
30 |
31 | public FeatureStringSet() {
32 | super();
33 | }
34 |
35 | public FeatureStringSet(int size) {
36 | super(size);
37 | }
38 |
39 | @Override
40 | public boolean add(String e) { return super.add(e); }
41 |
42 | @Override
43 | public Iterator iterator() {
44 | return super.iterator();
45 | }
46 |
47 |
48 | }
49 |
50 |
51 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/DocumentOffsetConjunction.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import java.util.Arrays;
21 | import java.util.LinkedList;
22 | import java.util.List;
23 |
24 | import ims.cs.lingdata.Document;
25 | import ims.cs.lingdata.Token;
26 | import ims.cs.qsample.features.FeatureSet;
27 | import org.apache.commons.lang3.StringUtils;
28 |
29 | /**
30 | * Offset conjunction over a selection of features.
31 | * Idea here: enumerate all possible patterns of feature conjunctions. Then test for each feature set whether it
32 | * contains each of the conjunctions. If so, add the conjunction.
33 | */
34 | public class DocumentOffsetConjunction {
35 |
36 | // features subject to conjunction
37 | private static final String[] features = new String[] {"SENT:QUOT", "SENT:NE", "SENT:PRO", "SENT:HASCUE", "CUE-DEP", "IS-LEFTMOST", "SENT-BEGIN-WIN", "SENT-END-WIN"};
38 |
39 | private List patternList;
40 |
41 |
42 | public DocumentOffsetConjunction() {
43 | patternList = new LinkedList<>();
44 |
45 | // add empty entry to start
46 | patternList.add(new String[] {});
47 |
48 |
49 | for (String s : features) {
50 | List newPatterns = new LinkedList();
51 | for (String[] pattern : patternList) {
52 | String[] concat = append(pattern, s);
53 | newPatterns.add(concat);
54 | }
55 |
56 | patternList.addAll(newPatterns);
57 | }
58 |
59 | // remove the empty entry
60 | patternList.remove(0);
61 | }
62 |
63 | /**
64 | * Add feature conjunctions to all tokens in the document
65 | * @param document
66 | */
67 | public void extract (Document document) {
68 | List tokenList = document.getTokenList();
69 |
70 | for (Token token : tokenList) {
71 | FeatureSet fs = token.boundaryFeatureSet;
72 |
73 | // check for each pattern whether the feature set satisfies it
74 | for (String[] features : patternList) {
75 | boolean matches = true;
76 | for (String feature: features) {
77 | if (!fs.contains(feature)) {
78 | matches = false;
79 | break;
80 | }
81 | }
82 |
83 | // if the pattern is satisfied, add the conjunction
84 | if (matches) {
85 | fs.add("CONJUNCTION:" + StringUtils.join(",", features));
86 | }
87 | }
88 | }
89 | }
90 |
91 |
92 | public static String[] append (String[] a1, String s) {
93 | String[] ret = new String[a1.length + 1];
94 | System.arraycopy(a1, 0, ret, 0, a1.length);
95 | ret[ret.length-1] = s;
96 | return ret;
97 | }
98 |
99 | public void printPatterns() {
100 | for(String[] p : patternList) {
101 | System.out.println(Arrays.toString(p));
102 | }
103 | }
104 |
105 |
106 | }
107 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/DocumentQuotationFeature.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import java.util.List;
21 |
22 | import ims.cs.lingdata.Document;
23 | import ims.cs.lingdata.Token;
24 | import ims.cs.corenlp.Helper;
25 |
26 | /**
27 | * Check for each token whether it is enclosed in quotation marks
28 | */
29 | public abstract class DocumentQuotationFeature {
30 |
31 | public static final String INQ_PREFIX = "DOC:INQ";
32 | public static final String NOTINQ_PREFIX = "DOC:NOTINQ";
33 | public static final String OPEN_PREFIX = "DOC:Q-OPENS";
34 | public static final String CLOSE_PREFIX = "DOC:Q-CLOSES";
35 |
36 | public static void extract(Document document) {
37 | boolean inQuote = false;
38 |
39 | List tokenList = document.getTokenList();
40 |
41 | for (Token token : tokenList) {
42 | // check if token is a quotation mark and is not to be ignored
43 | // (paragraph-initial tokens may be marked to be ignored)
44 | if (Helper.isQuote(token) && !token.ignoreQuote) {
45 |
46 | // add respective feature ...
47 | if (inQuote)
48 | token.boundaryFeatureSet.add(CLOSE_PREFIX);
49 | else
50 | token.boundaryFeatureSet.add(OPEN_PREFIX);
51 |
52 | // toggle in-quote state
53 | inQuote = !inQuote;
54 | token.boundaryFeatureSet.add(INQ_PREFIX);
55 | } else if (inQuote) { /* currently in quote */
56 | token.boundaryFeatureSet.add(INQ_PREFIX);
57 | } else { /* currently not in quote */
58 | token.boundaryFeatureSet.add(NOTINQ_PREFIX);
59 | }
60 | }
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceConstituentFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import java.util.LinkedList;
21 | import java.util.List;
22 |
23 | import ims.cs.lingdata.Sentence;
24 | import ims.cs.lingdata.Token;
25 | import ims.cs.qsample.features.FeatureSet;
26 | import edu.stanford.nlp.trees.Tree;
27 | import ims.cs.util.StaticConfig;
28 |
29 | /**
30 | * Token features derived from the constituency parse of a sentence
31 | */
32 | public abstract class SentenceConstituentFeatures {
33 |
34 | // feature names
35 | private static final String LEVEL_FEATURE = "LVL";
36 | private static final String LEFTMOST_FEATURE = "IS-LEFTMOST";
37 | private static final String GOV_FEATURE = "GOV:";
38 | private static final String AL_FEATURE = "AL:";
39 | private static final String PARENT_FEATURE = "PARENT:";
40 |
41 | public static void extract(Sentence s) {
42 | addTreeFeatures(s, s.tree);
43 | }
44 |
45 | /**
46 | * Class for keeping track of node-level pairs
47 | */
48 | private static class NodeFeatures {
49 | String label;
50 | Integer level;
51 |
52 | public NodeFeatures(String label, int depth) {
53 | this.label = label;
54 | this.level = depth;
55 | }
56 | }
57 |
58 | /**
59 | * Add tree features recursively
60 | * @param s
61 | * @param t
62 | */
63 | private static void addTreeFeatures(Sentence s, Tree t) {
64 | addTreeFeatures(s, t, 0, new LinkedList(), null, true, null);
65 | }
66 |
67 | /**
68 | * Recursion step for tree featues
69 | * @param sentence
70 | * @param t complete tree
71 | * @param level current level
72 | * @param governingLabels list of governing labels
73 | * @param parent information about direct parent
74 | * @param isLeftmost is the node the leftmost one in the constituent specified by ancestorWhereLeftmost
75 | * @param ancestorWhereLeftmost
76 | */
77 | private static void addTreeFeatures(Sentence sentence, Tree t, int level, List governingLabels, NodeFeatures parent, boolean isLeftmost, NodeFeatures ancestorWhereLeftmost) {
78 |
79 |
80 | if (t.isLeaf()) { /* terminal nodes */
81 | // get the current token represented by this subtree
82 | Token pToken = sentence.treeLookup.get(t);
83 |
84 | // check if token is null. this can happen if the token was unaligned previously (e.g., because of
85 | // a parser error)
86 | if (pToken == null) {
87 | if (StaticConfig.verbose)
88 | System.err.println(sentence.sentenceId + " Dropping tree without associated token: " + t + " ");
89 | return;
90 | }
91 |
92 | FeatureSet fs = pToken.boundaryFeatureSet;
93 |
94 | // leftmost feature (see Pareti paper for description)
95 | if (StaticConfig.constituentLeftmost && isLeftmost)
96 | fs.add(LEFTMOST_FEATURE);
97 |
98 | // level in tree
99 | if (StaticConfig.constituentLevel) {
100 | fs.add(LEVEL_FEATURE + level);
101 | addLevelBinHeuristic(pToken, LEVEL_FEATURE, level);
102 | }
103 |
104 | // leftmost feature label
105 | if (StaticConfig.constituentAncestorL) {
106 | fs.add(AL_FEATURE + "LBL:" + ancestorWhereLeftmost.label);
107 | fs.add(AL_FEATURE + "LVL:" + ancestorWhereLeftmost.level);
108 |
109 | addLevelBinHeuristic(pToken, AL_FEATURE + "LVL", ancestorWhereLeftmost.level);
110 | }
111 |
112 | // parent in constituent tree
113 | if (StaticConfig.constituentParent) {
114 | fs.add(PARENT_FEATURE + "LBL:" + parent.label);
115 | }
116 |
117 | // labels of all ancestors
118 | if (StaticConfig.constituentGoverning) { /* "Ancestor" features in the paper */
119 | for (NodeFeatures nf: governingLabels) {
120 | // label with and without depth
121 | fs.add(GOV_FEATURE + nf.label + "@" + nf.level); /* ambiguous in paper */
122 | fs.add(GOV_FEATURE + nf.label);
123 | fs.add(GOV_FEATURE + nf.label + "@-" + (level - nf.level)); /* ambiguous in paper */
124 |
125 | addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@", nf.level);
126 | addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@-", (level - nf.level));
127 | }
128 | }
129 | } else { // non-terminal node
130 | List childList = t.getChildrenAsList();
131 | String label = t.label().toString();
132 |
133 | // copy governing node features for next recursion step
134 | List governingLabelsUpdate = new LinkedList(governingLabels);
135 | governingLabelsUpdate.add(new NodeFeatures(label, level));
136 |
137 | // set leftmost ancestor
138 | if (ancestorWhereLeftmost == null) {
139 | ancestorWhereLeftmost = new NodeFeatures(label, level);
140 | }
141 |
142 | // check for pre-terminals -- otherwise, set the leftmost flag for the first constituent
143 | if (childList.size() > 1) {
144 | isLeftmost = true;
145 | }
146 |
147 | // call function for all children
148 | for (Tree child : childList) {
149 | addTreeFeatures(sentence, child, level + 1, governingLabelsUpdate, new NodeFeatures(label, level), isLeftmost, ancestorWhereLeftmost);
150 | isLeftmost = false;
151 | ancestorWhereLeftmost = null;
152 | }
153 | }
154 | }
155 |
156 | /**
157 | * Binning for levels
158 | * @param mToken
159 | * @param feature
160 | * @param value
161 | */
162 | private static void addLevelBinHeuristic(Token mToken, String feature, int value) {
163 | if (!StaticConfig.constituentBinning) return;
164 |
165 | FeatureSet fs = mToken.boundaryFeatureSet;
166 |
167 | int[] bins = new int[] {0, 1, 2, 3, 5, 7, 10, 13, 16, 20, 25, 40, 1000 };
168 |
169 | for (int i=0; i < bins.length - 1; i++) {
170 | int threshLower = bins[i];
171 | int threshUpper = bins[i + 1];
172 |
173 | // threshold satisfied? add bin feature!
174 | if (value <= threshUpper) {
175 | if (StaticConfig.constituentBinningStacked) {
176 | fs.add(feature + "(<=)" + threshLower);
177 | if (value >= threshLower)
178 | fs.add(feature + "(>=)" + threshLower);
179 | } else if (value > threshLower) {
180 | fs.add(feature + "(EXACT)" + threshLower);
181 | }
182 | }
183 | }
184 | }
185 |
186 |
187 | }
188 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceDependencyFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import java.util.List;
21 |
22 | import ims.cs.lingdata.Sentence;
23 | import ims.cs.lingdata.Token;
24 | import ims.cs.corenlp.Helper;
25 | import ims.cs.qsample.features.FeatureSet;
26 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
27 | import ims.cs.util.StaticConfig;
28 |
29 | /**
30 | * Token features derived from the dependency parse of a sentence
31 | */
32 | public abstract class SentenceDependencyFeatures {
33 |
34 | // feature names
35 | private static final String PARENT_REL_PREFIX = "PARENT-REL";
36 | private static final String PARENT_RELHEAD_PREFIX = "PARENT-REL+HD";
37 | private static final String CHILD_REL_PREFIX = "CHILD-REL";
38 | private static final String CHILD_RELHEAD_PREFIX = "CHILD-REL+HD";
39 |
40 | /**
41 | * Extract dependency features for all tokens in this sentence
42 | * @param sentence
43 | */
44 | public static void extract (Sentence sentence) {
45 | for (Token pToken : sentence.tokenList) {
46 | if (StaticConfig.dependencyParentRel || StaticConfig.dependencyParentRelHead) addParentFeature(pToken);
47 | if (StaticConfig.dependencyChildRel || StaticConfig.dependencyChildRelHead) addChildFeatures(pToken);
48 | }
49 | }
50 |
51 | /**
52 | * Add features about the parent of the token
53 | * @param token
54 | */
55 | private static void addParentFeature(Token token) {
56 | SemanticGraphEdge parentEdge = Helper.getDependencyParentRel(token);
57 |
58 | FeatureSet fs = token.boundaryFeatureSet;
59 |
60 | if (parentEdge != null) {
61 | // plain parent
62 | if (StaticConfig.dependencyParentRel)
63 | fs.add(PARENT_REL_PREFIX + "=" + parentEdge.getRelation());
64 |
65 | // parent and relation label
66 | if (StaticConfig.dependencyParentRelHead)
67 | fs.add(PARENT_RELHEAD_PREFIX + "=" + parentEdge.getRelation() + "," + parentEdge.getGovernor().lemma());
68 | }
69 | }
70 |
71 | /**
72 | * Add features about the child of a token
73 | * @param pcToken
74 | */
75 | private static void addChildFeatures(Token pcToken) {
76 | List childEdgeList = Helper.getDependencyChildrenRels(pcToken);
77 | FeatureSet fs = pcToken.boundaryFeatureSet;
78 |
79 | if (childEdgeList != null) {
80 | for (SemanticGraphEdge childEdge : childEdgeList) {
81 | // plain child
82 | if (StaticConfig.dependencyChildRel)
83 | fs.add(CHILD_REL_PREFIX + "=" + childEdge.getRelation());
84 |
85 | // child and relation label
86 | if (StaticConfig.dependencyChildRelHead)
87 | fs.add(CHILD_RELHEAD_PREFIX + "=" + childEdge.getRelation() + "," + childEdge.getDependent().lemma());
88 | }
89 | }
90 | }
91 |
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceFeaturesDerivedFromListCue.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
import ims.cs.lingdata.Sentence;
import ims.cs.lingdata.Token;
import ims.cs.corenlp.Helper;
import ims.cs.util.StaticConfig;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
27 |
28 | /**
29 | * Token features based on cue information from the noun cue list.
30 | */
31 | public abstract class SentenceFeaturesDerivedFromListCue {
32 |
33 | private static final String CUE_DEP_PREFIX = "CUE-DEP:NOUNCUE";
34 | private static final String CUE_PREFIX = "SENT:HASCUE:NOUNCUE";
35 |
36 | /**
37 | * Extract features for all tokens in the sentence
38 | * @param sentence
39 | */
40 | public static void extract (Sentence sentence) {
41 | boolean sentenceHasCueFeature = sentenceHasCue(sentence.tokenList);
42 |
43 | // check each token for noun-cue-ness, push features to its dependents (transitively)
44 | for (Token pToken : sentence.tokenList) {
45 | if (StaticConfig.dependencyCueDependent) {
46 | // token is in noun cue list
47 | if (pToken.boundaryFeatureSet.contains("NOUNCUELIST"))
48 | addCueDependentFeature("LIST", pToken, sentence);
49 |
50 | // token is "according to"
51 | if (pToken.predText.toLowerCase().equals("according")
52 | && pToken.nextToken != null
53 | && pToken.nextToken.predText.equals("to"))
54 | addCueDependentFeature("ACCORDINGTO", pToken, sentence);
55 | }
56 |
57 | SentenceIndicatorFeatures.addFeaturePositiveAndNegative(CUE_PREFIX, sentenceHasCueFeature, pToken);
58 | }
59 | }
60 |
61 | /**
62 | * Push features to all dependents of a cue
63 | * @param type
64 | * @param token
65 | * @param sentence
66 | */
67 | private static void addCueDependentFeature(String type, Token token, Sentence sentence) {
68 | List stack = new LinkedList();
69 | stack.add(token);
70 |
71 | // recursively iterate over all children (and their children ...)
72 | while (stack.size() > 0) {
73 | Token current = stack.remove(0);
74 | current.boundaryFeatureSet.add(CUE_DEP_PREFIX + "-" + type);
75 |
76 | List children = Helper.getDependencyChildren(current);
77 |
78 | if (children == null) continue;
79 |
80 | for (Token c : children) {
81 | if (c != null) stack.add(c);
82 | }
83 | }
84 | }
85 |
86 | /**
87 | * Check whether the sentence has any noun cues
88 | * @param data
89 | * @return
90 | */
91 | private static boolean sentenceHasCue(List data) {
92 | for (Token token: data) {
93 | if (token.boundaryFeatureSet.contains("NOUNCUELIST")) {
94 | return true;
95 | }
96 | }
97 | return false;
98 | }
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceIndicatorFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import ims.cs.lingdata.Sentence;
21 | import ims.cs.lingdata.Token;
22 | import ims.cs.corenlp.Helper;
23 | import ims.cs.qsample.features.FeatureSet;
24 | import ims.cs.util.StaticConfig;
25 |
26 | /**
27 | * Add sentence-level indicator features to each token
28 | */
29 | public abstract class SentenceIndicatorFeatures {
30 |
31 | // feature names
32 | private static final String QUOT_PREFIX = "SENT:QUOT";
33 | private static final String NE_PREFIX = "SENT:NE";
34 | private static final String PRO_PREFIX = "SENT:PRO";
35 | private static final String SL_PREFIX = "SL=";
36 | private static final String SL_LT_PREFIX = "SL<=";
37 | private static final String SL_GT_PREFIX = "SL>=";
38 | private static final String SL_EXACT_PREFIX = "SL-EXACT-BIN=";
39 | private static final String SENT_BEGIN_WINDOW = "SENT-BEGIN-WIN";
40 | private static final String SENT_END_WINDOW = "SENT-END-WIN";
41 | private static final String INVERT_PREFIX = "NOT:";
42 |
43 | /**
44 | * Extract indicator features for all tokens in this sentence
45 | * @param sentence
46 | */
47 | public static void extract (Sentence sentence) {
48 | // pre-compute features
49 | boolean sentenceHasQuotFeature = sentenceHasQuotationMark(sentence);
50 | boolean sentenceHasProFeature = sentenceHasPro(sentence);
51 | boolean sentenceHasNeFeature = sentenceHasNe(sentence);
52 | int sentenceLength = sentence.tokenList.size();
53 |
54 | // distance to sentence boundaries
55 | sentenceBoundDistance(sentence);
56 |
57 | // now add pre-computed features to token list
58 | for (Token mToken : sentence.tokenList) {
59 | if (StaticConfig.sentenceHasQuote) addFeaturePositiveAndNegative(QUOT_PREFIX, sentenceHasQuotFeature, mToken);
60 | if (StaticConfig.sentenceHasPronoun) addFeaturePositiveAndNegative(PRO_PREFIX, sentenceHasProFeature, mToken);
61 |
62 | if (StaticConfig.sentenceHasNe) addFeaturePositiveAndNegative(NE_PREFIX, sentenceHasNeFeature, mToken);
63 | if (StaticConfig.sentenceLength) {
64 | addLengthLogBinHeuristic(mToken, sentenceLength);
65 | mToken.boundaryFeatureSet.add(SL_PREFIX + sentenceLength);
66 | }
67 | }
68 | }
69 |
70 | /**
71 | * Add positive or negative version of a feature (i.e., also explicitly mark the absence of a feature)
72 | * @param featureName
73 | * @param featureOn
74 | * @param token
75 | */
76 | public static void addFeaturePositiveAndNegative(String featureName, boolean featureOn, Token token) {
77 | if (featureOn)
78 | token.boundaryFeatureSet.add(featureName);
79 | else
80 | token.boundaryFeatureSet.add(INVERT_PREFIX + featureName);
81 | }
82 |
83 | /**
84 | * Binning for lengths, exponential bin spacing
85 | * @param pToken
86 | * @param length
87 | */
88 | private static void addLengthLogBinHeuristic(Token pToken, int length) {
89 | if (!StaticConfig.sentenceLengthBinning) return;
90 |
91 | FeatureSet fs = pToken.boundaryFeatureSet;
92 |
93 | int[] bins = new int[] {0, 2, 4, 8, 16, 32, 64, 1000};
94 |
95 | for (int i=0; i < bins.length - 1; i++) {
96 | int threshLower = bins[i];
97 | int threshUpper = bins[i+1];
98 |
99 | if (length <= threshUpper) {
100 | if (StaticConfig.sentenceLengthBinningStacked) {
101 | fs.add(SL_LT_PREFIX + "STACKED-" + threshLower);
102 | } else if (length > threshLower) {
103 | fs.add(SL_EXACT_PREFIX + threshLower);
104 | }
105 | }
106 |
107 | if ((length >= threshLower) && StaticConfig.sentenceLengthBinningStacked) {
108 | fs.add(SL_GT_PREFIX + threshLower);
109 | }
110 | }
111 |
112 | }
113 |
114 | /**
115 | * Add features about the distance of each token to the sentence boundary
116 | * @param sentence
117 | */
118 | private static void sentenceBoundDistance(Sentence sentence) {
119 | int pos = 0;
120 | int sl = sentence.tokenList.size();
121 |
122 | for (Token token : sentence.tokenList) {
123 | // compute distance to end
124 | int endDist = sl - pos - 1;
125 |
126 | // if distance to either boundary is within a window of 5, add respective feature
127 | if (pos < 5) token.boundaryFeatureSet.add(SENT_BEGIN_WINDOW);
128 | if (endDist < 5) token.boundaryFeatureSet.add(SENT_END_WINDOW);
129 |
130 | pos++;
131 | }
132 | }
133 |
134 | /**
135 | * Determines whether a sentence contains a quotation mark
136 | * @param sentence
137 | * @return
138 | */
139 | private static boolean sentenceHasQuotationMark(Sentence sentence) {
140 | for (Token token: sentence.tokenList) {
141 | if (Helper.isQuote(token)) {
142 | return true;
143 | }
144 | }
145 | return false;
146 | }
147 |
148 | /**
149 | * Determines whether a sentence contains a pronoun
150 | * @param sentence
151 | * @return
152 | */
153 | private static boolean sentenceHasPro(Sentence sentence) {
154 | for (Token token: sentence.tokenList) {
155 | if (token.predPosTag.startsWith("PR")) {
156 | return true;
157 | }
158 | }
159 | return false;
160 | }
161 |
162 | /**
163 | * Determines whether a sentence contains a named entity
164 | * @param sentence
165 | * @return
166 | */
167 | private static boolean sentenceHasNe(Sentence sentence) {
168 | for (Token token: sentence.tokenList) {
169 | if ((token.predNer.startsWith("PERSON")) || (token.predNer.startsWith("ORGANIZATION"))) {
170 | return true;
171 | }
172 | }
173 | return false;
174 | }
175 |
176 |
177 |
178 |
179 | }
180 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenDictFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import ims.cs.lingdata.Token;
21 |
22 | import java.io.BufferedReader;
23 | import java.io.FileReader;
24 | import java.io.IOException;
25 | import java.util.HashMap;
26 | import java.util.HashSet;
27 | import java.util.Map;
28 | import java.util.Set;
29 |
30 | /**
31 | * Feature extractor that extracts information about a token from a dictionary (read from a tab-separated file)
32 | */
33 | public class TokenDictFeatures {
34 |
35 | private String featureName = "VERBDICT";
36 | private String listFileName;
37 | private Map> wordMap;
38 | public String posStart = null;
39 |
40 |
41 | /**
42 | * Set up the feature extractor
43 | * @param listFileName name of the dictionary file (tab-separated)
44 | * @param featureName name of the feature that will be extracted
45 | * @throws IOException
46 | */
47 | public TokenDictFeatures(String listFileName, String featureName) throws IOException {
48 | this.listFileName = listFileName;
49 | this.featureName = featureName;
50 | loadDictionary();
51 | }
52 |
53 | /**
54 | * Extract dictionary information for the token t
55 | * @param t
56 | */
57 | public void extract(Token t) {
58 | // check if the token's lemma is in the dictionary
59 | if (wordMap.containsKey(t.predLemma)) {
60 | // check for POS restriction if necessary
61 | if (posStart == null || t.predPosTag.startsWith(posStart)) {
62 | for (String vclass : wordMap.get(t.predLemma))
63 | t.boundaryFeatureSet.add(featureName + "=" + vclass);
64 | }
65 | }
66 | }
67 |
68 | /**
69 | * Load dictionary from a tab-separated file
70 | * @throws IOException
71 | */
72 | private void loadDictionary() throws IOException {
73 | wordMap = new HashMap<>();
74 |
75 | BufferedReader br = new BufferedReader(new FileReader(listFileName));
76 | String line;
77 |
78 | while ((line = br.readLine()) != null) {
79 | line = line.trim();
80 | String[] tokens = line.split("\\s+");
81 | String word = tokens[0];
82 | String wordClass = tokens[1];
83 |
84 | if (!wordMap.containsKey(word)) {
85 | wordMap.put(word, new HashSet());
86 | }
87 |
88 | wordMap.get(word).add(wordClass);
89 | }
90 |
91 | br.close();
92 |
93 | }
94 |
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenLexicalFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import ims.cs.lingdata.Token;
21 | import ims.cs.qsample.features.FeatureSet;
22 | import ims.cs.util.StaticConfig;
23 |
24 | /**
25 | * Extracts lexical information about a token (e.g., word, lemma, POS)
26 | */
27 | public abstract class TokenLexicalFeatures {
28 |
29 | private static final String TOK_PREFIX = "TOK";
30 | private static final String LEMMA_PREFIX = "LEMMA";
31 | private static final String POS_PREFIX = "POS";
32 | private static final String BG_PREFIX = "BG";
33 | private static final String NE_PREFIX = "NE";
34 | private static final String PARBEGIN_PREFIX = "PAR-BEGINS";
35 | private static final String PAREND_PREFIX = "PAR-ENDS";
36 |
37 |
38 | /**
39 | * Extract lexical features about a single token t
40 | * @param t
41 | */
42 | public static void extract(Token t) {
43 |
44 | if (StaticConfig.lexicalPos ||
45 | StaticConfig.lexicalLemma ||
46 | StaticConfig.lexicalToken)
47 | addWindowFeatures(t);
48 |
49 | if (StaticConfig.lexicalBigram) addBigramFeature(t);
50 | addNeFeature(t);
51 | addDocStructureFeature(t);
52 | }
53 |
54 | /**
55 | * Adds paragraph begin and end features
56 | * @param token
57 | */
58 | private static void addDocStructureFeature(Token token) {
59 | if (token.paragraphBegins) token.boundaryFeatureSet.add(PARBEGIN_PREFIX);
60 | if (token.nextToken == null || token.nextToken.paragraphBegins) token.boundaryFeatureSet.add(PAREND_PREFIX);
61 | }
62 |
63 | /**
64 | * Adds features about whether the token is part of a named entity
65 | * @param token
66 | */
67 | private static void addNeFeature(Token token) {
68 | if (!token.predNer.equals("?") && !token.predNer.equals("O")) {
69 | token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE");
70 | token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE-" + token.predNer);
71 | }
72 | }
73 |
74 | /**
75 | * Adds bigram features with the previous and next token
76 | * @param token
77 | */
78 | private static void addBigramFeature(Token token) {
79 | String prevWordForm;
80 | String prevLemma;
81 |
82 | String nextWordForm;
83 | String nextLemma;
84 |
85 | // find previous token
86 | if (token.previousToken == null) {
87 | prevWordForm = "null";
88 | prevLemma = "null";
89 | } else {
90 | Token prevToken = token.previousToken;
91 | prevWordForm = prevToken.predText;
92 | prevLemma = prevToken.predLemma;
93 | }
94 |
95 | // find next token
96 | if (token.nextToken == null) {
97 | nextWordForm = "null";
98 | nextLemma = "null";
99 | } else {
100 | Token nextToken = token.nextToken;
101 | nextWordForm = nextToken.predText;
102 | nextLemma = nextToken.predLemma;
103 | }
104 |
105 | // add features of word and lemma bigrams
106 | FeatureSet fs = token.boundaryFeatureSet;
107 |
108 | fs.add(BG_PREFIX + prevWordForm + "<--" + token.predText);
109 | fs.add(BG_PREFIX + "(LEMMA)" + prevLemma + "<--" + token.predLemma);
110 |
111 | fs.add(BG_PREFIX + nextWordForm + "-->" + token.predText);
112 | fs.add(BG_PREFIX + "(LEMMA)" + nextLemma + "-->" + token.predLemma);
113 | }
114 |
115 |
116 | /**
117 | * Adds features from other tokens within a window
118 | * @param pToken
119 | */
120 | private static void addWindowFeatures(Token pToken) {
121 | // current POS tag
122 | FeatureSet fs = pToken.boundaryFeatureSet;
123 |
124 | if (StaticConfig.lexicalPos) fs.add(POS_PREFIX + "-0=" + pToken.predPosTag);
125 | if (StaticConfig.lexicalToken) fs.add(TOK_PREFIX + "-0=" + pToken.predText);
126 | if (StaticConfig.lexicalLemma) fs.add(LEMMA_PREFIX + "-0=" + pToken.predLemma);
127 |
128 |
129 | // previous tokens
130 | Token currentToken = pToken;
131 | for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) {
132 | String leftPos;
133 | String leftTok;
134 | String leftLemma;
135 |
136 | Token prevToken = currentToken.previousToken;
137 | if (prevToken != null) {
138 | leftPos = prevToken.predPosTag;
139 | leftTok = prevToken.predText;
140 | leftLemma = prevToken.predLemma;
141 | currentToken = prevToken;
142 | } else {
143 | leftPos = "NONE";
144 | leftLemma = "NONE";
145 | leftTok = "NONE";
146 | }
147 |
148 | if (StaticConfig.lexicalPos) fs.add("WIN_" + POS_PREFIX + "-" + i + "=" + leftPos);
149 | if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "-" + i + "=" + leftTok);
150 | if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "-" + i + "=" + leftLemma);
151 | }
152 |
153 | // subsequent tokens
154 | currentToken = pToken;
155 | for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) {
156 | String rightPos;
157 | String rightTok;
158 | String rightLemma;
159 |
160 | Token nextToken = currentToken.nextToken;
161 | if (nextToken != null) {
162 | rightPos = nextToken.predPosTag;
163 | rightTok = nextToken.predText;
164 | rightLemma = nextToken.predLemma;
165 | currentToken = nextToken;
166 |
167 | } else {
168 | rightPos = "NONE";
169 | rightLemma = "NONE";
170 | rightTok = "NONE";
171 |
172 | }
173 |
174 | if (StaticConfig.lexicalPos) fs.add("WIN_" + POS_PREFIX + "+" + i + "=" + rightPos);
175 | if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "+" + i + "=" + rightTok);
176 | if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "+" + i + "=" + rightLemma);
177 |
178 | }
179 | }
180 |
181 |
182 |
183 | }
184 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenListFeatures.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.features.components;
19 |
20 | import java.io.BufferedReader;
21 | import java.io.FileReader;
22 | import java.io.IOException;
23 | import java.util.HashSet;
24 | import java.util.Set;
25 |
26 | import ims.cs.lingdata.Token;
27 |
28 |
29 | /**
30 | * Feature extractor that check whether a token is in a list (specified in a file)
31 | */
32 | public class TokenListFeatures {
33 |
34 | private String featureName = "VERBLIST";
35 | private String listFileName;
36 | private Set wordSet;
37 | private int window = 5;
38 | public String posStart = null;
39 |
40 |
41 | /**
42 | * Set up the feature extractor
43 | * @param listFileName list of words (one word per line)
44 | * @param featureName
45 | * @throws IOException
46 | */
47 | public TokenListFeatures(String listFileName, String featureName) throws IOException {
48 | this.listFileName = listFileName;
49 | this.featureName = featureName;
50 | loadWordList();
51 | }
52 |
53 | /**
54 | * Extract list feature for the token t
55 | * @param t
56 | */
57 | public void extract(Token t) {
58 | // current token
59 | if ((posStart == null || t.predPosTag.startsWith(posStart)) && wordSet.contains(t.predLemma)) {
60 | t.boundaryFeatureSet.add(featureName);
61 | }
62 |
63 | // window before the token
64 | Token prevToken = t;
65 | for (int i = 0; i < window; i++) {
66 | prevToken = prevToken.previousToken;
67 | if (prevToken == null) break;
68 | if (wordSet.contains(prevToken.predLemma)) {
69 | t.boundaryFeatureSet.add("WIN_-" + (i+1) + "-" + featureName);
70 | }
71 | }
72 |
73 | // window after the token
74 | Token nextToken = t;
75 | for (int i = 0; i < window; i++) {
76 | nextToken = nextToken.nextToken;
77 | if (nextToken == null) break;
78 | if (wordSet.contains(nextToken.predLemma)) {
79 | t.boundaryFeatureSet.add("WIN_+" + (i+1) + "-" + featureName);
80 | }
81 | }
82 | }
83 |
84 | /**
85 | * Loads the word list (one word per line)
86 | * @throws IOException
87 | */
88 | private void loadWordList() throws IOException {
89 | wordSet = new HashSet<>();
90 |
91 | BufferedReader br = new BufferedReader(new FileReader(listFileName));
92 | String line;
93 |
94 | while ((line = br.readLine()) != null) {
95 | line = line.trim();
96 | wordSet.add(line);
97 | }
98 |
99 | br.close();
100 |
101 | }
102 |
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/greedysample/HasScore.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.greedysample;
19 |
/**
 * Contract for objects that expose a numeric score.
 * Created by scheibcn on 11/5/15.
 */
public interface HasScore {

    /**
     * @return the score of this object
     */
    double getScore();

}
27 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/greedysample/Sampling.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 | package ims.cs.qsample.greedysample;
18 |
19 | import java.util.List;
20 | import java.util.Random;
21 |
22 | /**
23 | * Sample an element from a list of elements with a score.
24 | * Created by scheibcn on 11/5/15.
25 | */
26 | public class Sampling {
27 |
28 |
29 | Random random;
30 | public boolean doExp = true;
31 |
32 | public Sampling(Random random) {
33 | this.random = random;
34 | }
35 |
36 |
37 | /**
38 | * Sample an element proportionally to sigmoid-transformed scores
39 | * @param items
40 | */
41 | public int sampleOne(List items, double temperature, double bias) {
42 | double[] values = new double[items.size()];
43 | double sum = 0;
44 |
45 | // first compute scores and normalize
46 | for (int i = 0; i < values.length; i++) {
47 | double score = items.get(i).getScore();
48 | values[i] = (score + bias) / temperature;
49 |
50 | if (doExp) {
51 | values[i] = 1/(1+Math.exp(-values[i]));
52 | }
53 | sum += values[i];
54 | }
55 |
56 | // then sample proportionally
57 | double sumNorm = 0;
58 | double r = random.nextDouble();
59 | int resultPosition = 0;
60 |
61 | for (int i = 0; i < values.length; i++) {
62 | values[i] /= sum;
63 | sumNorm += values[i];
64 | if (sumNorm > r) {
65 | resultPosition = i;
66 | break;
67 | }
68 | }
69 |
70 | return resultPosition;
71 | }
72 |
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/models/HigherSpanModel.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.models;
19 |
20 | import ims.cs.qsample.features.FeatureSet;
21 | import ims.cs.qsample.perceptron.Perceptron;
22 | import ims.cs.qsample.spans.Span;
23 |
24 | import java.io.FileNotFoundException;
25 | import java.io.FileOutputStream;
26 | import java.io.PrintStream;
27 | import java.io.Serializable;
28 |
29 | /**
30 | * A model for scoring a whole span (rather than just begin and end information)
31 | * Created by scheibcn on 3/5/16.
32 | */
33 | public class HigherSpanModel implements Serializable {
34 |
35 | private static final long serialVersionUID = 3509778136938744648L;
36 |
37 | // We actually make separate models for begin, end, and span-level information.
38 | // This makes feature management easier, among other things.
39 | Perceptron beginPerceptron;
40 | Perceptron endPerceptron;
41 | Perceptron higherOrderPerceptron;
42 |
43 | public HigherSpanModel() {
44 | this.beginPerceptron = new Perceptron();
45 | this.endPerceptron = new Perceptron();
46 | this.higherOrderPerceptron = new Perceptron();
47 | }
48 |
49 | /**
50 | * Computes the current score of a span according to the model
51 | * @param span
52 | * @param average use averaged perceptron?
53 | * @return
54 | */
55 | public double score(Span span, boolean average) {
56 | // we handle the begin, end, and span features separately
57 | FeatureSet beginFeatures = span.first().boundaryFeatureSet;
58 | FeatureSet endFeatures = span.last().boundaryFeatureSet;
59 | FeatureSet spanFeatures = span.featureSet;
60 |
61 | // ... then, we can compute three individual scores
62 | double score = 0;
63 | score += beginPerceptron.score(beginFeatures, average);
64 | score += endPerceptron.score(endFeatures, average);
65 | score += higherOrderPerceptron.score(spanFeatures, average);
66 |
67 | return score;
68 | }
69 |
70 | /**
71 | * Train the model using a given span, updating with a specified learning rate
72 | * @param span
73 | * @param isPositive Has the example been correctly classified?
74 | * @param rate learning rate
75 | */
76 | public void train(Span span, boolean isPositive, double rate) {
77 | FeatureSet leftFeatures = span.first().boundaryFeatureSet;
78 | FeatureSet rightFeatures = span.last().boundaryFeatureSet;
79 | FeatureSet spanFeatures = span.featureSet;
80 |
81 | // negate the learning rate if the example was wrong
82 | double effectiveRate = rate;
83 | if (!isPositive) effectiveRate = -effectiveRate;
84 |
85 | // update the three models separately
86 | // (use the update function directly as the train function would first check the score, which is nonsensical
87 | // for the individual models)
88 | beginPerceptron.update(leftFeatures, effectiveRate);
89 | endPerceptron.update(rightFeatures, effectiveRate);
90 | higherOrderPerceptron.update(spanFeatures, effectiveRate);
91 | }
92 |
93 |
94 | /**
95 | * Writes the current feature weights to a file
96 | * @param fileName
97 | * @throws FileNotFoundException
98 | */
99 | public void printWeights(String fileName) throws FileNotFoundException {
100 | FileOutputStream fos = new FileOutputStream(fileName);
101 | PrintStream ps = new PrintStream(fos);
102 | beginPerceptron.printWeights(ps, "BEGIN");
103 | endPerceptron.printWeights(ps, "END");
104 | higherOrderPerceptron.printWeights(ps, "HIGHER");
105 | ps.close();
106 | }
107 |
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/perceptron/Perceptron.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.perceptron;
19 |
20 | import ims.cs.qsample.features.FeatureSet;
21 | import ims.cs.qsample.spans.Span;
22 |
23 | import java.io.FileNotFoundException;
24 | import java.io.FileOutputStream;
25 | import java.io.PrintStream;
26 | import java.io.Serializable;
27 | import java.util.Map;
28 |
29 | /**
30 | * Implementation of perceptron model
31 | * Created by scheibcn on 11/5/15.
32 | */
33 | public class Perceptron implements Serializable {
34 | private static final long serialVersionUID = 3436601656314837271L;
35 |
36 | // this model can actually also do logistic regression
37 | public enum UpdateType {PERCEPTRON, LR};
38 |
39 | // .. default is perceptron
40 | public UpdateType updateType = UpdateType.PERCEPTRON;
41 |
42 | public Weights weights = new Weights();
43 |
44 | // parameters
45 | public double fixedBias = 0; /* optional bias that can be manually adjusted */
46 | public double marginPositive = 1; /* margin for positive class */
47 | public double marginNegative = 1; /* margin for negative class */
48 |
49 | // some debugging data
50 | public int numUpdates = 0;
51 |
52 |
53 | public Perceptron() {
54 | weights.weightMap.put("BIAS", 0.0);
55 | }
56 |
57 |
58 | /**
59 | * Score a feature set
60 | * @param featureSet
61 | * @return
62 | */
63 | public double score(FeatureSet featureSet, boolean average) {
64 | double score = 0;
65 |
66 | // first, add bias
67 | if (average) {
68 | score += weights.getAvg("BIAS");
69 | score += fixedBias;
70 | } else {
71 | score += weights.get("BIAS");
72 | }
73 |
74 | // then, score all features in the data
75 | for (String feature: featureSet) {
76 | if (average) {
77 | score += weights.getAvg(feature);
78 | } else {
79 | score += weights.get(feature);
80 | }
81 | }
82 |
83 | return score;
84 | }
85 |
86 | /**
87 | * Perform an update with a given training example
88 | * @param featureSet
89 | * @param isPositive is this example a positive one?
90 | * @param rate
91 | */
92 | public void train(FeatureSet featureSet, boolean isPositive, double rate) {
93 | if (updateType == UpdateType.PERCEPTRON)
94 | trainPerceptron(featureSet, isPositive, rate); /* perceptron update */
95 | else if (updateType == UpdateType.LR)
96 | trainLr(featureSet, isPositive, rate); /* logistic regression update */
97 | }
98 |
99 | /**
100 | * Perform a perceptron-style update
101 | * @param featureSet
102 | * @param isPositive
103 | * @param rate
104 | */
105 | public void trainPerceptron(FeatureSet featureSet, boolean isPositive, double rate) {
106 | double predScore = score(featureSet, false);
107 |
108 | if (isPositive && predScore - marginPositive <= 0) { /* positive example and negative margin violation */
109 | update(featureSet, rate);
110 | } else if (!isPositive && predScore + marginNegative > 0) { /* negative example and positive margin violation */
111 | update(featureSet, -rate);
112 | }
113 | }
114 |
115 | /**
116 | * Perform a logistic regression update
117 | * @param featureSet
118 | * @param isPositive
119 | * @param rate
120 | */
121 | public void trainLr(FeatureSet featureSet, boolean isPositive, double rate) {
122 | double predScore = score(featureSet, false);
123 |
124 | // true probability of the example?
125 | int trueProb;
126 | if (isPositive) trueProb = 1;
127 | else trueProb = 0;
128 |
129 | // learning rate times LR gradient
130 | double step = rate * (trueProb - sigmoid(predScore));
131 |
132 | update(featureSet, step);
133 | }
134 |
135 |
136 | /**
137 | * Update the weights for each feature by the given rate
138 | * @param featureSet
139 | * @param rate
140 | */
141 | public void update(FeatureSet featureSet, double rate) {
142 | // bias
143 | weights.update("BIAS", rate);
144 |
145 | // features
146 | for (String feature : featureSet) {
147 | weights.update(feature, rate);
148 | }
149 |
150 | numUpdates++;
151 | }
152 |
153 | /**
154 | * Print the weights for the features of the span to debug
155 | * @param span
156 | * @param prefix
157 | */
158 | public void printInfo(Span span, String prefix) {
159 | for (String feature: span.featureSet) {
160 | double weight = weights.get(feature);
161 | System.out.println(prefix + feature + " " + weight);
162 | }
163 | }
164 |
165 | /**
166 | * Write the current feature weights to a file
167 | * @param fileName
168 | * @throws FileNotFoundException
169 | */
170 | public void printWeights(String fileName) throws FileNotFoundException {
171 | FileOutputStream fos = new FileOutputStream(fileName);
172 | PrintStream ps = new PrintStream(fos);
173 | printWeights(ps, "");
174 | }
175 |
176 | /**
177 | * Print the current feature weights to stdout
178 | */
179 | public void printWeights() {
180 | printWeights(System.out, "");
181 | }
182 |
183 | /**
184 | * Write the current feature weights to a stream, prepend each line with the specified prefix
185 | * @param out
186 | * @param prefix
187 | */
188 | public void printWeights(PrintStream out, String prefix) {
189 | for (Map.Entry entry : weights.weightMap.entrySet()) {
190 | out.println(prefix + "-->" + entry.getKey() + "\t" + entry.getValue());
191 | }
192 | }
193 |
194 |
195 | /**
196 | * Calculate the sigmoid of x
197 | * @param x
198 | * @return
199 | */
200 | public static double sigmoid(double x) {
201 | return 1/(1+Math.exp(-x));
202 | }
203 |
204 | }
205 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/perceptron/Weights.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.perceptron;
19 |
20 | import java.io.Serializable;
21 | import java.util.HashMap;
22 | import java.util.Map;
23 |
/**
 * Store a set of weights associated to strings
 * Created by scheibcn on 11/5/15.
 */
public class Weights implements Serializable {
    private static final long serialVersionUID = 2945274488514737545L;

    // map for holding weights
    Map<String, Double> weightMap;

    // map for storing the weight history for averaging
    // for a clean description of the algorithm,
    // see for example Chapter 3 in Hal Daume's "A Course in Machine Learning"
    Map<String, Double> weightCacheMap;

    public boolean doAveraging = true;
    // number of averaged updates performed so far
    // (advances once per call to update(), i.e. once per feature update)
    int averagingCoefficient = 0;

    public Weights() {
        // allocate some large maps
        weightMap = new HashMap<>(100000);
        weightCacheMap = new HashMap<>(100000);
    }

    /**
     * Resets all weights to 0
     */
    public void resetWeights() {
        averagingCoefficient = 0;
        weightMap.clear();
        weightCacheMap.clear();
    }

    /**
     * Get the most recent weight of a feature. Returns 0 if the feature is unknown.
     * @param feature feature name
     * @return current weight, or 0 for an unknown feature
     */
    public double get(String feature) {
        Double weight = weightMap.get(feature);
        return (weight == null) ? 0 : weight;
    }

    /**
     * Get the averaged weight of a feature. Returns 0 if the feature is unknown.
     * If no averaged update has happened yet (averaging switched off, or a weight that was
     * put into the map directly), there is nothing to average over and the most recent
     * weight is returned instead of dividing by zero.
     * @param feature feature name
     * @return averaged weight, or 0 for an unknown feature
     */
    public double getAvg(String feature) {
        Double weight = weightMap.get(feature);
        if (weight == null) {
            return 0;
        }

        Double cache = weightCacheMap.get(feature);
        if (cache == null || averagingCoefficient == 0) {
            // no history for this feature: the average equals the current weight
            return weight;
        }

        return weight - (cache / averagingCoefficient);
    }

    /**
     * Update the weight of a feature by value
     * @param feature feature name
     * @param value amount added to the feature's weight (may be negative)
     */
    public void update(String feature, double value) {
        // update the weight of the feature
        Double current = weightMap.get(feature);
        weightMap.put(feature, (current == null) ? value : current + value);

        // also add to averaging map if averaging is on
        if (doAveraging) {
            double weightedValue = value * averagingCoefficient;
            Double cached = weightCacheMap.get(feature);
            weightCacheMap.put(feature, (cached == null) ? weightedValue : cached + weightedValue);

            averagingCoefficient++;
        }
    }
}
110 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/Common.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.run;
19 |
20 | import ims.cs.lingdata.Document;
21 | import ims.cs.parc.ProcessedCorpus;
22 | import ims.cs.qsample.features.SpanFeatures;
23 | import ims.cs.qsample.models.QuotationPerceptrons;
24 | import ims.cs.qsample.spans.Span;
25 | import ims.cs.util.StaticConfig;
26 |
27 | import java.io.*;
28 | import java.util.List;
29 | import java.util.zip.GZIPInputStream;
30 | import java.util.zip.GZIPOutputStream;
31 |
32 | /**
33 | * Some common functions
34 | * Created by scheibcn on 3/5/16.
35 | */
36 | public abstract class Common {
37 | /**
38 | * Writes the predictions to a file in BIO format
39 | * @param trainDocs
40 | * @param testDocs
41 | * @param valDocs
42 | * @param resDocs
43 | */
44 | public static void writePredictionsToFile(List trainDocs, List testDocs, List valDocs, List resDocs) {
45 | // if in text mode, write empty line after sentence ends and write cues
46 | boolean writeNewLineAfterSentence = StaticConfig.cliMode == StaticConfig.CliMode.TEXT;
47 | boolean writeCues = StaticConfig.cliMode == StaticConfig.CliMode.TEXT;
48 |
49 | // try to write predictions
50 | try {
51 | if (trainDocs != null) ProcessedCorpus.savePredictionsToFile(trainDocs, "train-final", writeNewLineAfterSentence, writeCues);
52 | if (testDocs != null) ProcessedCorpus.savePredictionsToFile(testDocs, "test-final", writeNewLineAfterSentence, writeCues);
53 | if (valDocs != null) ProcessedCorpus.savePredictionsToFile(valDocs, "val-final", writeNewLineAfterSentence, writeCues);
54 | if (resDocs != null) ProcessedCorpus.savePredictionsToFile(resDocs, "res-final", writeNewLineAfterSentence, writeCues);
55 | } catch (IOException e) {
56 | e.printStackTrace();
57 | System.out.println("Unable to write results to file");
58 | }
59 |
60 |
61 | }
62 |
63 | /**
64 | * Writes out all perceptron models
65 | * @param perceptrons
66 | * @param fileName
67 | * @throws IOException
68 | */
69 | public static void serializeModels(QuotationPerceptrons perceptrons, String fileName) throws IOException {
70 | System.out.println("Writing perceptron model to " + fileName);
71 | ObjectOutputStream outputStream = new ObjectOutputStream (new GZIPOutputStream(new FileOutputStream(fileName)));
72 | outputStream.writeObject(perceptrons);
73 | }
74 |
75 | /**
76 | * Reads all perceptron models from a file
77 | * @param fileName
78 | * @return
79 | * @throws IOException
80 | * @throws ClassNotFoundException
81 | */
82 | public static QuotationPerceptrons deserializeModels(String fileName) throws IOException, ClassNotFoundException {
83 | System.out.println("Loading perceptron model from " + fileName);
84 | ObjectInputStream inputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(fileName)));
85 | return (QuotationPerceptrons) inputStream.readObject();
86 | }
87 |
88 | /**
89 | * Adds features to gold spans
90 | * @param documents
91 | */
92 | public static void addFeaturesToGoldSpans(List documents) {
93 | for (Document document : documents) {
94 | for (Span goldSpan : document.goldSpanSet) {
95 | SpanFeatures.addAllSpanFeatures(goldSpan);
96 | }
97 | }
98 | }
99 |
100 | public static String pathConcat (String path, String subDir) {
101 | return new File(new File(path), subDir).toString();
102 | }
103 |
104 |
105 | }
106 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/PlainTextCorpusReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.run;
19 |
20 | import ims.cs.lingdata.*;
21 | import ims.cs.parc.ProcessedCorpus;
22 | import ims.cs.util.StaticConfig;
23 | import org.xml.sax.SAXException;
24 |
25 | import javax.xml.parsers.ParserConfigurationException;
26 | import java.io.*;
27 | import java.util.ArrayList;
28 | import java.util.Arrays;
29 | import java.util.List;
30 |
31 | /**
32 | * Created by scheibcn on 6/1/16.
33 | */
34 | public class PlainTextCorpusReader {
35 |
36 | /**
37 | * Read document, one sentence per line
38 | * @param file
39 | * @return
40 | */
41 | public static Document readDocument(File file) throws IOException {
42 | StringBuilder sb = new StringBuilder();
43 | BufferedReader reader = new BufferedReader(new FileReader(file));
44 |
45 | // read all text from file
46 | String line;
47 | while ((line = reader.readLine()) != null) {
48 | sb.append(line);
49 | sb.append('\n');
50 | }
51 |
52 | // build a document with some bogus structure
53 | String text = sb.toString();
54 |
55 | Document d = new Document();
56 | Sentence s = new Sentence();
57 | Token t = new Token();
58 |
59 | // add text and set byte count
60 | t.goldText = text;
61 | t.goldByteCount = new ByteCount(0, t.goldText.length());
62 |
63 | // bookkeeping
64 | s.tokenList = new ArrayList<>();
65 | s.tokenList.add(t);
66 |
67 | d.sentenceList = new ArrayList<>();
68 | d.sentenceList.add(s);
69 |
70 | d.tokenList = new ArrayList<>();
71 | d.tokenList.add(t);
72 | d.text = text;
73 |
74 | // build a document id from the file and directory names
75 | d.docId = new PlainTextDocId(file.getParentFile().getName(), file.getName());
76 |
77 | reader.close();
78 |
79 | return d;
80 | }
81 |
82 | public static ProcessedCorpus readDocuments(String directory) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
83 | List documentList = new ArrayList<>();
84 |
85 | // import all files in the directory
86 | File dir = new File(directory);
87 | File[] files = dir.listFiles();
88 | Arrays.sort(files);
89 |
90 | for (File file : files) {
91 | if (StaticConfig.verbose) System.out.println(file);
92 | Document document = readDocument(file);
93 | documentList.add(document);
94 | }
95 |
96 | PlainTextCorpus corpus = new PlainTextCorpus(documentList);
97 |
98 | return new ProcessedCorpus(corpus);
99 | }
100 |
101 |
102 | public static void pipeline() {
103 |
104 | }
105 |
106 | public static Document dummyDocument () {
107 | Document d = new Document();
108 | Sentence s = new Sentence();
109 | Token t = new Token();
110 |
111 | t.goldText = "\"I am very disappointed,\" said Dr. Miller.\n Futher, he reported that everything was fine.";
112 | t.goldByteCount = new ByteCount(0, t.goldText.length());
113 |
114 | s.tokenList = new ArrayList<>();
115 | s.tokenList.add(t);
116 |
117 | d.sentenceList = new ArrayList<>();
118 | d.sentenceList.add(s);
119 |
120 | d.tokenList = new ArrayList<>();
121 | d.tokenList.add(t);
122 | d.text = t.goldText;
123 |
124 | d.docId = new PlainTextDocId("dummyTestDirectory1", "dummyTestFile1");
125 |
126 | return d;
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/RunCrf.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.run;
19 |
20 | import ims.cs.lingdata.Document;
21 | import ims.cs.parc.PARCCorpus;
22 | import ims.cs.parc.ProcessedCorpus;
23 | import ims.cs.qsample.evaluate.EvaluateSpan;
24 | import ims.cs.qsample.models.CrfClassifier;
25 | import ims.cs.qsample.models.QuotationPerceptrons;
26 | import ims.cs.qsample.perceptron.PerceptronTrainer;
27 | import ims.cs.util.MultiOutputStream;
28 | import ims.cs.util.NewStaticPrinter;
29 | import ims.cs.util.StaticConfig;
30 | import org.xml.sax.SAXException;
31 |
32 | import javax.xml.parsers.ParserConfigurationException;
33 | import java.io.IOException;
34 | import java.util.List;
35 |
36 | /**
37 | * Run an experiment with a CRF model
38 | * Created by scheibcn on 3/3/16.
39 | */
40 | public class RunCrf {
41 |
42 |
43 | /**
44 | * Run the full CRF training and testing pipeline
45 | * @param trainDocs training documents
46 | * @param testDocs test documents (may be null)
47 | * @param valDocs validation documents (may be null)
48 | * @param resDocs resubstitution documents (may be null)
49 | * @param beginMargin positive margin for begin perceptron
50 | * @param endMargin positive margin for end perceptron
51 | * @param cueMargin positive margin for cue perceptron
52 | * @param numIter number of epochs for training
53 | * @param perceptrons optionally: specify some pre-trained perceptrons
54 | * @param crfClassifier optionally: specify a pre-trained CRF
55 | * @return final CRF model
56 | * @throws IOException
57 | * @throws ClassNotFoundException
58 | */
59 | public static CrfClassifier runCrfPipeline(List trainDocs, List testDocs, List valDocs, List resDocs,
60 | double beginMargin, double endMargin, double cueMargin,
61 | int numIter, QuotationPerceptrons perceptrons, CrfClassifier crfClassifier) throws IOException, ClassNotFoundException {
62 |
63 | // train a cue model if necessary, then predict
64 | if (perceptrons == null) {
65 | PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, true, 10, 10);
66 | } else {
67 | perceptrons.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs);
68 | perceptrons.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs);
69 | }
70 |
71 | // train CRF
72 | if (crfClassifier == null) {
73 | crfClassifier = new CrfClassifier();
74 | crfClassifier.numIter = numIter;
75 | crfClassifier.train(trainDocs, testDocs, valDocs, resDocs);
76 | }
77 |
78 | // apply CRF
79 | System.out.println("Applying CRF to test data");
80 | crfClassifier.test(trainDocs, testDocs, valDocs, resDocs);
81 |
82 | // evaluate
83 | EvaluateSpan.evaluateAndPrint("", "|", trainDocs, testDocs, valDocs, resDocs);
84 |
85 | // save predictions
86 | Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs);
87 |
88 | // output feature weights
89 | // this takes a lot of time, so it's deactivated right now
90 | if (false) crfClassifier.print();
91 |
92 | return crfClassifier;
93 | }
94 |
95 | /**
96 | * This runs the full experimental pipeline w/ training and testing
97 | * @return
98 | * @throws ClassNotFoundException
99 | * @throws SAXException
100 | * @throws ParserConfigurationException
101 | * @throws IOException
102 | */
103 | public static CrfClassifier fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
104 | ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance());
105 | List trainDocs = pc.getTrain();
106 | List testDocs = pc.getTest();
107 | List valDocs = pc.getDev();
108 | List resDocs = pc.getTrainSample(10);
109 |
110 | return runCrfPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, 500, null, null);
111 | }
112 |
113 | /**
114 | * Running this program will train the CRF model as described in the paper
115 | * @param args
116 | * @throws ClassNotFoundException
117 | * @throws SAXException
118 | * @throws ParserConfigurationException
119 | * @throws IOException
120 | */
121 | public static void main(String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
122 | String logFileName = NewStaticPrinter.getLogFileName(Common.pathConcat(StaticConfig.outputDirectory, "crf-"));
123 | NewStaticPrinter.init(logFileName);
124 | MultiOutputStream.init(logFileName);
125 |
126 | CrfClassifier crf = fullExperiment();
127 | crf.saveCrf(logFileName + ".crfmodel");
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/RunHeuristicTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.run;
19 |
20 | import ims.cs.lingdata.Document;
21 | import ims.cs.parc.PARCCorpus;
22 | import ims.cs.parc.ProcessedCorpus;
23 | import ims.cs.qsample.evaluate.EvaluateSpan;
24 | import ims.cs.qsample.greedysample.HeuristicSampler;
25 | import ims.cs.qsample.models.QuotationPerceptrons;
26 | import ims.cs.qsample.perceptron.PerceptronTrainer;
27 | import ims.cs.util.MultiOutputStream;
28 | import ims.cs.util.NewStaticPrinter;
29 | import ims.cs.util.StaticConfig;
30 | import org.xml.sax.SAXException;
31 |
32 | import javax.xml.parsers.ParserConfigurationException;
33 | import java.io.IOException;
34 | import java.util.List;
35 |
36 | /**
37 | * Run an experiment with the greedy heuristic model
38 | * Created by scheibcn on 11/5/15.
39 | */
40 | public class RunHeuristicTest {
41 |
42 | // whether to shuffle tokens during prediction
43 | static boolean doShuffleTokens = false;
44 | static boolean incrementalPrediction = false;
45 |
46 | /**
47 | * Run the full greedy heuristic training and testing pipeline
48 | * @param trainDocs training documents
49 | * @param testDocs test documents (may be null)
50 | * @param valDocs validation documents (may be null)
51 | * @param resDocs resubstitution documents (may be null)
52 | * @param beginMargin positive margin for begin perceptron
53 | * @param endMargin positive margin for end perceptron
54 | * @param cueMargin positive margin for cue perceptron
55 | * @param model optionally: specify some pre-trained perceptrons
56 | * @return final perceptron models
57 | */
58 | public static QuotationPerceptrons runHeuristicPipeline(List trainDocs, List testDocs, List valDocs, List resDocs,
59 | double beginMargin, double endMargin, double cueMargin, QuotationPerceptrons model) {
60 |
61 | // train model or predict
62 | if (model == null) {
63 | model = PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, false, 10, 10);
64 | } else {
65 | model.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs);
66 | model.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs);
67 | }
68 |
69 |
70 | // debug output
71 | for (Document document: testDocs) NewStaticPrinter.printPerceptronPrediction(document, "PP");
72 | NewStaticPrinter.printN("-", 80);
73 |
74 | // SAMPLING
75 | HeuristicSampler sampler = new HeuristicSampler();
76 | sampler.doShuffleTokens = doShuffleTokens;
77 |
78 |
79 | int[] maxDistances;
80 | int[] maxLengths;
81 |
82 | if (incrementalPrediction) { /* version 1: incremental prediction -- performs slightly worse */
83 | maxDistances = new int[]{5, 10, 20, 30};
84 | maxLengths = new int[]{50, 50, 50, 50};
85 | } else { /* version 2: full prediction immediately */
86 | maxDistances = new int[]{30};
87 | maxLengths = new int[]{50};
88 | }
89 |
90 | for (int i = 0; i < maxDistances.length; i++) {
91 | // sample
92 | int maxDistance = maxDistances[i];
93 | int maxLength = maxLengths[i];
94 |
95 | if (trainDocs != null) sampler.sampleGreedy(trainDocs, maxDistance, maxLength);
96 | if (testDocs != null) sampler.sampleGreedy(testDocs, maxDistance, maxLength);
97 | if (valDocs != null) sampler.sampleGreedy(valDocs, maxDistance, maxLength);
98 | if (resDocs != null) sampler.sampleGreedy(resDocs, maxDistance, maxLength);
99 |
100 | // evaluate
101 | EvaluateSpan.evaluateAndPrint("" + maxDistance + " ", "|", trainDocs, testDocs, valDocs, resDocs);
102 | }
103 |
104 | // save predictions
105 | Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs);
106 |
107 | return model;
108 | }
109 |
110 | /**
111 | * This runs the full experimental pipeline w/ training and testing
112 | * @throws ClassNotFoundException
113 | * @throws SAXException
114 | * @throws ParserConfigurationException
115 | * @throws IOException
116 | */
117 | public static void fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
118 | ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance());
119 | List trainDocs = pc.getTrain();
120 | List testDocs = pc.getTest();
121 | List valDocs = pc.getDev();
122 | List resDocs = pc.getTrainSample(10);
123 |
124 | runHeuristicPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, null);
125 | }
126 |
127 | /**
128 | * Run this to train a model without going through QSample.main()
129 | * @param args
130 | * @throws ClassNotFoundException
131 | * @throws SAXException
132 | * @throws ParserConfigurationException
133 | * @throws IOException
134 | */
135 | public static void main (String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
136 | String logFileName = NewStaticPrinter.getLogFileName("/home/users1/scheibcn/quotations/results/txt/joint-first-run/heuristic-");
137 | MultiOutputStream.init(logFileName);
138 | NewStaticPrinter.init(logFileName);
139 |
140 | fullExperiment();
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/spans/SpanBegin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.spans;
19 |
20 | import ims.cs.qsample.greedysample.HasScore;
21 |
22 | /**
23 | * Representation of a span begin. It is useful to have this as a separate class since this makes sampling easier.
24 | * Created by scheibcn on 11/5/15.
25 | */
26 | public class SpanBegin implements HasScore {
27 | // each begin has a position and a score
28 | public int position;
29 | public Double score = null;
30 |
31 | public SpanBegin(int position, double score) {
32 | this.position = position;
33 | this.score = score;
34 | }
35 |
36 | public SpanBegin(int position) {
37 | this.position = position;
38 | }
39 |
40 | public double getScore() {
41 | return score;
42 | }
43 |
44 | @Override
45 | public String toString() {
46 | return "SpanBegin(pos=" + position + ",score=" + score + ")";
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/spans/SpanEnd.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.qsample.spans;
19 |
20 | import ims.cs.qsample.greedysample.HasScore;
21 |
22 | /**
23 | * Representation of a span end. It is useful to have this as a separate class since this makes sampling easier.
24 | * Created by scheibcn on 11/5/15.
25 | */
26 | public class SpanEnd implements HasScore {
27 | // each end has a position and a score
28 | public int position;
29 | public Double score = null;
30 |
31 | public SpanEnd(int position, double score) {
32 | this.position = position;
33 | this.score = score;
34 | }
35 |
36 | public SpanEnd(int position) {
37 | this.position = position;
38 | }
39 |
40 | public double getScore() {
41 | return score;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/util/MultiOutputStream.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.util;
19 |
20 | import java.io.*;
21 | import java.text.DateFormat;
22 | import java.text.SimpleDateFormat;
23 | import java.util.Date;
24 |
/**
 * An extension of the default output stream that provides functionality to write to multiple streams at once.
 * We can use this to "tee" standard out and standard error into a file, which makes for a cheap and somewhat dirty
 * logging alternative.
 *
 * Adapted from http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to
 */
public class MultiOutputStream extends OutputStream
{

    // the streams that receive a copy of everything written to this stream
    OutputStream[] outputStreams;

    /**
     * @param outputStreams the streams to forward all output to, in forwarding order
     */
    public MultiOutputStream(OutputStream... outputStreams)
    {
        this.outputStreams = outputStreams;
    }

    /** Writes the byte to every underlying stream. */
    @Override
    public void write(int b) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b);
    }

    /** Writes the whole array to every underlying stream. */
    @Override
    public void write(byte[] b) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b);
    }

    /** Writes {@code len} bytes of the array, starting at {@code off}, to every underlying stream. */
    @Override
    public void write(byte[] b, int off, int len) throws IOException
    {
        for (OutputStream out : outputStreams)
            out.write(b, off, len);
    }

    /**
     * Flushes every underlying stream. A failing stream does not prevent the remaining
     * ones from being flushed; the first failure is rethrown afterwards with any
     * further failures attached as suppressed exceptions.
     */
    @Override
    public void flush() throws IOException
    {
        IOException firstFailure = null;
        for (OutputStream out : outputStreams) {
            try {
                out.flush();
            } catch (IOException e) {
                if (firstFailure == null) firstFailure = e;
                else firstFailure.addSuppressed(e);
            }
        }
        if (firstFailure != null) throw firstFailure;
    }

    /**
     * Closes every underlying stream. A failing stream does not prevent the remaining
     * ones from being closed; the first failure is rethrown afterwards with any
     * further failures attached as suppressed exceptions.
     */
    @Override
    public void close() throws IOException
    {
        IOException firstFailure = null;
        for (OutputStream out : outputStreams) {
            try {
                out.close();
            } catch (IOException e) {
                if (firstFailure == null) firstFailure = e;
                else firstFailure.addSuppressed(e);
            }
        }
        if (firstFailure != null) throw firstFailure;
    }

    /**
     * Write stdout and stderr to two separate files, in addition to the console.
     * If either file cannot be opened, the stack trace is printed and
     * System.out / System.err are left unchanged.
     * @param fnOut file receiving a copy of standard out
     * @param fnErr file receiving a copy of standard error
     */
    public static void init(String fnOut, String fnErr) {
        System.out.println("Logging stdout to: " + fnOut);
        System.out.println("Logging stderr to: " + fnErr);

        FileOutputStream fout = null;
        try
        {
            fout = new FileOutputStream(fnOut);
            FileOutputStream ferr = new FileOutputStream(fnErr);

            // the tees wrap the current System.out / System.err before these are replaced
            MultiOutputStream multiOut = new MultiOutputStream(System.out, fout);
            MultiOutputStream multiErr = new MultiOutputStream(System.err, ferr);

            PrintStream stdout = new PrintStream(multiOut);
            PrintStream stderr = new PrintStream(multiErr);

            System.setOut(stdout);
            System.setErr(stderr);
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();

            // only the file constructors can fail, so nothing is redirected yet;
            // release the first file if the second one could not be opened
            if (fout != null) {
                try {
                    fout.close();
                } catch (IOException ignored) {
                    // nothing was written to the file; a failed close loses no data
                }
            }
        }

    }

    /**
     * Write stdout and stderr into the same file, in addition to the console.
     * Both are routed through one shared stream, so error output also appears on the
     * original standard out. If the file cannot be opened, the stack trace is printed
     * and System.out / System.err are left unchanged.
     * @param fnOutAndErr file receiving a copy of standard out and standard error
     */
    public static void init(String fnOutAndErr) {
        System.out.println("Logging all output to: " + fnOutAndErr);

        try
        {
            FileOutputStream fout = new FileOutputStream(fnOutAndErr);

            MultiOutputStream multiOut = new MultiOutputStream(System.out, fout);

            PrintStream stdout = new PrintStream(multiOut);

            System.setOut(stdout);
            System.setErr(stdout);
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }

    }

}
131 |
--------------------------------------------------------------------------------
/src/main/java/ims/cs/util/NewStaticPrinter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of QSample.
3 | * QSample is free software: you can redistribute it and/or modify
4 | * it under the terms of the GNU General Public License as published by
5 | * the Free Software Foundation, either version 3 of the License, or
6 | * (at your option) any later version.
7 | *
8 | * QSample is distributed in the hope that it will be useful,
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | * GNU General Public License for more details.
12 | *
13 | * You should have received a copy of the GNU General Public License
14 | * along with QSample. If not, see <http://www.gnu.org/licenses/>.
15 | */
16 |
17 |
18 | package ims.cs.util;
19 |
20 | import ims.cs.lingdata.Document;
21 | import ims.cs.lingdata.Token;
22 | import ims.cs.qsample.spans.Span;
23 |
24 | import java.io.FileNotFoundException;
25 | import java.io.PrintWriter;
26 | import java.text.DateFormat;
27 | import java.text.SimpleDateFormat;
28 | import java.util.Date;
29 |
30 | /**
31 | * A static printer to easily log output. Used mostly for debugging purposes.
32 | * Created by scheibcn on 3/3/16.
33 | */
34 | public class NewStaticPrinter {
35 | // printer may be turned off
36 | public static boolean isOn = true;
37 |
38 | public static String fileRoot;
39 | public static String fileName;
40 | static PrintWriter writer;
41 |
42 | /**
43 | * Pass function to do nothing.
44 | */
45 | public static void pass() {}
46 |
47 | /**
48 | * Sets a log file name from the specified log file root
49 | * @param logFileName
50 | * @throws FileNotFoundException
51 | */
52 | public static void init(String logFileName) throws FileNotFoundException {
53 | fileRoot = logFileName;
54 | fileName = logFileName + ".debug";
55 | if (isOn) writer = new PrintWriter(fileName);
56 | }
57 |
58 |
59 | /**
60 | * Generates a log file name from the specified log file root
61 | * @param prefix
62 | * @return
63 | */
64 | public static String getLogFileName (String prefix) {
65 | DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd-HH:mm:ss");
66 | Date date = new Date();
67 |
68 | return prefix + dateFormat.format(date) + ".log";
69 | }
70 |
71 | /**
72 | * Print line to log file
73 | * @param s
74 | */
75 | public static void println(String s) {
76 | if (isOn) {
77 | writer.write(s);
78 | writer.write("\n");
79 | }
80 | }
81 |
82 | /**
83 | * Print to log file
84 | * @param s
85 | */
86 | public static void print(String s) {
87 | if (isOn) {
88 | writer.write(s);
89 | }
90 | }
91 |
92 | /**
93 | * Print n copies of s to the log file
94 | * @param s
95 | * @param n
96 | */
97 | public static void printN(String s, int n) {
98 | for (int i = 0; i < n; i++) print(s);
99 | println("");
100 | }
101 |
102 |
103 | /**
104 | * Print the perceptron predictions for the given document to the log file
105 | * @param document
106 | * @param prefix string to prepend for each line
107 | */
108 | public static void printPerceptronPrediction (Document document, String prefix) {
109 | for (Token token : document.getTokenList()) {
110 | StringBuilder line = new StringBuilder();
111 |
112 | // prefix
113 | line.append(prefix);
114 | line.append("\t");
115 |
116 | // add token information
117 | line.append(token.predText);
118 | line.append("\t");
119 |
120 |
121 | // gold information
122 | boolean goldBegin = token.startsGoldContentSpan();
123 | boolean goldEnd = token.endsGoldContentSpan();
124 | boolean goldCue = token.isGoldCue();
125 |
126 | if (goldBegin) line.append('B');
127 | else line.append('_');
128 |
129 | if (goldEnd) line.append('E');
130 | else line.append('_');
131 |
132 | if (goldCue) line.append('C');
133 | else line.append('_');
134 |
135 | line.append('\t');
136 |
137 | // predicted information
138 | if (token.perceptronBeginScore > 0) line.append('B');
139 | else line.append('_');
140 |
141 | if (token.perceptronEndScore > 0) line.append('E');
142 | else line.append('_');
143 |
144 | if (token.isPredictedCue) line.append('C');
145 | else line.append('_');
146 |
147 | line.append('\t');
148 |
149 | // scores
150 | line.append(token.perceptronBeginScore); line.append('\t');
151 | line.append(token.perceptronEndScore); line.append('\t');
152 | line.append(token.perceptronCueScore); line.append('\t');
153 | line.append('\t');
154 |
155 | // scores
156 | line.append(token.numTimesSampledBegin); line.append('\t');
157 | line.append(token.numTimesSampledEnd); line.append('\t');
158 | line.append(token.numTimesSampledCue); line.append('\t');
159 |
160 |
161 | println(line.toString());
162 | }
163 | }
164 |
165 | /**
166 | * Print document predictions and gold information using SGML-style tags
167 | * @param doc
168 | */
169 | public static void printAnnotatedDocument(Document doc) {
170 | StringBuilder sb = new StringBuilder();
171 | for (int i = 0; i < doc.tokenList.size(); i++) {
172 | if (Span.anyBeginsAt(doc.goldSpanSet, i)) sb.append("");
173 | if (Span.anyBeginsAt(doc.predictedSpanSet, i)) sb.append("");
174 | sb.append(doc.tokenList.get(i).predText);
175 | if (Span.anyEndsAt(doc.predictedSpanSet, i)) sb.append("");
176 | if (Span.anyEndsAt(doc.goldSpanSet, i)) sb.append("");
177 | sb.append(" ");
178 | }
179 |
180 | println(sb.toString());
181 | }
182 |
183 | }
184 |
--------------------------------------------------------------------------------