├── .gitignore
├── README.md
├── example
    └── documents
    │   ├── wikinews-20070223-train.txt
    │   ├── wikinews-20070306-drain.txt
    │   ├── wikinews-20160706-telescope.txt
    │   └── wikinews-20160714-UN.txt
├── licenses
    ├── CC-SA-3.0
    │   └── LICENSE.txt
    ├── LICENSE.md
    ├── gpl-3.0
    │   └── LICENSE.txt
    ├── verbnet-license-3.0
    │   └── LICENSE.txt
    └── wordnet-license
    │   └── LICENSE.txt
├── pom.xml
├── resources
    └── PARC
    │   ├── configs
    │       ├── acl2016.crf.prop
    │       ├── acl2016.greedy.prop
    │       ├── acl2016.sampling.prop
    │       ├── predpipeline.crf.prop
    │       ├── predpipeline.greedy.prop
    │       └── predpipeline.sampling.prop
    │   ├── listfeatures
    │       ├── attribution_nouns.txt
    │       ├── krestel_verbs.txt
    │       ├── organization.hyponyms.txt
    │       ├── person.hyponyms.txt
    │       ├── titles.txt
    │       └── verbnet.txt
    │   └── news.txt
└── src
    └── main
        └── java
            └── ims
                └── cs
                    ├── bbn
                        ├── BbnNeHandler.java
                        └── BbnNeParser.java
                    ├── corenlp
                        ├── DocumentAligner.java
                        ├── Helper.java
                        ├── IndexedWordIterator.java
                        ├── PARCCoreNlpPipeline.java
                        └── TokenAligner.java
                    ├── lingdata
                        ├── ByteCount.java
                        ├── Corpus.java
                        ├── Document.java
                        ├── DocumentId.java
                        ├── GornAddressList.java
                        ├── Partition.java
                        ├── PlainTextCorpus.java
                        ├── PlainTextDocId.java
                        ├── Sentence.java
                        ├── SentenceId.java
                        ├── Token.java
                        ├── Types.java
                        └── WSJId.java
                    ├── mallet
                        ├── DocumentFeatureSet2TokenSequence.java
                        └── PARCDocumentInstance.java
                    ├── parc
                        ├── PARCAttribution.java
                        ├── PARCCorpus.java
                        ├── ParcUtils.java
                        ├── ProcessedCorpus.java
                        ├── SpanLabelExtractor.java
                        └── xml
                        │   ├── PARCHandler.java
                        │   └── PARCParser.java
                    ├── qsample
                        ├── evaluate
                        │   ├── EvaluateClassifier.java
                        │   ├── EvaluateSpan.java
                        │   └── F1.java
                        ├── features
                        │   ├── Binning.java
                        │   ├── BoundaryFeatures.java
                        │   ├── FeatureExtraction.java
                        │   ├── FeatureIndexMap.java
                        │   ├── FeatureIntSet.java
                        │   ├── FeatureSet.java
                        │   ├── FeatureStringSet.java
                        │   ├── SpanFeatures.java
                        │   └── components
                        │   │   ├── DocumentOffsetConjunction.java
                        │   │   ├── DocumentQuotationFeature.java
                        │   │   ├── SentenceConstituentFeatures.java
                        │   │   ├── SentenceDependencyFeatures.java
                        │   │   ├── SentenceFeaturesDerivedFromListCue.java
                        │   │   ├── SentenceIndicatorFeatures.java
                        │   │   ├── TokenDictFeatures.java
                        │   │   ├── TokenLexicalFeatures.java
                        │   │   └── TokenListFeatures.java
                        ├── greedysample
                        │   ├── HasScore.java
                        │   ├── HeuristicSampler.java
                        │   ├── PerceptronSampler.java
                        │   └── Sampling.java
                        ├── models
                        │   ├── CrfClassifier.java
                        │   ├── HigherSpanModel.java
                        │   └── QuotationPerceptrons.java
                        ├── perceptron
                        │   ├── Perceptron.java
                        │   ├── PerceptronTrainer.java
                        │   └── Weights.java
                        ├── run
                        │   ├── Common.java
                        │   ├── PlainTextCorpusReader.java
                        │   ├── QSample.java
                        │   ├── RunCrf.java
                        │   ├── RunHeuristicTest.java
                        │   └── RunPerceptronSampler.java
                        └── spans
                        │   ├── Span.java
                        │   ├── SpanBegin.java
                        │   └── SpanEnd.java
                    └── util
                        ├── MultiOutputStream.java
                        ├── NewStaticPrinter.java
                        └── StaticConfig.java


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | 
 3 | # Mobile Tools for Java (J2ME)
 4 | .mtj.tmp/
 5 | 
 6 | # Package Files #
 7 | *.jar
 8 | *.war
 9 | *.ear
10 | 
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 | 
14 | # model binaries
15 | resources/PARC/models/*
16 | models.tar.gz
17 | 
18 | # local configuration files
19 | resources/PARC/configs/local/
20 | 
21 | # build output
22 | target/*
23 | 
24 | # tool output
25 | output
26 | 
27 | # intellij project
28 | *.iml
29 | *.idea
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | QSample
  2 | =======
  3 | 
  4 | QSample is a natural language processing tool for automatically
  5 | detecting quotations in text.
  6 | 
  7 | 
  8 | **Example:** In the sentence
  9 | 
 10 | > Witnesses said that several passengers have broken bones.
 11 | 
 12 | the span
 13 | 
 14 | > *that several passengers have broken bones*
 15 | 
 16 | is a quotation.
 17 | 
 18 | 
 19 | Requirements
 20 | ------------
 21 | 
 22 | Java JVM (>= 1.7) and Maven (>= 3.0.0) need to be installed. All other
 23 | dependencies will be downloaded automatically. The dependencies all
 24 | together will amount to ~250 MB. The trained model files take up another
 25 | ~80 MB.
 26 | 
 27 | 
 28 | Setup
 29 | --------
 30 | 
 31 | Install the tool by running the following commands (NOTE: this will trigger a
 32 | **~250 MB** Maven dependency download and will produce a .jar file of
 33 | comparable size):
 34 | 
 35 | 	git clone https://github.com/christianscheible/qsample.git
 36 | 	cd qsample
 37 | 	mvn compile
 38 | 	mvn package
 39 | 	
 40 | If the build was successful, you will find two .jar files in `target/`
 41 | (with and without dependencies, respectively).
 42 | 
 43 | Next, download and unpack the pre-trained models (**~80 MB**):
 44 | 
 45 | 	wget https://github.com/christianscheible/qsample/releases/download/0.1/models.tar.gz
 46 | 	tar xzfv models.tar.gz
 47 | 
 48 | 
 49 | Usage
 50 | -----
 51 | 
 52 | Now we are ready to detect quotations. As a first step, you can run the
 53 | tool on the example documents we provide in `example/documents`. The
 54 | expected format is a directory of plain text files, each containing a
 55 | single document. To process the documents, run the following command:
 56 | 
 57 | 	java -jar target/qsample-0.1-jar-with-dependencies.jar --sample example/documents/ output
 58 | 
 59 | QSample will produce several files in the output directory:
 60 | 
 61 | * `.log` file storing the messages that were also output to the command line
 62 | * `.conf` file documenting the configuration used by the tool for this run
 63 | * one `.quotations.gz` file for each document in the input directory
 64 |   containing the detected quotations
 65 | 
 66 | The `.quotations.gz` files contain the predictions made by the model. As
 67 | an example, take the following snippet:
 68 | 
 69 | 	Witnesses       230     239     O       O
 70 | 	said            240     244     O       C
 71 | 	that            245     249     O       B
 72 | 	several         250     257     O       I
 73 | 	passengers      258     268     O       I
 74 | 	have            269     273     O       I
 75 | 	broken          274     280     O       I
 76 | 	bones           281     286     O       E
 77 | 	.               286     287     O       O
 78 | 	
 79 | The output format consists of five columns. The first column contains
 80 | the tokens; the second and third columns contain the byte begin and end
 81 | positions of the tokens in the original input file; the fourth column
 82 | contains the gold labels (if there are any); the fifth column contains
 83 | the predicted quotes. The predictions are encoded using BIOE-style
 84 | labels. The label `C` marks the occurrence of a *cue*, and all words
 85 | between the `B` (begin) and `E` (end) tags are the *content* of the
 86 | quotation.
 87 | 
 88 | 
 89 | Data
 90 | ----
 91 | 
 92 | This repository includes the following data:
 93 | 
 94 | * `example/documents`: Four news articles from WikiNews for
 95 |   testing. QSample expects one plain text document per file. You can
 96 |   mark paragraph boundaries in the text by adding an empty line after
 97 |   each paragraph. Knowledge about paragraphs is useful for detecting
 98 |   quotations. Linguistic pre-processing is performed by Stanford
 99 |   CoreNLP.
100 | * `resources/PARC/configs`: Configuration files for running experiments
101 |   (see below). The `acl2016*` configurations use gold pre-processing,
102 |   whereas the `predpipeline*` configurations use CoreNLP processing. For
103 |   each setup, we supply one file for each of the methods used in the
104 |   paper.
105 | * `resources/PARC/listfeatures`: Word lists for extracting features. We
106 |   supply lists of attribution nouns and verbs, organizations and
107 |   persons, titles, as well as a mapping of verbs to VerbNet
108 |   classes. These lists were generated from third-party resources, see
109 |   `licenses/LICENSE.md`.
110 | * `resources/PARC/news.txt`: A list of the IDs of WSJ documents that are news articles.
111 | 
112 | 
113 | Running an experiment
114 | ---------------------
115 | 
116 | To run an experiment on annotated data, you need to obtain several
117 | resources:
118 | 
119 | * Penn Attribution Relations Corpus (PARC3, http://homepages.inf.ed.ac.uk/s1052974/resources.php)
120 | * Penn Treebank 2 (https://catalog.ldc.upenn.edu/LDC95T7)
121 | * BBN Pronoun Coreference and Entity Type Corpus (https://catalog.ldc.upenn.edu/LDC2005T33)
122 | 
123 | Afterwards, you can run experiments based on the configuration files in
124 | `resources/PARC/configs/`. To test the pre-trained models, you need to
125 | adapt the paths in the configuration files. To train a model, you can
126 | simply switch from `TEST` to `TRAIN` mode in the configuration.
127 | 
128 | 
129 | More information
130 | ----------------
131 | 
132 | For more information, refer to our paper (available at
133 | http://www.aclweb.org/anthology/P/P16/P16-1164.pdf):
134 | 
135 | 	@InProceedings{scheibleklingerpado2016,
136 | 		author    = {Scheible, Christian and Klinger, Roman and Pad\'{o}, Sebastian},
137 | 		title     = {Model Architectures for Quotation Detection},
138 | 		booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics},
139 | 		pages     = {1736-1745},
140 | 		year      = {2016}
141 | 	}
142 | 
143 | 	
144 | or check the tool's website at
145 | http://www.ims.uni-stuttgart.de/data/qsample for news.
146 | 
147 | 
148 | License
149 | -------
150 | 
151 | Please see `licenses/LICENSE.md`.
152 | 


--------------------------------------------------------------------------------
/example/documents/wikinews-20070223-train.txt:
--------------------------------------------------------------------------------
 1 | A 9 Carriage Class 390 Pendolino train, with as many as 180 people onboard, operated by Virgin Trains has derailed and crashed in Cumbria, England.
 2 | 
 3 | The train was the 17:15 service from London's Euston Station to Glasgow Central. Witnesses said that several passengers have broken bones. All but one carriage have slid down the embankment and all of the carriages were derailed. 5 people are still seriously injured in hospital, including the driver Ian Black, and 11 people altogether are still in hospital. Hospital reports early on Saturday morning indicate 1 death, 84 year old lady Margaret Masson from Glasgow.
 4 | 
 5 | "It's our understanding there are a number of people injured on the train. We think there are numerous injuries," said a spokesman for the Cumbria Fire and Rescue squad, Brian Mitchelhill.
 6 | 
 7 | "A train has crashed between Oxenholme and Tebay, but that is all we know at the moment. We have got two cars travelling there now and local police are attending," said a spokeswoman for the British Transport Police.
 8 | 
 9 | At least twelve ambulances and at least five fire trucks are at the scene, along with 3 Royal Air Force Sea King helicopters, two mountain rescue teams and one police helicopter. The rescue effort was hampered by limited access to the remote site and poor weather conditions.
10 | 
11 | "At the moment, we have reports of various injuries, from leg injuries to back injuries and head injuries -- ranging from minor to quite serious," said an unnamed ambulance official.
12 | 
13 | Virgin West Coast Class 390 Pendolino EMU 390009 at platform 1 of Carlisle railway station.
14 | "The carriage I am in is completely on its side, it did a sort of bump - and I was thinking don't worry this fine - but then the swaying became very dramatic," said a BBC executive, Caroline Thomson. "Where I am there are some injuries - a woman with a very badly hurt back but I suspect further up the train it's a bit more serious."
15 | 
16 | First reports suggested that the train hit something on the track at 20:15 near Grayrigg, between Oxenholme and Tebay, derailed and slid down an embankment. Early investigators reports say a set of points will be the primary focus of the investigation. All but one of the train's carriages are said to have totally come off the tracks, a spokeswoman said.
17 | 
18 | So far 12 passengers have been taken to The Royal Preston Hospital, where three casualties are said to be in a "critical condition" and one serious. 3 passengers with limb injuries are at the Royal Lancaster Infirmary. People with minor injuries were taken to a nearby farm before being transferred to a Kendal hotel.
19 | 
20 | As of 22:16, Sleeper services over the affected line have been suspended. Other major services are terminating at Preston or Carlisle according to (UK) National Rail Enquires. A five mile cordon has been set up to seal the crash site as investigators from the Rail Accident Investigation Branch attempt to determine the cause of the crash.
21 | 
22 | Virgin owner Richard Branson came to the site from his holiday in Switzerland and held a press conference at a safe distance, roughly 200 meters, from the site. Branson stated that the Virgin Train Pendolino was "built like a tank", and believed the track was to blame. All of the carriages retained their integrity and none of the windows broke. Mr Branson also praised the driver of the train for attempting to stop the train and not leaving the cockpit.
23 | 
24 | Network Rail has checked 700 sets of rail points in response to the accident, though no speed restrictions have been imposed.
25 | 


--------------------------------------------------------------------------------
/example/documents/wikinews-20070306-drain.txt:
--------------------------------------------------------------------------------
 1 | A rupture in the underground stormwater drain system opened a huge sinkhole on February 23, killing three people and bringing down twelve houses in Guatemala City.
 2 | 
 3 | Teenagers Irma and David Soyos and their father, 53-year old Domingo Soyos were killed when their house collapsed into the sinkhole. Nearly a thousand people were evacuated from the San Antonio neighborhood after the collapse.
 4 | 
 5 | Wikinews interviewed Eric Haddox, a civil engineer who has visited the site of the sinkhole and spoken to the engineers working on fixing the drain. Mr. Haddox, who specialises in the building of earthworks, roads, water supply and sewage systems, and is working as a missionary in Guatemala, visited the site following the collapse to help in the recovery effort.
 6 | 
 7 | Mr. Haddox told us that the size of the hole is much smaller than the 330 feet depth originally reported and that the erosion causing the collapse is believed to have happened over a long time, and not just during the recent rains as initially suspected.
 8 | 
 9 | There are also concerns that a four-story building less than a metre from the edge of the hole may collapse as the earth under the building continues to be eroded.
10 | 
11 | Before the collapse, a junction box linked two collector pipes to a 3.5m main pipe leading to a nearby canyon in a system believed to be 20 to 50 years old. The surrounding earth had been filled in artificially to level the ground, but the fill was not well compacted before being built upon. Such leveling of the ground is widespread in Guatemala city.
12 | 
13 | It is thought that, at some point in the last 20 years, either one of the collector pipes ruptured or was detached from the junction box, possibly because of seismic activity. Water gushing out of the break following rainstorms gradually eroded the loosely compacted soil, creating an expanding cavern around the junction box. On February 23, the roof of this cavern collapsed, creating the sinkhole, 20m wide at the top and tapering out towards the bottom, which is about 60m (204 feet) deep, not 330 feet as originally reported.
14 | 
15 | "Things like this don't happen often and there are many interesting engineering lessons to be learned with them", Mr. Haddox said.
16 | 
17 | The sinkhole has continued to expand even after the collapse, since the collector pipes continue to carry water, which cascades 15m down the sinkhole to the main pipe, further eroding the sides of the sinkhole. The hole was about 25m wide at the top and 40m wide at the bottom a week ago.
18 | 
19 | A bypass pipe is being laid to divert the water away from the junction to arrest further erosion. The sinkhole will then have to be drained before repair work can begin.
20 | 
21 | Authorities are also concerned that similar breakages and undermining may be happening at other locations, Mr. Haddox said. Muddy water has been seen coming out of the main collector pipes, but it is not certain whether this is due to ruptures elsewhere or simply mud from the surface that has been washed into the drainage system.
22 | 
23 | 


--------------------------------------------------------------------------------
/example/documents/wikinews-20160706-telescope.txt:
--------------------------------------------------------------------------------
 1 | On Sunday, China announced the attachment of the final panel to its telescope named Five hundred meter Aperture Spherical Telescope (FAST). This piece marks the end of a five-year-long US$180 million (CNY¥1.2 billion) construction project.
 2 | 
 3 | FAST comprises about 4,500 panels and spans a diameter of 500 meters (about 1640 feet). The telescope is part of a series of ventures into space exploration by China, including planning another robotic Moon mission and creating a Chinese space station, with its core module set to be launched into space in 2018. With the country's founding centenary coming in 2049, Chinese President Xi Jinping said during a Beijing conference, "great scientific and technological capacity is a must for China to be strong".
 4 | 
 5 | In order to achieve optimal electromagnetic performance for FAST with minimal signal interference, it was built in the South China Karst. This ultimately forced the relocation of about 9,100 inhabitants within a 3.1-mile (5km) radius of the telescope. The residents received about US$1,800 (CNY¥12,000) in reimbursement, with those experiencing difficulties with housing receiving about US$1,500 (CNY¥10,000) in extra compensation. The Chinese government supports the resettlement, with senior party official Li Yuecheng saying the relocation would provide a "sound electromagnetic wave environment".
 6 | 
 7 | The telescope is now the largest-diameter single-dish radio telescope. It took the spot from the 305-meter diameter Arecibo Observatory telescope in Puerto Rico. Russia's RATAN-600 multi-element radio telescope has a diameter of 576 meters. This adds to China's record-defying achievements; it contains the world's largest bridge and the world's longest wall, the Great Wall of China.
 8 | 
 9 | The telescope is set to be ready for use in September. Its possible uses include exploration for pulsars, a special type of neutron stars detected through their emission of radio pulses. Scientists have also described the telescope's potential to explore alien civilization, with NAO Radio Astronomy Technology Laboratory director Peng Bo saying FAST's "potential to discover an alien civilization will be 5 to 10 times that of current equipment, as it can see farther and darker planets".
10 | 
11 | 


--------------------------------------------------------------------------------
/example/documents/wikinews-20160714-UN.txt:
--------------------------------------------------------------------------------
 1 | On Tuesday, a United Nations (UN) tribunal in The Hague dismissed China's sovereignty claims to the South China Sea, a body of water connecting to the Pacific Ocean which is also bordered by the Philippines, Vietnam, Brunei, Malaysia, Taiwan, and Indonesia. Court battles over the claims between China and the Philippines go back to 2013.
 2 | 
 3 | These claims were established by China during the reign of its Nationalist government in the 1940s, marked by a demarcation line nicknamed its Nine-dash line. Its line stretched hundreds of miles from the Chinese mainland, including about nine tenths of the entire sea. The South China Sea is a valuable property, providing passage for about US$5 trillion in trade by planes and boats every year. China is not the only country to claim large parts of the sea; notably, Taiwan and Vietnam have also done so, but other large-scale claimants have been less militarily active about their claims than China.
 4 | 
 5 | China has built several artificial islands and military bases in the South China Sea. The tribunal scolded the impeding of fishing and exploration in the sea by China, which it deemed against the United Nations Convention on the Law of the Sea (UNCLOS), signed by China in 1982. The tribunal also concluded China knowingly permitted the poaching of endangered turtles and clams as well as destroyed coral reefs to construct artificial islands.
 6 | 
 7 | UNCLOS permits countries to claim a 200-nautical mile area from their mainland, referred to as an exclusive economic zone. It also permits freedom of navigation, allowing unimpeded exploration through "high seas": international waters also available for the use of fishing and trade passages.
 8 | 
 9 | There is no process to enforce the decision. UNCLOS allows countries to exclude themselves from "compulsory binding procedures for the settlement of disputes" as defined in Part XV, Section 3 - Article 298. China exercised this right to exclude themselves from compulsory binding procedures on August 25, 2006. They reject the jurisdiction or authority of the tribunal's findings. Various other countries have also exercised Article 298 partially or fully, such as Australia, Canada, the UK, Russia, and France.
10 | 
11 | Many nations made statements after the decision. The Chinese government opposed the decision, calling it "ill-founded". It said "China neither accepts nor recognizes" the decision. The Philippine government referred to the decision as a "milestone decision". The US, a key ally with many of the countries claiming parts of the sea, said it was an "important contribution to the shared goal of a peaceful resolution to disputes in the South China Sea".
12 | 


--------------------------------------------------------------------------------
/licenses/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Code
 2 | ====
 3 | 
 4 | Our code is, unless otherwise specified below, subject to the GPL 3.0 license (`gpl-3.0/`).
 5 | 
 6 | `MultiOutputStream` is based on code from
 7 | http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to
 8 | (CodeProject license, http://www.codeproject.com/info/cpol10.aspx)
 9 | 
10 | 
11 | Resources
12 | =========
13 | 
14 | `resources/PARC/listfeatures/`: Lists and dictionaries for feature extraction
15 | 
16 | * `attribution_nouns.txt`: List of attribution nouns published by Pareti (2015).
17 | * `krestel_verbs.txt`: List of attribution verbs published by Krestel et al. (2008).
18 | * `organization.hyponyms.txt`, `person.hyponyms.txt`: Lists of persons and organizations, extracted from WordNet (WordNet license, `wordnet-license/LICENSE.txt`)
19 | * `titles.txt`: List of titles collected from Wikipedia page https://en.wikipedia.org/wiki/Title (CC-SA license, `CC-SA-3.0/LICENSE.txt`, http://creativecommons.org/licenses/by-sa/3.0/)
20 | * `verbnet.txt`: VerbNet category mappings (VerbNet license, `verbnet-license-3.0/LICENSE.txt`)
21 | 
22 | `resources/PARC/news.txt`: List of WSJ news articles by http://www.let.rug.nl/~bplank/metadata/genre_files_updated.html
23 | 
24 | `example/documents`: Four news documents from WikiNews (CC-SA license, `CC-SA-3.0/LICENSE.txt`)
25 | 


--------------------------------------------------------------------------------
/licenses/verbnet-license-3.0/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |             VerbNet 3.0 License (also applies to VerbNet 3.X versions)
 3 | 
 4 | This software and database is being provided to you, the LICENSEE, by
 5 | the University of Colorado under the following license.  By obtaining, using
 6 | and/or copying this software and database, you agree that you have
 7 | read, understood, and will comply with these terms and conditions.:
 8 | 
 9 | Permission to use, copy, modify and distribute this software and
10 | database and its documentation for any purpose and without fee or
11 | royalty is hereby granted, provided that you agree to comply with
12 | the following copyright notice and statements, including the disclaimer,
13 | and that the same appear on ALL copies of the software, database and
14 | documentation, including modifications that you make for internal
15 | use or for distribution.
16 | 
17 | VerbNet 3.0 (or 3.X) Copyright 2009 by University of Colorado.  All rights reserved.
18 | 
19 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND THE UNIVERSITY 
20 | OF COLORADO  MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
21 | IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, UNIVERSITY 
22 | OF COLORADO MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
23 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
24 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
25 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
26 | OTHER RIGHTS.
27 | 
28 | The name of University of Colorado or CU may not be used in
29 | advertising or publicity pertaining to distribution of the software
30 | and/or database.  Title to copyright in this software, database and
31 | any associated documentation shall at all times remain with
32 | University of Colorado and LICENSEE agrees to preserve same. 
33 | 
34 | Please reference the following document(s) in any description of 
35 | applications based on VerbNet 3.0 or 3.X: 
36 | 
37 | Karin Kipper, Anna Korhonen, Neville Ryant, Martha Palmer, 
38 | A Large-scale Classification of English Verbs, 
39 | Language Resources and Evaluation Journal, 42(1), pp. 21-40, 
40 | Springer Netherland, 2008.
41 | 
42 | and/or
43 | 
44 | Karin Kipper Schuler, Anna Korhonen, Susan W. Brown, VerbNet overview,
45 | extensions, mappings and apps, Tutorial, NAACL-HLT 2009, Boulder,
46 | Colorado.
47 | 


--------------------------------------------------------------------------------
/licenses/wordnet-license/LICENSE.txt:
--------------------------------------------------------------------------------
1 | WordNet Release 3.0
2 | 
3 | This software and database is being provided to you, the LICENSEE, by Princeton University under the following license. By obtaining, using and/or copying this software and database, you agree that you have read, understood, and will comply with these terms and conditions.: Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty is hereby granted, provided that you agree to comply with the following copyright notice and statements, including the disclaimer, and that the same appear on ALL copies of the software, database and documentation, including modifications that you make for internal use or for distribution. WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton University or Princeton may not be used in advertising or publicity pertaining to distribution of the software and/or database. Title to copyright in this software, database and any associated documentation shall at all times remain with Princeton University and LICENSEE agrees to preserve same.
4 | 
5 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  2 | 
  3 |   <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |   <name>QSample</name>
  6 |   <groupId>ims.cs</groupId>
  7 |   <artifactId>qsample</artifactId>
  8 |   <version>0.1</version>
  9 |   <inceptionYear>2016</inceptionYear>
 10 |   <properties>
 11 |     <java.version>1.7</java.version>
 12 |     <junit.version>4.10</junit.version>
 13 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 14 |   </properties>
 15 |   <organization>
 16 |     <name>IMS, University of Stuttgart, Germany</name>
 17 |     <url>http://www.ims.uni-stuttgart.de/~scheibcn</url>
 18 |   </organization>
 19 | 
 20 |   
 21 |   <dependencies>
 22 | 
 23 |     <dependency>
 24 |       <groupId>junit</groupId>
 25 |       <artifactId>junit</artifactId>
 26 |       <version>${junit.version}</version>
 27 |     </dependency>
 28 | 
 29 |     <dependency>
 30 |       <groupId>edu.stanford.nlp</groupId>
 31 |       <artifactId>stanford-corenlp</artifactId>
 32 |       <version>3.9.2</version>
 33 |     </dependency>
 34 | 
 35 |     <dependency>
 36 |       <groupId>edu.stanford.nlp</groupId>
 37 |       <artifactId>stanford-corenlp</artifactId>
 38 |       <version>3.9.2</version>
 39 |       <classifier>models</classifier>
 40 |     </dependency>
 41 | 
 42 |     <dependency>
 43 |       <groupId>cc.mallet</groupId>
 44 |       <artifactId>mallet</artifactId>
 45 |       <version>2.0.7</version>
 46 |     </dependency>
 47 | 
 48 | 
 49 |     <dependency>
 50 |       <groupId>org.apache.commons</groupId>
 51 |       <artifactId>commons-lang3</artifactId>
 52 |       <version>3.0</version>
 53 |     </dependency>
 54 | 
 55 | 
 56 |     <dependency>
 57 |       <groupId>net.sf.jgrapht</groupId>
 58 |       <artifactId>jgrapht</artifactId>
 59 |       <version>0.8.3</version>
 60 |     </dependency>
 61 | 
 62 |   </dependencies>
 63 | 
 64 |   <build>
 65 |     <sourceDirectory>src/main/java</sourceDirectory>
 66 |     <plugins>
 67 | 
 68 |         <plugin>
 69 |             <groupId>org.apache.maven.plugins</groupId>
 70 |             <artifactId>maven-compiler-plugin</artifactId>
 71 |             <version>3.5.1</version>
 72 |             <configuration>
 73 |               <source>1.7</source>
 74 |               <target>1.7</target>
 75 |             </configuration>
 76 |         </plugin>
 77 | 
 78 | 
 79 |       <!-- Assembles the jar and other release formats (tarball, etc.). -->
 80 |       <plugin>
 81 |         <artifactId>maven-assembly-plugin</artifactId>
 82 |           <version>2.6</version>
 83 |           <executions>
 84 |           <execution>
 85 |             <id>make-assembly</id>
 86 |             <phase>package</phase>
 87 |             <goals>
 88 |               <goal>attached</goal>
 89 |             </goals>
 90 |           </execution>
 91 |         </executions>
 92 |         <configuration>
 93 | 	  <archive>
 94 | 	    <manifest>
 95 | 	      <mainClass>ims.cs.qsample.run.QSample</mainClass>
 96 | 	    </manifest>
 97 | 	  </archive>
 98 | 	  <descriptorRefs>
 99 | 	    <descriptorRef>jar-with-dependencies</descriptorRef>
100 | 	  </descriptorRefs>
101 |         </configuration>
102 |       </plugin>
103 | 
104 |         <plugin>
105 |             <groupId>org.apache.maven.plugins</groupId>
106 |             <artifactId>maven-jar-plugin</artifactId>
107 |             <version>3.0.0</version>
108 |             <executions>
109 |                 <execution>
110 |                     <id>make-assembly</id>
111 |                     <phase>package</phase>
112 |                 </execution>
113 |             </executions>
114 |             <configuration>
115 |                 <excludes>
116 |                     <exclude>**/log4j.properties</exclude>
117 |                 </excludes>
118 |                 <archive>
119 |                     <manifest>
120 |                         <mainClass>ims.cs.qsample.run.QSample</mainClass>
121 |                     </manifest>
122 |                 </archive>
123 |             </configuration>
124 |         </plugin>
125 |         
126 |       <plugin>
127 |         <artifactId>maven-release-plugin</artifactId>
128 |         <version>2.1</version>
129 |         <configuration>
130 |           <!-- During release:perform, enable the "release" profile -->
131 |           <releaseProfiles>release</releaseProfiles>
132 |           <goals>deploy package</goals>
133 |         </configuration>
134 |       </plugin>
135 |       
136 |     </plugins>
137 |   </build>
138 | </project>
139 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.crf.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 CRF results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=CRF
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.greedy.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 greedy results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=GREEDY
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/acl2016.sampling.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 sampling results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=true
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=false
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=SAMPLE
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=true
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/acl2016.goldtok.crfmodel
50 | perceptronModelFile=resources/PARC/models/acl2016.goldtok.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.crf.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 CRF results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=CRF
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.greedy.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 greedy results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=GREEDY
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/configs/predpipeline.sampling.prop:
--------------------------------------------------------------------------------
 1 | # Properties file to replicate the ACL 2016 sampling results
 2 | # To run this, please set the following paths first:
 3 | #   path for writing output
 4 | outputDirectory=/path/to/output
 5 | #   path to PARC3
 6 | parcRoot=/path/to/PARC3_complete
 7 | #   path to PTB raw data
 8 | pdtbWsjRawDirectory=/path/to/treebank2/raw/wsj/
 9 | #   path to BBN named entities
10 | bbnPath=/path/to/bbn-pcet/data/WSJtypes-subtypes/
11 | #   path for caching CoreNLP output (optional, see below)
12 | coreNlpOutputDirectory=/tmp/
13 | 
14 | # you may optionally change the following settings
15 | #   switch this to true if you want a lot of debug output
16 | verbose=false
17 | #   switch this to TRAIN if you also want to train the model
18 | cliMode=TEST
19 | #   switch this on to cache CoreNLP output in a file
20 | cacheParses=false
21 | #   switch off if you want all outputs in one file
22 | oneFilePerInput=true
23 | 
24 | # the following settings need to remain unchanged
25 | # Training options
26 | modelForTextFileMode=SAMPLE
27 | quotationTypes=DIM
28 | jackknifing=false
29 | outerIter=30
30 | innerIter=50
31 | predictionIter=1000
32 | maxNumTrials=10
33 | predictEvery=10
34 | maxCueDistanceSampling=30
35 | maxCueDistanceHeuristic=30
36 | maxLengthSampling=75
37 | maxLengthHeuristic=50
38 | cueMargin=25.0
39 | beginMargin=25.0
40 | endMargin=25.0
41 | samplerMarginPositive=15
42 | cueTemperature=10.0
43 | beginTemperature=10.0
44 | endTemperature=10.0
45 | useGoldPreprocessing=false
46 | 
47 | # Paths
48 | inputDirectory=*NOT USED IN EXPERIMENT MODE*
49 | crfModelFile=resources/PARC/models/predpipeline.crfmodel
50 | perceptronModelFile=resources/PARC/models/predpipeline.models
51 | 
52 | # Features
53 | dependencyParentRelHead=true
54 | constituentLevel=true
55 | lexicalLemma=false
56 | sentenceHasQuote=true
57 | constituentBinningStacked=false
58 | sentenceHasCue=false
59 | sentenceLength=true
60 | dependencyChildRel=true
61 | lexicalPos=false
62 | sentenceLengthBinningStacked=false
63 | dependencyChildRelHead=true
64 | constituentParent=true
65 | dependencyFeatures=true
66 | lexicalWindowSize=5
67 | documentQuotationFeature=true
68 | sentenceHasNe=true
69 | samplerMarginNegative=1
70 | constituentLeftmost=true
71 | flattenQuotes=true
72 | useBioeTags=true
73 | constituentAncestorL=true
74 | sentenceHasPronoun=true
75 | lexicalToken=false
76 | sentenceLengthBinning=false
77 | dependencyCueDependent=true
78 | constituentGoverning=true
79 | constituentBinning=false
80 | dependencyParentRel=true
81 | constituentFeatures=true
82 | lexicalBigram=false
83 | documentOffsetConjunction=true
84 | 
85 | 


--------------------------------------------------------------------------------
/resources/PARC/listfeatures/attribution_nouns.txt:
--------------------------------------------------------------------------------
  1 | accord
  2 | bill
  3 | counterclaim
  4 | document
  5 | formulation
  6 | according
  7 | call
  8 | criticism
  9 | doubt
 10 | guess
 11 | accusation
 12 | challenge
 13 | critic
 14 | effort
 15 | highlight
 16 | acknowledgement
 17 | charge
 18 | cry
 19 | elaboration
 20 | hint
 21 | ad
 22 | chart
 23 | data
 24 | encouragement
 25 | hope
 26 | admission
 27 | citation
 28 | decision
 29 | eruption
 30 | idea
 31 | advice
 32 | claim
 33 | declaration
 34 | estimate
 35 | illustration
 36 | agreement
 37 | command
 38 | deduction
 39 | eulogy
 40 | implication
 41 | allegation
 42 | comment
 43 | defence
 44 | evidence
 45 | imposition
 46 | amendment
 47 | commercial
 48 | definition
 49 | exclamation
 50 | indication
 51 | announcement
 52 | complaint
 53 | deliberation
 54 | expectation
 55 | information
 56 | answer
 57 | concern
 58 | demand
 59 | explanation
 60 | insinuation
 61 | anticipation
 62 | concession
 63 | denial
 64 | expression
 65 | inspiration
 66 | argument
 67 | conclusion
 68 | depiction
 69 | fear
 70 | instruction
 71 | article
 72 | condition
 73 | description
 74 | feeling
 75 | intention
 76 | assertion
 77 | confession
 78 | dictate
 79 | file
 80 | interjection
 81 | assumption
 82 | confidence
 83 | disappointment
 84 | filing
 85 | interpretation
 86 | assurance
 87 | confirmation
 88 | disapproval
 89 | find
 90 | issue
 91 | belief
 92 | consideration
 93 | disclosure
 94 | finding
 95 | joke
 96 | bet
 97 | contention
 98 | discovery
 99 | figure
100 | knowledge
101 | book
102 | convinction
103 | dispute
104 | forecast
105 | lament
106 | laugh
107 | offer
108 | question
109 | response
110 | support
111 | law
112 | opinion
113 | quotation
114 | revelation
115 | supposition
116 | lawsuit
117 | order
118 | realization
119 | rule
120 | survey
121 | lecture
122 | pact
123 | reason
124 | rumor
125 | suspicion
126 | legislation
127 | paper
128 | recognition
129 | saying
130 | talk
131 | lesson
132 | permission
133 | recollection
134 | scream
135 | temptation
136 | letter
137 | plan
138 | recommendation
139 | shout
140 | testimony
141 | list
142 | pledge
143 | recount
144 | sigh
145 | theory
146 | menace
147 | point
148 | reflection
149 | sign
150 | thought
151 | mention
152 | policy
153 | reform
154 | signal
155 | threat
156 | message
157 | poll
158 | refusal
159 | snort
160 | understandment
161 | mind
162 | praise
163 | rejection
164 | specification
165 | urge
166 | moan
167 | prediction
168 | remark
169 | speculation
170 | view
171 | need
172 | press
173 | repetition
174 | spell
175 | voice
176 | news
177 | proclamation
178 | reply
179 | statement
180 | want
181 | note
182 | project
183 | report
184 | statistic
185 | warning
186 | notice
187 | promise
188 | reproach
189 | story
190 | wisdom
191 | notification
192 | proposal
193 | request
194 | strategy
195 | worry
196 | oath
197 | protest
198 | requirement
199 | study
200 | yell
201 | objection
202 | prove
203 | research
204 | suggestion
205 | observation
206 | provision
207 | resentment
208 | suit


--------------------------------------------------------------------------------
/resources/PARC/listfeatures/krestel_verbs.txt:
--------------------------------------------------------------------------------
 1 | according
 2 | accuse
 3 | acknowledge
 4 | add
 5 | admit
 6 | agree
 7 | allege
 8 | announce
 9 | argue
10 | assert
11 | believe
12 | blame
13 | charge
14 | cite
15 | claim
16 | complain
17 | concede
18 | conclude
19 | confirm
20 | contend
21 | criticize
22 | declare
23 | decline
24 | deny
25 | describe
26 | disagree
27 | disclose
28 | estimate
29 | explain
30 | fear
31 | hope
32 | insist
33 | maintain
34 | mention
35 | note
36 | order
37 | predict
38 | promise
39 | recall
40 | recommend
41 | reply
42 | report
43 | say
44 | state
45 | stress
46 | suggest
47 | tell
48 | testify
49 | think
50 | urge
51 | warn
52 | worry
53 | write
54 | observe


--------------------------------------------------------------------------------
/resources/PARC/listfeatures/titles.txt:
--------------------------------------------------------------------------------
  1 | Mr
  2 | Mrs
  3 | Ms
  4 | Mr.
  5 | Mrs.
  6 | Ms.
  7 | Miss
  8 | Mister
  9 | Madam
 10 | Hon. 
 11 | MP
 12 | MYP
 13 | Representative
 14 | Senator
 15 | Speaker
 16 | President
 17 | Councillor
 18 | Alderman
 19 | Selectman
 20 | Delegate
 21 | Mayor
 22 | Lady
 23 | Mayoress
 24 | Lord
 25 | Governor
 26 | Lieutenant
 27 | Prefect
 28 | Prelate
 29 | Premier
 30 | Burgess
 31 | Ambassador
 32 | Envoy
 33 | Secretary
 34 | Cardinal
 35 | Attaché
 36 | Chargé
 37 | Provost
 38 | Prince
 39 | Princess
 40 | Archduke
 41 | Archduchess
 42 | Duke
 43 | Duchess
 44 | Marquis
 45 | Marquess
 46 | Marquise
 47 | Marchioness
 48 | Count
 49 | Countess
 50 | Earl
 51 | Viscount
 52 | Viscountess
 53 | Baron
 54 | Baroness
 55 | Emperor
 56 | Empress
 57 | King
 58 | Queen
 59 | Tsar
 60 | Tsarina
 61 | Leader
 62 | Pope
 63 | Sir
 64 | Dame
 65 | Advocate
 66 | Attorney
 67 | Bailiff
 68 | Barrister
 69 | Chancellor
 70 | Judge
 71 | Justice
 72 | Clerk
 73 | Magistrate
 74 | Promagistrate
 75 | Mufti
 76 | Grand Mufti
 77 | Privy
 78 | Counsellor
 79 | Majesty
 80 | Solicitor
 81 | Abbess
 82 | Abbot
 83 | Brother
 84 | Sister
 85 | Mother
 86 | Superior
 87 | Friar
 88 | Bishop
 89 | Archbishop
 90 | Metropolitan
 91 | Presbyter
 92 | Priest
 93 | Priestess
 94 | Father
 95 | Fr.
 96 | Patriarch
 97 | Pope
 98 | Catholicos
 99 | Vicar
100 | Chaplain
101 | Canon
102 | Pastor
103 | Prelate
104 | Primate
105 | Dom
106 | Cardinal
107 | Venerable
108 | Blessed
109 | Saint
110 | Christ
111 | Deacon
112 | Archdeacon
113 | Acolyte
114 | Dean
115 | Elder
116 | Minister
117 | Monsignor
118 | Reader
119 | Almoner
120 | Dr.
121 | Dr
122 | MD
123 | PhD
124 | EdD
125 | PharmD
126 | LLD
127 | JD
128 | Prof
129 | Prof.
130 | Professor
131 | Colonel
132 | General
133 | Commodore
134 | Corporal
135 | Mate
136 | Sergeant
137 | Admiral
138 | Brigadier
139 | Captain
140 | Commander
141 | General
142 | Officer
143 | Lieutenant
144 | Major
145 | Private
146 | Constable
147 | Agent
148 | Principal
149 | Nurse
150 | Nanny
151 | Coach
152 | Wizard
153 | Chief
154 | Scout
155 | Lama
156 | Dalai
157 | Panchen
158 | Druid
159 | Archdruid
160 | Rabbi
161 | Rebbe
162 | Hakham
163 | Buddha
164 | Ayatollah
165 | Imam
166 | Bodhisattva
167 | Mullah
168 | Kohen
169 | Nat
170 | Mahdi
171 | Rosh
172 | HaYeshiva
173 | Saoshyant
174 | Tirthankar
175 | Vardapet
176 | Mahatma
177 | Pandit
178 | Swami
179 | Ustad
180 | Sheikh
181 | Emir
182 | Emira
183 | Sultan
184 | Sultana
185 | Maharajah
186 | Maharani
187 | Eze
188 | Mwami
189 | Nizam
190 | Dato
191 | Oba
192 | Tor
193 | Tiv
194 | Obi
195 | Elder
196 | Vizier
197 | Grand


--------------------------------------------------------------------------------
/src/main/java/ims/cs/bbn/BbnNeHandler.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.bbn;
 19 | 
 20 | import org.xml.sax.Attributes;
 21 | import org.xml.sax.SAXException;
 22 | import org.xml.sax.helpers.DefaultHandler;
 23 | 
 24 | import java.util.ArrayList;
 25 | import java.util.HashMap;
 26 | import java.util.List;
 27 | import java.util.Map;
 28 | 
 29 | /**
 30 |  * XML handler to process named entity information from the BBN dataset.
 31 |  */
 32 | public class BbnNeHandler extends DefaultHandler {
 33 | 
 34 | 	StringBuffer accumulator = new StringBuffer();   /* Accumulate parsed text */
 35 | 	List<String> tags;
 36 | 	Map<String, List<String>> tagMap = new HashMap<>();
 37 | 	String currentTag;
 38 | 	String fileNo;
 39 | 	boolean tagPreceded = false;
 40 | 	boolean disableNextTag = false;
 41 | 	private String docNo;
 42 | 
 43 | 
 44 | 	public void characters(char[] buffer, int start, int length) {
 45 | 		accumulator.append(buffer, start, length);
 46 | 	}
 47 | 
 48 | 
 49 | 	@Override
 50 |     public void startDocument() throws SAXException {
 51 | 	}
 52 | 
 53 | 	@Override
 54 |     public void endDocument() throws SAXException {
 55 |     }
 56 | 
 57 | 	/**
 58 | 	 * Returns all currently unprocessed text read so far
 59 | 	 * @return
 60 | 	 */
 61 | 	public String popText() {
 62 | 		String text = accumulator.toString();
 63 | 		accumulator.setLength(0);
 64 | 		return text;
 65 | 	}
 66 | 
 67 | 	/**
 68 | 	 * Counts number of spaces. Double spaces are conflated.
 69 | 	 * @param s
 70 | 	 * @return
 71 | 	 */
 72 | 	public int numSpaces(String s) {
 73 | 		int numSpaces = 0;
 74 | 		boolean prevIsWhitespace = false;
 75 | 
 76 | 		for (int i = 0; i < s.length(); i++) {
 77 | 			if (Character.isWhitespace(s.charAt(i))) {
 78 | 				if (!prevIsWhitespace)
 79 | 					numSpaces++;
 80 | 				prevIsWhitespace = true;
 81 | 			} else {
 82 | 				prevIsWhitespace = false;
 83 | 			}
 84 | 		}
 85 | 
 86 | 		return numSpaces;
 87 | 	}
 88 | 
 89 | 	/**
 90 | 	 * Counts number of words.
 91 | 	 * @param s
 92 | 	 * @return
 93 | 	 */
 94 | 	public int numWords(String s) {
 95 | 		int numWords;
 96 | 		if (s.equals("")) {
 97 | 			numWords = 0;
 98 | 		} else {
 99 | 			numWords = numSpaces(s) + 1;
100 | 		}
101 | 		return numWords;
102 | 	}
103 | 
104 | 	@Override
105 | 	public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
106 | 
107 | 		if (qName.equals("DOC")) {    /* document starts, reset accumulator */
108 | 			accumulator.setLength(0);
109 | 			tags = new ArrayList<>();
110 | 		} else if (qName.endsWith("EX")) {  /* NE tag starts */
111 | 			String text = popText();
112 | 			String trimText = text.trim();
113 | 
114 | 			// count words to align with the tokenized text
115 | 			int numWords = numWords(trimText);
116 | 
117 | 
118 | 			// adjust word counters in case of mid-word tags
119 | 			if (tagPreceded && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))) numWords--;
120 | 			if (text.length() == 0 || !Character.isWhitespace(text.charAt(text.length()-1))) numWords--;
121 | 
122 | 			if (trimText.length() > 0 && numWords < 0) {
123 | 				disableNextTag = true;
124 | 			}
125 | 
126 | 			// pad with outside tags
127 | 			for (int i = 0; i < numWords; i++) tags.add("O");
128 | 			currentTag = atts.getValue("TYPE");
129 | 		} else if (qName.equals("DOCNO")) {   /* new document, reset accumulator (to be sure) */
130 | 			accumulator.setLength(0);
131 | 		}
132 | 	}
133 | 
134 | 	@Override
135 | 	public void endElement(String uri, String localName, String qName) {
136 | 		if (qName.equals("DOC")) {   //* document ends */
137 | 			String text = popText();
138 | 			String trimText = text.trim();
139 | 			int numWords = numWords(trimText);
140 | 
141 | 			// adjust word counters in case of mid-word tags
142 | 			if (tagPreceded && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))) numWords--;
143 | 
144 | 			// pad with outside tags
145 | 			for (int i = 0; i < numWords; i++) tags.add("O");
146 | 
147 | 			// store annotation
148 | 			tagMap.put(fileNo, tags);
149 | 			tagPreceded = false;
150 | 		} else if (qName.endsWith("EX")) {   /* NE tag ends */
151 | 			if (disableNextTag) {
152 | 				disableNextTag = false;
153 | 				return;
154 | 			}
155 | 
156 | 			String text = popText();
157 | 			String trimText = text.trim();
158 | 			int numWords = numWords(trimText);
159 | 			for (int i = 0; i < numWords; i++) tags.add(currentTag);
160 | 			tagPreceded = true;
161 | 		} else if (qName.equals("DOCNO")) {  /* document number ends, parse document number */
162 | 			docNo = popText();
163 | 			fileNo = docNo.trim().substring(5);
164 | 			tagPreceded = false;
165 | 		}
166 | 	}
167 | 
168 | 	/**
169 | 	 * Returns the NE annotations for a given file ID
170 | 	 * @param fileId
171 | 	 * @return
172 | 	 */
173 | 	public List<String> getTags(String fileId) {
174 | 		return tagMap.get(fileId);
175 | 	}
176 | 
177 | 
178 | }
179 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/bbn/BbnNeParser.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | package ims.cs.bbn;
 18 | 
 19 | import ims.cs.lingdata.Document;
 20 | import ims.cs.lingdata.DocumentId;
 21 | import ims.cs.lingdata.Token;
 22 | import ims.cs.util.StaticConfig;
 23 | import org.xml.sax.InputSource;
 24 | import org.xml.sax.SAXException;
 25 | import org.xml.sax.XMLReader;
 26 | 
 27 | import javax.xml.parsers.ParserConfigurationException;
 28 | import javax.xml.parsers.SAXParser;
 29 | import javax.xml.parsers.SAXParserFactory;
 30 | import java.io.File;
 31 | import java.io.IOException;
 32 | import java.util.List;
 33 | 
 34 | /**
 35 |  * XML parser for BBN named entity dataset
 36 |  */
 37 | public class BbnNeParser {
 38 | 
 39 | 
 40 | 	private static BbnNeParser instance;
 41 | 	private static SAXParser saxParser;
 42 | 	private static XMLReader xmlReader;
 43 | 	private static BbnNeHandler handler;
 44 | 
 45 | 	public String currentBbnFile;
 46 | 
 47 | 
 48 | 	private BbnNeParser() throws ParserConfigurationException, SAXException {
 49 | 		SAXParserFactory spf = SAXParserFactory.newInstance();
 50 | 		saxParser = spf.newSAXParser();
 51 | 		xmlReader = saxParser.getXMLReader();
 52 | 		handler = new BbnNeHandler();
 53 | 		xmlReader.setContentHandler(handler);
 54 | 	}
 55 | 
 56 | 	/**
 57 | 	 * BBN splits each section in up to 4 files. This function determines which one contains the document in question.
 58 | 	 * @param document
 59 | 	 * @return
 60 | 	 */
 61 | 	public String getBbnFileName(Document document) {
 62 | 		DocumentId id = document.docId;
 63 | 		String sectionStr = id.getSectionStr();
 64 | 		String fileStr = id.getFileStr();
 65 | 		int num = Integer.parseInt(fileStr);
 66 | 		char partitionChar;
 67 | 
 68 | 		// BBN partition rule
 69 | 		if (num < 25) {
 70 | 			partitionChar = 'a';
 71 | 		} else if (num < 50) {
 72 | 			partitionChar = 'b';
 73 | 		} else if (num < 75) {
 74 | 			partitionChar = 'c';
 75 | 		} else {
 76 | 			partitionChar = 'd';
 77 | 		}
 78 | 
 79 | 		String fileName = "wsj" + sectionStr + partitionChar + ".qa";
 80 | 
 81 | 		return fileName;
 82 | 	}
 83 | 
 84 | 
 85 | 	/**
 86 | 	 * Takes a previously loaded WSJ document and adds BBN named entities.
 87 | 	 * This function does some rudimentary caching, which requires the WSJ documents to be parsed in order to stay fast.
 88 | 	 * @param document
 89 | 	 * @return
 90 | 	 * @throws IOException
 91 | 	 * @throws SAXException
 92 | 	 */
 93 | 	public Document augmentDocumentXml(Document document) throws IOException, SAXException {
 94 | 		String fileName = getBbnFileName(document);
 95 | 
 96 | 		// move to the next BBN file if necessary
 97 | 		// this will be efficient if the documents are passed in WSJ order as it avoids reloading the same file
 98 | 		if (!fileName.equals(currentBbnFile)) {
 99 | 			File xmlFile = new File(StaticConfig.bbnPath + fileName);
100 | 			xmlReader.parse(new InputSource(xmlFile.getPath()));
101 | 			currentBbnFile = fileName;
102 | 		}
103 | 
104 | 		List<String> tags = handler.getTags(document.docId.getFileStr());
105 | 		List<Token> tokenList = document.tokenList;
106 | 
107 | 		// sanity check: same number of tokens?
108 | 		if (tags.size() != tokenList.size()) {
109 | 			throw new Error("Tag and token counts differ");
110 | 		}
111 | 
112 | 		// align tags and tokens
113 | 		for (int i = 0; i < tokenList.size(); i++) {
114 | 			Token token = tokenList.get(i);
115 | 			String neTag = tags.get(i);
116 | 			token.goldNer = neTag;
117 | 		}
118 | 
119 | 		return document;
120 | 	}
121 | 
122 | 
123 | 	public static BbnNeParser getInstance() throws ParserConfigurationException, SAXException {
124 | 		if (instance == null) {
125 | 			instance = new BbnNeParser();
126 | 		}
127 | 		return instance;
128 | 	}
129 | 
130 | }
131 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/corenlp/DocumentAligner.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.corenlp;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.Iterator;
 22 | import java.util.List;
 23 | 
 24 | import ims.cs.lingdata.Document;
 25 | import ims.cs.lingdata.Sentence;
 26 | import ims.cs.lingdata.Token;
 27 | import edu.stanford.nlp.ling.CoreAnnotations;
 28 | import edu.stanford.nlp.ling.CoreLabel;
 29 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
 30 | import edu.stanford.nlp.util.CoreMap;
 31 | import ims.cs.util.StaticConfig;
 32 | 
 33 | /**
 34 |  * Aligns CoreNLP parser output with the original document. This is necessary since CoreNLP may produce a
 35 |  * tokenization that deviates from the input.
 36 |  */
 37 | public class DocumentAligner {
 38 | 
 39 | 	private Document pDocument;
 40 | 	private List<Sentence> pcSentenceList;
 41 | 	private boolean useCoreNlpQuoteCompletion = true; 
 42 | 	
 43 | 	
 44 | 	public DocumentAligner(Document pDocument, CoreMap cDocument) {
 45 | 		this.pDocument = pDocument;
 46 | 		alignSentences(pDocument, cDocument);
 47 | 	}
 48 | 
 49 | 	/**
 50 | 	 * Aligns original document and CoreNLP processed document.
 51 | 	 * @param pDocument
 52 | 	 * @param cDocument
 53 | 	 */
 54 | 	private void alignSentences(Document pDocument, CoreMap cDocument) {
 55 | 
 56 | 		// get sentences
 57 | 	    List<CoreMap> cSentenceList = cDocument.get(SentencesAnnotation.class);
 58 | 
 59 | 		// state variables
 60 | 		pcSentenceList = new ArrayList<>();
 61 | 		Iterator<CoreMap> cSentenceIter = cSentenceList.iterator();
 62 | 	    Iterator<Token> pTokenIter = pDocument.tokenList.iterator();
 63 | 	    Token nextPToken = pTokenIter.next(); 
 64 | 
 65 | 		// now iterate over CoreNLP sentences
 66 | 	    while (cSentenceIter.hasNext()) {
 67 | 			// get sentence tokens
 68 | 	    	CoreMap cSentence = cSentenceIter.next();
 69 | 			List<CoreLabel> cTokens = cSentence.get(CoreAnnotations.TokensAnnotation.class);
 70 | 	    	List<Token> currentSentencePTokens = new ArrayList<>(cTokens.size());
 71 | 
 72 | 
 73 | 			// identify last token
 74 | 	    	CoreLabel finalToken = cTokens.get(cTokens.size()-1);
 75 | 			int endPosition = finalToken.endPosition();
 76 | 			
 77 | 			// align tokens by byte count until the end of the sentence
 78 | 			while (nextPToken.goldByteCount.getBegin() <= endPosition) {
 79 | 				currentSentencePTokens.add(nextPToken);
 80 | 				if (nextPToken.goldByteCount.getEnd() <= endPosition) {
 81 | 					if (pTokenIter.hasNext()) {
 82 | 						nextPToken = pTokenIter.next();
 83 | 					} else {
 84 | 						break;
 85 | 					}
 86 | 				} else {
 87 | 					break;
 88 | 				}
 89 | 			}
 90 | 			
 91 | 
 92 | 			// check if any tokens need to be aligned at all
 93 | 			if (currentSentencePTokens.size() > 0) {
 94 | 				TokenAligner ta = new TokenAligner(currentSentencePTokens, cSentence);
 95 | 				ta.setUseCoreNlpQuoteCompletion(useCoreNlpQuoteCompletion);
 96 | 				Sentence combinedSentence = ta.getCombinedSentence();
 97 | 
 98 | 				if (combinedSentence == null) {
 99 | 					if (StaticConfig.verbose)
100 | 						System.out.println("Discarding empty combined sentence: " +
101 | 								cSentence.toString() + currentSentencePTokens.toString());
102 | 				} else {
103 | 					pcSentenceList.add(combinedSentence);
104 | 				}
105 | 			} else {   /* sentence may be empty if CoreNLP produced spurious tokens */
106 | 				if (StaticConfig.verbose)
107 | 					System.out.println("Discarding empty PARC sentence: " +
108 | 						cSentence.toString() + currentSentencePTokens.toString());
109 | 			}
110 | 
111 | 	    }	    
112 | 
113 | 	}
114 | 
115 | 	/**
116 | 	 * Returns the aligned document
117 | 	 * @return
118 | 	 */
119 | 	public Document getDocument() {
120 | 		Document combinedDocument = new Document(pDocument);
121 | 		
122 | 		combinedDocument.sentenceList = pcSentenceList;
123 | 		
124 | 		List<Token> documentTokenList = new ArrayList<Token>(pcSentenceList.size() * 5);
125 | 		
126 | 		for (Sentence sentence: pcSentenceList) {
127 | 			sentence.document = combinedDocument;
128 | 			documentTokenList.addAll(sentence.tokenList);
129 | 		}
130 | 		
131 | 		combinedDocument.tokenList = documentTokenList;
132 | 
133 | 		// set token positions in the new document
134 | 		for (int i = 0; i < combinedDocument.tokenList.size(); i++) {
135 | 			combinedDocument.tokenList.get(i).predPosition = i;
136 | 		}
137 | 
138 | 		return combinedDocument;
139 | 
140 | 	}
141 | 
142 | }
143 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/corenlp/IndexedWordIterator.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.corenlp;
19 | 
20 | import java.util.Iterator;
21 | 
22 | import edu.stanford.nlp.ling.IndexedWord;
23 | 
24 | /**
25 |  * Iterates over all indexed words safely -- this is useful as punctuation may not have an associated indexed word
26 |  */
27 | public class IndexedWordIterator implements Iterator<IndexedWord> {
28 | 
29 | 	Iterator<IndexedWord> iter;
30 | 	IndexedWord currentWord;
31 | 	int index = 1;
32 | 	
33 | 	private void fetch() {
34 | 		if (iter.hasNext()) {
35 | 			currentWord = iter.next();
36 | 		} else {
37 | 			currentWord = null;			
38 | 		}
39 | 	}
40 | 	
41 | 	public IndexedWordIterator(Iterator<IndexedWord> iter) {
42 | 		this.iter = iter;
43 | 		fetch();
44 | 	}
45 | 	
46 | 	public boolean hasNext() {
47 | 		return true;
48 | 	}
49 | 
50 | 	public IndexedWord next() {
51 | 		IndexedWord returnVal;
52 | 		
53 | 		if (currentWord == null) {
54 | 			returnVal = null;
55 | 		} else if (currentWord.index() == index) {
56 | 			returnVal = currentWord;
57 | 			fetch();
58 | 		} else {
59 | 			returnVal = null;
60 | 		}
61 | 		
62 | 		index++;
63 | 		return returnVal;
64 | 	}
65 | 
66 | 	public void remove() {
67 | 		throw new UnsupportedOperationException("no remove allowed");
68 | 	}
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/ByteCount.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | /**
21 |  * Byte offset information
22 |  */
23 | public class ByteCount {
24 | 	public int begin;
25 | 	public int end;
26 | 	
27 | 	public ByteCount (int begin, int end) {
28 | 		this.begin = begin;
29 | 		this.end = end;
30 | 	}
31 | 
32 | 	public ByteCount(String value) {
33 | 		String[] tokens = value.split(",");
34 | 		begin = Integer.parseInt(tokens[0]);
35 | 		end = Integer.parseInt(tokens[1]);
36 | 	}
37 | 
38 | 	public int getBegin() {
39 | 		return begin;
40 | 	}
41 | 
42 | 
43 | 	public int getEnd() {
44 | 		return end;
45 | 	}
46 | 	
47 | 	@Override
48 | 	public String toString() {
49 | 		return "" + begin + "," + end;
50 | 	}
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Corpus.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | import java.util.List;
21 | import java.util.Map;
22 | 
23 | import ims.cs.lingdata.Types.PartitionName;
24 | 
25 | /**
26 |  * Abstract Corpus class.
27 |  * A corpus has training, dev, and test partitions as well as a document list
28 |  */
29 | public abstract class Corpus {
30 | 	
31 | 	List<Document> docList;
32 | 	private Map<PartitionName, Partition> partitionMap;
33 | 	
34 | 	public abstract Partition getTrain();
35 | 	public abstract Partition getDev();
36 | 	public abstract Partition getTest();
37 | 	
38 | 	public List<Document> getDocumentList() {
39 | 		return docList;
40 | 	}
41 | 	
42 | 	public void setDocumentList(List<Document> docList) {
43 | 		this.docList = docList;
44 | 	}
45 | 	public Map<PartitionName, Partition> getPartitionMap() {
46 | 		return partitionMap;
47 | 	}
48 | 	public void setPartitionMap(Map<PartitionName, Partition> partitionMap) {
49 | 		this.partitionMap = partitionMap;
50 | 	}
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Document.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.lingdata;
 19 | 
 20 | import java.util.HashSet;
 21 | import java.util.List;
 22 | import java.util.Set;
 23 | 
 24 | import ims.cs.lingdata.Types.Genre;
 25 | import ims.cs.qsample.spans.Span;
 26 | 
 27 | /**
 28 |  * Representation of a document.
 29 |  * Has a list of sentences and a list of tokens; holds span predictions.
 30 |  */
 31 | public class Document  {
 32 | 
 33 | 	public List<Sentence> sentenceList;
 34 | 	public List<Token> tokenList;
 35 | 	public DocumentId docId;
 36 | 	public Genre genre;
 37 | 	public String text;
 38 | 	public String sourceCorpusName;
 39 | 
 40 | 	// span predictions
 41 | 	public Set<Span> predictedSpanSet;
 42 | 	public Set<Span> goldSpanSet;
 43 | 
 44 | 
 45 | 	// CoreNLP flag to avoid multiple processing
 46 | 	public boolean isCoreNlpProcessed;
 47 | 
 48 | 	public Document(Document pDocument) {
 49 | 		this.docId = pDocument.docId;
 50 | 		this.genre = pDocument.genre;
 51 | 		this.text = pDocument.text;
 52 | 		this.sourceCorpusName = pDocument.sourceCorpusName;
 53 | 
 54 | 		this.predictedSpanSet = new HashSet<Span>();
 55 | 		this.goldSpanSet = new HashSet<Span>();
 56 | 	}
 57 | 
 58 | 
 59 | 	public Document() { }
 60 | 
 61 | 
 62 | 	public List<Token> getTokenList() {
 63 | 		return tokenList;
 64 | 	}
 65 | 	
 66 |     public Set<Span> goldSpansOfLabel(String label) {
 67 | 		Set<Span> selectedGoldSpans = new HashSet<>();
 68 | 		for (Span gs : goldSpanSet) {
 69 | 			if (gs.label.equals(label)) {
 70 | 				selectedGoldSpans.add(gs);
 71 | 			}
 72 | 		}
 73 | 		return selectedGoldSpans;
 74 | 	}
 75 | 
 76 | 	public Set<Span> predictedSpansOfLabel(String label) {
 77 | 		Set<Span> predGoldSpans = new HashSet<>();
 78 | 		for (Span ps : predictedSpanSet) {
 79 | 			if (ps.label.equals(label)) {
 80 | 				predGoldSpans.add(ps);
 81 | 			}
 82 | 		}
 83 | 		return predGoldSpans;
 84 | 	}
 85 | 
 86 | 	public Token getPrevToken(Token t) {
 87 | 		return getPrevToken(t, 1);
 88 | 	}
 89 | 
 90 | 	public Token getNextToken(Token t) {
 91 | 		return getNextToken(t, 1);
 92 | 	}
 93 | 
 94 | 	public Token getPrevToken(Token t, int dist) {
 95 | 		if (t.predPosition - dist >= 0) return tokenList.get(t.predPosition-dist);
 96 | 		else return null;
 97 | 	}
 98 | 
 99 | 	public Token getNextToken(Token t, int dist) {
100 | 		if (t.predPosition < tokenList.size()-dist) return tokenList.get(t.predPosition+dist);
101 | 		else return null;
102 | 	}
103 | 
104 | }
105 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/DocumentId.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
/**
 * Interface to represent a document ID.
 * In our world, all documents have WSJ behavior, so they are part of a section and have a file number.
 */
public interface DocumentId {
	/** Returns the section part of the ID as a string. */
	String getSectionStr();
	/** Returns the file part of the ID as a string. */
	String getFileStr();
}
28 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/GornAddressList.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
/**
 * Placeholder class for Gorn addresses, which turned out not to be needed.
 * It has no members and remains only for compatibility.
 */
public class GornAddressList {
}
25 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Partition.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | import java.util.List;
21 | import java.util.Map;
22 | 
23 | /**
24 |  * A partition is a list of documents which may belong to different sections
25 |  */
26 | public class Partition  {
27 | 
28 | 	public List<Document> docList;
29 | 	public Map<String, List<Document>> sectionMap;
30 | 	
31 | 
32 | 	public Partition() {}
33 | 	
34 | 	public Partition(List<Document> docList) {
35 | 		this.docList = docList;
36 | 	}
37 | 
38 | 
39 | 	public List<Document> getDocumentList() {
40 | 		return docList;
41 | 	}
42 | 	
43 | 	public int size() {
44 | 		return docList.size();
45 | 	}
46 | 
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/PlainTextCorpus.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | import java.util.List;
21 | 
22 | /**
23 |  * A corpus to hold documents read from plain text files.
24 |  * Has only one partition and consists only of test data.
25 |  */
26 | public class PlainTextCorpus extends Corpus {
27 | 
28 |     Partition partition;
29 | 
30 |     public PlainTextCorpus(List<Document> documentList) {
31 |         setDocumentList(documentList);
32 |         partition = new Partition();
33 |         partition.docList = documentList;
34 |     }
35 | 
36 |     @Override
37 |     public Partition getTrain() {
38 |         return null;
39 |     }
40 | 
41 |     @Override
42 |     public Partition getDev() {
43 |         return null;
44 |     }
45 | 
46 |     @Override
47 |     public Partition getTest() {
48 |         return partition;
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/PlainTextDocId.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | 
21 | /**
22 |  * A document ID for plain text documents.
23 |  * Since we require a WSJ-like directory structure, we can generate IDs from that.
24 |  */
25 | public class PlainTextDocId implements DocumentId {
26 | 
27 |     String sectionStr;
28 |     String fileStr;
29 | 
30 |     public PlainTextDocId (String section, String file) {
31 |         sectionStr = section;
32 |         fileStr = file;
33 |     }
34 | 
35 |     @Override
36 |     public String getSectionStr() {
37 |         return sectionStr;
38 |     }
39 | 
40 |     @Override
41 |     public String getFileStr() {
42 |         return fileStr;
43 |     }
44 | 
45 |     @Override
46 |     public String toString() {
47 |         return sectionStr + "," + fileStr;
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Sentence.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | import java.util.HashMap;
21 | import java.util.List;
22 | import java.util.Map;
23 | 
24 | import edu.stanford.nlp.ling.IndexedWord;
25 | import edu.stanford.nlp.semgraph.SemanticGraph;
26 | import edu.stanford.nlp.trees.Tree;
27 | import org.jgrapht.alg.FloydWarshallShortestPaths;
28 | 
29 | /**
30 |  * Representation of a sentence.
31 |  * Is part of a document; contains a list of tokens; may have a constituency and a dependency tree.
32 |  */
33 | public class Sentence  {
34 | 
35 | 	public List<Token> tokenList;
36 | 	public GornAddressList gorn;
37 | 	public SentenceId sentenceId;
38 | 	public int positionInDocument;
39 | 	public Document document;
40 | 	
41 | 	// CoreLabel backwards lookup
42 | 	public Map<IndexedWord, Token> indexedWordLookup;
43 | 	public HashMap<Tree, Token> treeLookup;
44 | 
45 | 	// CoreNLP output
46 | 	public Tree tree;
47 | 	public SemanticGraph dependencyGraph;
48 | 	public FloydWarshallShortestPaths fw;
49 | 
50 | 
51 | 	public Sentence () {}
52 | 	public Sentence (Document d) {
53 | 		document = d;
54 | 	}
55 | 
56 | 	public List<Token> getTokenList() {
57 | 		return tokenList;
58 | 	}
59 | 
60 | 	public Token first() { return tokenList.get(0); }
61 | 	public Token last() { return tokenList.get(tokenList.size()-1); }
62 | 
63 | 	@Override
64 | 	public String toString() {
65 | 		return tokenList.toString();
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/SentenceId.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | /**
21 |  * Holds an ID for a sentence.
22 |  * Can be calculated from the document's ID together with the Gorn address of the sentence.
23 |  */
24 | public class SentenceId {
25 | 
26 | 	private WSJId wsjId;
27 | 	private GornAddressList gorn;
28 | 	
29 | 	public SentenceId(WSJId wsdId, GornAddressList gorn) {
30 | 		this.gorn = gorn;
31 | 		this.wsjId = wsdId;
32 | 	}
33 | 
34 | 	public WSJId getWsjId () {
35 | 		return wsjId;
36 | 	}
37 | 
38 | 	public GornAddressList getGorn() {
39 | 		return gorn;
40 | 	}
41 | 
42 | 	@Override
43 | 	public String toString() {
44 | 		return "" + wsjId + ":" + gorn;
45 | 	}
46 | 
47 | 	@Override
48 | 	public boolean equals(Object obj) {
49 | 		if (obj instanceof SentenceId) {
50 | 			SentenceId objId = (SentenceId) obj;
51 | 			return this.wsjId.equals(objId.wsjId) && this.gorn.equals(objId.gorn);
52 | 			//FIXME: maybe the gorn thing doesn't work
53 | 		} else {
54 | 			return false;
55 | 		}
56 | 	}
57 | 	
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/Types.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
/**
 * Holder class for enumerations used by the corpus data model.
 */
public abstract class Types {

	/** Names of the corpus partitions. */
	public enum PartitionName {
		TRAIN,
		DEV,
		TEST
	}

	/** Text genres. */
	public enum Genre {
		FICTION,
		NEWS,
		BIOGRAPHY
	}
}
24 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/lingdata/WSJId.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.lingdata;
19 | 
20 | import java.io.Serializable;
21 | 
22 | /**
23 |  * Document ID as used in the Wall Street Journal corpus.
24 |  * Each document is part of a section and is stored in a file, each of which have an integral ID.
25 |  */
26 | public class WSJId implements Serializable, DocumentId {
27 | 
28 | 	private static final long serialVersionUID = 4443044961863001270L;
29 | 	
30 | 	private Integer section;
31 | 	private Integer file;
32 | 
33 | 	
34 | 	public WSJId (Integer section) {
35 | 		this(section, null);
36 | 	}
37 | 
38 | 	public WSJId (String section, String file) {
39 | 		this(Integer.parseInt(section), Integer.parseInt(file));
40 | 	}
41 | 
42 | 	public WSJId (String section) {
43 | 		this(Integer.parseInt(section));
44 | 	}
45 | 	
46 | 	public WSJId (Integer section, Integer file) {
47 | 		this.section = section;
48 | 		this.file = file;
49 | 	}
50 | 	
51 | 	public int getSectionInt() {
52 | 		return section;
53 | 	}
54 | 
55 | 	public int getFileInt() {
56 | 		return file;
57 | 	}
58 | 	
59 | 	private static String addOffset(int i) {
60 | 		if (i < 10) {
61 | 			return "0" + i;
62 | 		} else {
63 | 			return "" + i;
64 | 		}
65 | 	}
66 | 	
67 | 	public String getSectionStr() {
68 | 		return addOffset(section);
69 | 	}
70 | 	
71 | 	public String getFileStr() {
72 | 		return addOffset(file);
73 | 	}
74 | 	
75 | 	@Override
76 | 	public boolean equals(Object other) {
77 | 		if (other instanceof WSJId) {
78 | 			WSJId otherId = (WSJId) other; 
79 | 			return (this.section == otherId.section) && (this.file == otherId.file);
80 | 		} else {
81 | 			return false;
82 | 		}
83 | 	}
84 | 
85 | 	public boolean sectionEquals(WSJId other) {
86 | 		return this.section == other.section;
87 | 	}
88 | 	
89 | 	@Override
90 | 	public String toString() {
91 | 		return getSectionStr() + getFileStr();
92 | 	}
93 | }
94 | 
95 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/mallet/DocumentFeatureSet2TokenSequence.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.mallet;
19 | 
20 | import java.util.List;
21 | 
22 | import cc.mallet.pipe.Pipe;
23 | import cc.mallet.types.Instance;
24 | import cc.mallet.types.Token;
25 | import cc.mallet.types.TokenSequence;
26 | import ims.cs.qsample.features.FeatureSet;
27 | 
28 | /**
29 |  * Pipe to convert our internal feature set to mallet token feature entries
30 |  * @author scheibcn
31 |  */
32 | public class DocumentFeatureSet2TokenSequence extends Pipe {
33 | 
34 | 	private static final long serialVersionUID = 3218174517742238232L;
35 | 
36 | 	@Override
37 | 	public Instance pipe(Instance inst) {
38 | 
39 | 		// ensure that the instance is of the right type
40 | 		if (!(inst instanceof PARCDocumentInstance)) {
41 | 			throw new UnsupportedOperationException("Expected CoreMap, got " + inst.getClass());
42 | 		}
43 | 
44 | 
45 | 		List<ims.cs.lingdata.Token> tokenList = ((PARCDocumentInstance) inst).document.getTokenList();
46 | 		TokenSequence ts = new TokenSequence();
47 | 
48 | 		// iterate over tokens and convert their internal feature sets into Mallet feature sets
49 | 		for (ims.cs.lingdata.Token cToken : tokenList) {
50 | 			FeatureSet fs = cToken.boundaryFeatureSet;
51 | 			Token mToken = new Token(cToken.predText);
52 | 
53 | 			// copy each feature
54 | 			for (Object entry : fs) {
55 | 				mToken.setFeatureValue(entry.toString(), 1);
56 | 			}
57 | 			
58 | 			ts.add(mToken);
59 | 		}
60 | 		
61 | 		inst.setData(ts);
62 | 		
63 | 		return inst;
64 | 	}
65 | 	
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/mallet/PARCDocumentInstance.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.mallet;
19 | 
20 | 
21 | import cc.mallet.types.Instance;
22 | import ims.cs.lingdata.Document;
23 | 
24 | /**
25 |  * Mallet "Instance" wrapper class for documents
26 |  */
27 | public class PARCDocumentInstance extends Instance {
28 | 
29 | 	private static final long serialVersionUID = -6933321582801583924L;
30 | 
31 | 	public transient Document document;
32 | 	
33 | 	private PARCDocumentInstance() {
34 | 		super(null, null, null, null);
35 | 	};
36 | 	
37 | 	public PARCDocumentInstance(Document document) {
38 | 		super(document, null, document.docId, document);
39 | 		this.document = document;
40 | 	}
41 | 
42 | 	
43 | 	public Document getDocument() {
44 | 		return document;
45 | 	}
46 | 	
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/PARCAttribution.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.parc;
19 | 
/**
 * A single attribution, described by its role and its ID.
 */
public class PARCAttribution {

	/** Attribution roles as annotated in the PARC corpus. */
	public enum Role {
		SOURCE,
		CONTENT,
		CUE,
		SUPPLEMENT
	}

	/** Attribution types by Pareti et al. */
	public enum Type {
		DIRECT,
		INDIRECT,
		MIXED
	}

	/** Role of this attribution. */
	public Role role;

	/** Identifier of this attribution. */
	public String id;
}
34 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/ParcUtils.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.parc;
 19 | 
 20 | import edu.stanford.nlp.ling.IndexedWord;
 21 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
 22 | import edu.stanford.nlp.trees.GrammaticalRelation;
 23 | import ims.cs.corenlp.Helper;
 24 | import ims.cs.lingdata.ByteCount;
 25 | import ims.cs.lingdata.Document;
 26 | import ims.cs.lingdata.Sentence;
 27 | import ims.cs.lingdata.Token;
 28 | import edu.stanford.nlp.trees.Tree;
 29 | import org.jgrapht.alg.FloydWarshallShortestPaths;
 30 | import org.jgrapht.graph.SimpleDirectedGraph;
 31 | 
 32 | import java.util.Iterator;
 33 | import java.util.List;
 34 | 
 35 | /**
 36 |  * Collection of Utility functions
 37 |  */
 38 | public abstract class ParcUtils {
 39 | 
 40 | 	/**
 41 | 	 * Find all head verbs in the corpus. The algorithm is taken from Pareti (2015).
 42 | 	 * @param sentence
 43 | 	 */
 44 | 	public static void markHeadVerbs (Sentence sentence) {
 45 | 
 46 | 		for (Tree tree : sentence.tree.preOrderNodeList()) {
 47 | 			if (tree.label().value().equals("VP")) {
 48 | 				boolean valid = true;
 49 | 				for (Tree child : tree.children()) {
 50 | 					if (child.label().value().equals("VP")) {
 51 | 						valid = false;
 52 | 						break;
 53 | 					}
 54 | 				}
 55 | 
 56 | 				if (valid) {
 57 | 					for (Tree child : tree.children()) {
 58 | 						if (child.firstChild().isLeaf() && child.label().value().startsWith("V")) {
 59 | 							Token token = sentence.treeLookup.get(child.firstChild());
 60 | 							if (token != null)
 61 | 								token.isHeadVerb = true;
 62 | 						}
 63 | 					}
 64 | 				}
 65 | 			}
 66 | 		}
 67 | 	}
 68 | 
 69 | 	/**
 70 | 	 * Annotates paragraph-continuing quotation marks. doParagraphAnnotation() needs to be called before this.
 71 | 	 * @param document
 72 | 	 */
 73 | 	public static void markParagraphQuotes(Document document) {
 74 | 		int quoteIndex = 1;
 75 | 
 76 | 		for (Token token: document.tokenList) {
 77 | 			if (Helper.isQuote(token)) {
 78 | 				// ignore even quotes at paragraph begins
 79 | 				if (token.paragraphBegins && quoteIndex % 2 == 0)
 80 | 					token.ignoreQuote = true;
 81 | 				else
 82 | 					quoteIndex++;
 83 | 			}
 84 | 		}
 85 | 	}
 86 | 
 87 | 	/**
 88 | 	 * Annotates for each token whether it starts a paragraph by its raw text
 89 | 	 * @param document
 90 | 	 */
 91 | 	public static void doParagraphAnnotation (Document document) {
 92 | 		String documentText = document.text;
 93 | 		Iterator<Token> tokenIter = document.tokenList.iterator();
 94 | 
 95 | 		if (!tokenIter.hasNext()) {
 96 | 			System.err.println("Skipping paragraph annotation empty document: " + document.docId);
 97 | 			return;
 98 | 		}
 99 | 
100 | 		Token token = tokenIter.next();
101 | 		ByteCount bc = token.goldByteCount;
102 | 
103 | 		// iterate over all character positions in the text
104 | 		char prevC = 0;
105 | 
106 | 	    for (int i = 0; i < documentText.length(); i++) {
107 | 	    	if (i > bc.getEnd()) {
108 | 	    		if (!tokenIter.hasNext()) break;   /* reached the last token */
109 | 
110 | 	    		token = tokenIter.next();
111 | 	    		bc = token.goldByteCount;
112 | 	    	}
113 | 
114 | 	    	char c = documentText.charAt(i);
115 | 
116 | 			// two consecutive newlines indicate a paragraph
117 | 	    	if (prevC == '\n' &&  c == '\n') {
118 | 	    		token.paragraphBegins = true;
119 | 	    	}
120 | 
121 | 	    	prevC = c;
122 | 	    }
123 | 	}
124 | 
125 | 	/**
126 | 	 * Anonymizes certain named entities in the text
127 | 	 * @param document
128 | 	 */
129 | 	public static void anonymizeNamedEntities (Document document) {
130 | 		for (Token token: document.getTokenList()) {
131 | 			if (token.predNer.startsWith("ORGANIZATION") || token.predNer.startsWith("PERSON")) {
132 | 				String substText = "[NE]";
133 | 				token.originalPredText = token.predText;
134 | 
135 | 				token.predLemma = substText;
136 | 				token.predText = substText;
137 | 				token.goldLemma = substText;
138 | 				token.goldText = substText;
139 | 			}
140 | 		}
141 | 	}
142 | 
143 | 	/**
144 | 	 * CoreNLP tries to predict opening and closing quotation marks.
145 | 	 * This method maps the variation back to one symbol.
146 | 	 * @param document
147 | 	 */
148 | 	public static void sanitizeQuotationMarks (Document document) {
149 | 		for (Token token : document.getTokenList()) {
150 | 			// double quotes
151 | 			if (token.predLemma.equals("``") || token.predLemma.equals("\"") || token.predLemma.equals("''")) {
152 | 				token.predLemma = "\"";
153 | 				token.predPosTag = "\"";
154 | 				token.predText = "\"";
155 | 				token.goldPosTag = "\"";
156 | 				token.goldLemma = "\"";
157 | 				token.goldText = "\"";
158 | 			}
159 | 
160 | 			// single quotes
161 | 			if (token.predLemma.equals("`") || token.predLemma.equals("''")) {
162 | 				token.predLemma = "'";
163 | 				token.predPosTag = "'";
164 | 				token.predText = "'";
165 | 				token.goldLemma = "'";
166 | 				token.goldPosTag = "'";
167 | 				token.goldText = "'";
168 | 			}
169 | 
170 | 		}
171 | 	}
172 | 
173 | 	/**
174 | 	 * The FW implementation needs distinct objects as edges, which this class accomplishes.
175 | 	 * CoreNLP seems to optimize storage by caching strings, so different edges have identical label strings.
176 | 	 */
177 | 	public static class IndexedEdge {
178 | 		public GrammaticalRelation rel;
179 | 		public int index;
180 | 
181 | 		public IndexedEdge(GrammaticalRelation rel, int index) {
182 | 			this.rel = rel;
183 | 			this.index = index;
184 | 		}
185 | 	}
186 | 
187 | 	/**
188 | 	 * Compute cached dependency paths using Floyd Warshall
189 | 	 * @param dependencies
190 | 	 * @return
191 | 	 */
192 | 	public static FloydWarshallShortestPaths computeFloydWarshallSGE(List<SemanticGraphEdge> dependencies) {
193 | 		SimpleDirectedGraph<IndexedWord, IndexedEdge> graph = new SimpleDirectedGraph<IndexedWord, IndexedEdge>(IndexedEdge.class);
194 | 		int edgeId = 0;
195 | 		for (SemanticGraphEdge dep : dependencies) {
196 | 			graph.addVertex(dep.getGovernor());
197 | 			graph.addVertex(dep.getDependent());
198 | 			graph.addEdge(dep.getGovernor(), dep.getDependent(), new IndexedEdge(dep.getRelation(), edgeId));
199 | 		}
200 | 		return new FloydWarshallShortestPaths(graph);
201 | 	}
202 | 
203 | }
204 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/parc/xml/PARCParser.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.parc.xml;
19 | 
20 | import java.io.File;
21 | import java.io.IOException;
22 | 
23 | import javax.xml.parsers.ParserConfigurationException;
24 | import javax.xml.parsers.SAXParser;
25 | import javax.xml.parsers.SAXParserFactory;
26 | 
27 | import org.xml.sax.InputSource;
28 | import org.xml.sax.SAXException;
29 | import org.xml.sax.XMLReader;
30 | 
31 | import ims.cs.lingdata.Document;
32 | 
33 | /**
34 |  * XML parser for the PARC corpus
35 |  */
36 | public class PARCParser {
37 | 
38 | 	private static PARCParser instance;
39 | 	private static SAXParser saxParser;
40 | 	private static XMLReader xmlReader;
41 | 	private static PARCHandler handler;
42 | 	
43 | 	private PARCParser () throws ParserConfigurationException, SAXException {
44 | 		SAXParserFactory spf = SAXParserFactory.newInstance();
45 | 	    saxParser = spf.newSAXParser();
46 | 	    xmlReader = saxParser.getXMLReader();
47 | 	    handler = new PARCHandler();
48 | 	    xmlReader.setContentHandler(handler);
49 | 	}
50 | 	
51 | 	public Document parseFile(File xmlFile) throws IOException, SAXException {
52 | 		xmlReader.parse(new InputSource(xmlFile.getPath()));
53 | 	    return handler.getDocument();
54 | 
55 | 	}
56 | 	
57 | 	public static PARCParser getInstance() throws ParserConfigurationException, SAXException {
58 | 		if (instance == null) {
59 | 			instance = new PARCParser();
60 | 		}
61 | 		return instance;
62 | 	}
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/evaluate/EvaluateClassifier.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.evaluate;
19 | 
20 | import ims.cs.lingdata.Document;
21 | 
22 | import java.util.List;
23 | 
24 | /**
25 |  * Evaluation functions for single-token classifiers
26 |  * Created by scheibcn on 3/2/16.
27 |  */
28 | public class EvaluateClassifier {
29 | 
30 |     /**
31 |      * Container class for quotation classifier results, i.e., begin, end, and cue F1
32 |      */
33 |     public static class ClassifierResults {
34 |         F1.Stats beginStats;
35 |         F1.Stats endStats;
36 |         F1.Stats cueStats;
37 | 
38 |         public String toString() {
39 |             return String.format("Pb=%1.3f Rb=%1.3f Fb=%1.3f     Pe=%1.3f Re=%1.3f Fe=%1.3f     Pc=%1.3f Rc=%1.3f Fc=%1.3f",
40 |                     beginStats.precision, beginStats.recall, beginStats.f1,
41 |                     endStats.precision, endStats.recall, endStats.f1,
42 |                     cueStats.precision, cueStats.recall, cueStats.f1);
43 | 
44 |         }
45 |     }
46 | 
47 |     /**
48 |      * Evaluate begin, end, and cue classifier output over all tokens in the specified documents
49 |      * @param trainDocs
50 |      * @return
51 |      */
52 |     public static ClassifierResults evaluateClassifier (List<Document> trainDocs) {
53 |         if (trainDocs == null) return null;
54 |         ClassifierResults results = new ClassifierResults();
55 | 
56 |         results.beginStats = F1.evalPerceptron(trainDocs, "begin");
57 |         results.endStats = F1.evalPerceptron(trainDocs, "end");
58 |         results.cueStats = F1.evalPerceptron(trainDocs, "cue");
59 | 
60 |         return results;
61 |     }
62 | 
63 | 
64 |     /**
65 |      * Print begin, end, and cue classifier evaluations over all tokens in the specified training, test, val, and
66 |      * resubstitution documents
67 |      * @param trainDocs
68 |      * @param testDocs
69 |      * @param valDocs
70 |      * @param resDocs
71 |      * @param prefix
72 |      */
73 |     public static void evaluateAndPrint(List<Document> trainDocs, List<Document> testDocs, List<Document> valDocs, List<Document> resDocs, String prefix) {
74 |         ClassifierResults trainResults = evaluateClassifier(trainDocs);
75 |         ClassifierResults testResults = evaluateClassifier(testDocs);
76 |         ClassifierResults valResults = evaluateClassifier(valDocs);
77 |         ClassifierResults resResults = evaluateClassifier(resDocs);
78 | 
79 |         if (trainResults != null) System.out.println(prefix + " TRAIN   " + trainResults.toString());
80 |         if (testResults != null) System.out.println(prefix + " TEST    " + testResults.toString());
81 |         if (valResults != null) System.out.println(prefix + " VAL     " + valResults.toString());
82 |         if (resResults != null) System.out.println(prefix + " RES     " + resResults.toString());
83 |     }
84 | 
85 | }
86 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/evaluate/EvaluateSpan.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.evaluate;
 19 | 
 20 | import ims.cs.lingdata.Document;
 21 | import ims.cs.parc.PARCAttribution;
 22 | 
 23 | import java.util.List;
 24 | 
 25 | /**
 26 |  * Evaluation functions for span prediction models.
 27 |  * Created by scheibcn on 3/2/16.
 28 |  */
 29 | public class EvaluateSpan {
 30 | 
 31 |     /**
 32 |      * Container class for all necessary F1 statistics to do Pareti-style quotation evaluation
 33 |      */
 34 |     public static class SpanResults {
 35 |         public F1.Stats strictCue;
 36 |         public F1.Stats strictContent;
 37 |         public F1.Stats partialContent;
 38 |         public F1.Stats strictContentDirect;
 39 |         public F1.Stats partialContentDirect;
 40 |         public F1.Stats strictContentIndirect;
 41 |         public F1.Stats partialContentIndirect;
 42 |         public F1.Stats strictContentMixed;
 43 |         public F1.Stats partialContentMixed;
 44 | 
 45 |         public String toString(String sep) {
 46 |             return strictContent.toString() + sep
 47 |                     + strictContentDirect + sep
 48 |                     + strictContentIndirect + sep
 49 |                     + strictContentMixed + sep
 50 |                     + strictCue + sep + sep
 51 |                     + partialContent + sep
 52 |                     + partialContentDirect + sep
 53 |                     + partialContentIndirect + sep
 54 |                     + partialContentMixed;
 55 |         }
 56 |     }
 57 | 
 58 |     /**
 59 |      * SpanResults for training, test, validation, and resubstitution data
 60 |      */
 61 |     public static class ResultSet {
 62 |         public SpanResults trainResults;
 63 |         public SpanResults testResults;
 64 |         public SpanResults valResults;
 65 |         public SpanResults resResults;
 66 |     }
 67 | 
 68 | 
 69 |     /**
 70 |      * Evaluate cue and content span models
 71 |      * @param documentList
 72 |      * @return
 73 |      */
 74 |     public static SpanResults cueContentEvaluation (List<Document> documentList) {
 75 |         SpanResults evaluation = new SpanResults();
 76 |         evaluation.strictCue = F1.evalSpans(documentList, "cue", false, null);
 77 |         evaluation.strictContent = F1.evalSpans(documentList, "content", false, null);
 78 |         evaluation.partialContent = F1.evalSpans(documentList, "content", true, null);
 79 |         evaluation.strictContentDirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.DIRECT);
 80 |         evaluation.partialContentDirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.DIRECT);
 81 |         evaluation.strictContentIndirect = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.INDIRECT);
 82 |         evaluation.partialContentIndirect = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.INDIRECT);
 83 |         evaluation.strictContentMixed = F1.evalSpans(documentList, "content", false, PARCAttribution.Type.MIXED);
 84 |         evaluation.partialContentMixed = F1.evalSpans(documentList, "content", true, PARCAttribution.Type.MIXED);
 85 | 
 86 |         return evaluation;
 87 |     }
 88 | 
 89 |     /**
 90 |      * Returns a string where the input s is repeated n times
 91 |      * @param s
 92 |      * @param n
 93 |      * @return
 94 |      */
 95 |     private static String generateN(String s, int n) {
 96 |         StringBuilder sb = new StringBuilder();
 97 |         for (int i = 0; i < n; i++) {
 98 |             sb.append(s);
 99 |         }
100 | 
101 |         return sb.toString();
102 |     }
103 | 
104 |     private static void printHeader(String sep, int offset) {
105 |         System.out.println(generateN("-", offset) + "--------------------------------------------------------------------------------------------------------------------------------------------------------");
106 |         System.out.println(generateN(" ", offset) + "        exact                                                                              "+sep+""+sep+" partial");
107 |         System.out.println(generateN(" ", offset) + "        ALL            "+sep+" DIRECT         "+sep+" INDIRECT       "+sep+" MIXED          "+sep+" cue            "+sep+""+sep+" ALL            "+sep+" DIRECT         "+sep+" INDIRECT       "+sep+" MIXED         ");
108 |         System.out.println(generateN(" ", offset) + "        P    R    F    "+sep+" P    R    F    "+sep+" P    R    F    "+sep+" P    R    F    "+sep+" P    R    F    "+sep+""+sep+" P    R    F    "+sep+" P    R    F    "+sep+" P    R    F    "+sep+" P    R    F   ");
109 | 
110 |     }
111 | 
112 |     private static void printFooter(int offset) {
113 |         System.out.println(generateN("-", offset) + "--------------------------------------------------------------------------------------------------------------------------------------------------------");
114 |     }
115 | 
116 | 
117 |     private static void printResults(String prefix, String sep, SpanResults trainingEval, SpanResults testEval, SpanResults valEval, SpanResults resEval) {
118 |         printHeader(sep, prefix.length() + 1);
119 | 
120 |         if (trainingEval != null) System.out.println(prefix + " TRAIN  " + trainingEval.toString(sep));
121 |         if (testEval != null) System.out.println(prefix + " TEST   " + testEval.toString(sep));
122 |         if (valEval != null) System.out.println(prefix + " VAL    " + valEval.toString(sep));
123 |         if (resEval != null) System.out.println(prefix + " RES    " + resEval.toString(sep));
124 | 
125 |         printFooter(prefix.length() + 1);
126 |     }
127 | 
128 |     public static ResultSet evaluateAndPrint(String prefix, String sep, List<Document> trainingDocuments, List<Document> testDocuments, List<Document> valDocuments, List<Document> resDocuments) {
129 |         ResultSet resultSet = new ResultSet();
130 | 
131 |         if (trainingDocuments != null) resultSet.trainResults = cueContentEvaluation(trainingDocuments);
132 |         if (testDocuments != null) resultSet.testResults = cueContentEvaluation(testDocuments);
133 |         if (valDocuments != null) resultSet.valResults = cueContentEvaluation(valDocuments);
134 |         if (resDocuments != null) resultSet.resResults = cueContentEvaluation(resDocuments);
135 | 
136 |         printResults(prefix, sep, resultSet.trainResults, resultSet.testResults,
137 |                 resultSet.valResults, resultSet.resResults);
138 | 
139 |         return resultSet;
140 |     }
141 | 
142 | 
143 | }
144 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/Binning.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.List;
 22 | 
 23 | /**
 24 |  * Binning for distances
 25 |  * Created by scheibcn on 3/4/16.
 26 |  */
 27 | public class Binning {
 28 |     /**
 29 |      * Bins that stack up from 0 to 100
 30 |      * @param distance
 31 |      * @param prefix
 32 |      * @return
 33 |      */
 34 |     public static List<String> distanceBinsStackUp (int distance, String prefix) {
 35 |         List<String> features = new ArrayList<>();
 36 |         
 37 |         if (distance > 0) features.add(prefix + ">=1");
 38 |         if (distance > 1) features.add(prefix + ">=2");
 39 |         if (distance > 2) features.add(prefix + ">=3");
 40 |         if (distance > 3) features.add(prefix + ">=4");
 41 |         if (distance > 4) features.add(prefix + ">=5");
 42 |         if (distance > 5) features.add(prefix + ">=6");
 43 |         if (distance > 6) features.add(prefix + ">=7");
 44 |         if (distance > 7) features.add(prefix + ">=8");
 45 |         if (distance > 10) features.add(prefix + ">=11");
 46 |         if (distance > 15) features.add(prefix + ">=16");
 47 |         if (distance > 20) features.add(prefix + ">=21");
 48 |         if (distance > 25) features.add(prefix + ">=26");
 49 |         if (distance > 30) features.add(prefix + ">=31");
 50 |         if (distance > 40) features.add(prefix + ">=41");
 51 |         if (distance > 50) features.add(prefix + ">=51");
 52 |         if (distance > 60) features.add(prefix + ">=61");
 53 |         if (distance > 70) features.add(prefix + ">=71");
 54 |         if (distance > 80) features.add(prefix + ">=81");
 55 |         if (distance > 90) features.add(prefix + ">=91");
 56 |         if (distance > 100) features.add(prefix + ">=101");
 57 | 
 58 |         return features;
 59 |     }
 60 | 
 61 |     /**
 62 |      * Bins that stack down from 0 to 100
 63 |      * @param distance
 64 |      * @param prefix
 65 |      * @return
 66 |      */
 67 |     public static List<String> distanceBinsStackDown (int distance, String prefix) {
 68 |         List<String> features = new ArrayList<>();
 69 | 
 70 |         if (distance < 2) features.add(prefix + "<=1");
 71 |         if (distance < 3) features.add(prefix + "<=2");
 72 |         if (distance < 4) features.add(prefix + "<=3");
 73 |         if (distance < 5) features.add(prefix + "<=4");
 74 |         if (distance < 6) features.add(prefix + "<=5");
 75 |         if (distance < 7) features.add(prefix + "<=6");
 76 |         if (distance < 8) features.add(prefix + "<=7");
 77 |         if (distance < 9) features.add(prefix + "<=8");
 78 |         if (distance < 12) features.add(prefix + "<=11");
 79 |         if (distance < 17) features.add(prefix + "<=16");
 80 |         if (distance < 22) features.add(prefix + "<=21");
 81 |         if (distance < 27) features.add(prefix + "<=26");
 82 |         if (distance < 32) features.add(prefix + "<=31");
 83 |         if (distance < 42) features.add(prefix + "<=41");
 84 |         if (distance < 52) features.add(prefix + "<=51");
 85 |         if (distance < 62) features.add(prefix + "<=61");
 86 |         if (distance < 72) features.add(prefix + "<=71");
 87 |         if (distance < 82) features.add(prefix + "<=81");
 88 |         if (distance < 92) features.add(prefix + "<=91");
 89 |         if (distance < 102) features.add(prefix + "<=101");
 90 | 
 91 |         return features;
 92 |     }
 93 | 
 94 |     /**
 95 |      * Interval bins from 0 to 100
 96 |      * @param distance
 97 |      * @param prefix
 98 |      * @return
 99 |      */
100 |     public static List<String> distanceBins1to100(int distance, String prefix) {
101 |         List<String> features = new ArrayList<>();
102 | 
103 |         if (distance > 0 && distance < 5) features.add(prefix + "_in_[0,5)");
104 |         if (distance >= 5 && distance < 10) features.add(prefix + "_in_[5,10)");
105 |         if (distance >= 10 && distance < 20) features.add(prefix + "_in_[10,20)");
106 |         if (distance >= 20 && distance < 40) features.add(prefix + "_in_[20,40)");
107 |         if (distance >= 40 && distance < 60) features.add(prefix + "_in_[40,60)");
108 |         if (distance >= 60 && distance < 80) features.add(prefix + "_in_[60,80)");
109 |         if (distance >= 80 && distance <= 100) features.add(prefix + "_in_[60,100]");
110 | 
111 |         return features;
112 |     }
113 | 
114 | 
115 |     /**
116 |      * Bins from 0 to 100, intervals and stacking up & down
117 |      * @param distance
118 |      * @param prefix
119 |      * @return
120 |      */
121 |     public static List<String> distanceBinsAll (int distance, String prefix) {
122 |         List<String> features = new ArrayList<>();
123 |         features.addAll(distanceBins1to100(distance, prefix));
124 |         features.addAll(distanceBinsStackDown(distance,prefix));
125 |         features.addAll(distanceBinsStackUp(distance,prefix));
126 |         return features;
127 |     }
128 | }
129 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureExtraction.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features;
 19 | 
 20 | import java.io.IOException;
 21 | 
 22 | 
 23 | import ims.cs.qsample.features.components.SentenceConstituentFeatures;
 24 | import ims.cs.qsample.features.components.SentenceDependencyFeatures;
 25 | import ims.cs.qsample.features.components.SentenceFeaturesDerivedFromListCue;
 26 | import ims.cs.qsample.features.components.SentenceIndicatorFeatures;
 27 | import ims.cs.qsample.features.components.TokenDictFeatures;
 28 | import ims.cs.qsample.features.components.TokenLexicalFeatures;
 29 | import ims.cs.qsample.features.components.TokenListFeatures;
 30 | import ims.cs.lingdata.Document;
 31 | import ims.cs.lingdata.Sentence;
 32 | import ims.cs.lingdata.Token;
 33 | import ims.cs.qsample.features.components.DocumentOffsetConjunction;
 34 | import ims.cs.qsample.features.components.DocumentQuotationFeature;
 35 | import ims.cs.util.StaticConfig;
 36 | 
 37 | /**
 38 |  * Feature extractor class for (mostly) those features that require non-static code.
 39 |  */
 40 | public class FeatureExtraction {
 41 | 
 42 | 	private TokenListFeatures tokenPersonFeatures;
 43 | 	private TokenListFeatures tokenOrganizationFeatures;
 44 | 	private TokenListFeatures tokenTitleFeatures;
 45 | 	private TokenListFeatures tokenListFeatures;
 46 | 	private TokenListFeatures tokenNounListFeatures;
 47 | 	private TokenDictFeatures verbNetFeatures;
 48 | 	private DocumentOffsetConjunction documentOffsetConjunction;
 49 | 
 50 | 
 51 | 	public FeatureExtraction () throws IOException, ClassNotFoundException {
 52 | 		// non-static extractors
 53 | 		tokenPersonFeatures = new TokenListFeatures("resources/PARC/listfeatures/person.hyponyms.txt", "EK:PER");
 54 | 		tokenOrganizationFeatures = new TokenListFeatures("resources/PARC/listfeatures/organization.hyponyms.txt", "EK:ORG");
 55 | 		tokenTitleFeatures = new TokenListFeatures("resources/PARC/listfeatures/titles.txt", "EK:TITLE");
 56 | 		tokenListFeatures = new TokenListFeatures("resources/PARC/listfeatures/krestel_verbs.txt", "CUELIST");
 57 | 		tokenNounListFeatures = new TokenListFeatures("resources/PARC/listfeatures/attribution_nouns.txt", "NOUNCUELIST");
 58 | 		verbNetFeatures = new TokenDictFeatures("resources/PARC/listfeatures/verbnet.txt", "VERBNET");
 59 | 
 60 | 		// restrict extractors to certain pos tags
 61 | 		tokenNounListFeatures.posStart = "N";
 62 | 		tokenListFeatures.posStart = "V";
 63 | 		verbNetFeatures.posStart = "V";
 64 | 
 65 | 		// Offset conjunction on non-static features
 66 | 		documentOffsetConjunction = new DocumentOffsetConjunction();
 67 | 	}
 68 | 
 69 | 	
 70 | 	/**
 71 | 	 * Runs token-level feature extraction on the tokens in the document
 72 | 	 * @param document
 73 | 	 */
 74 | 	public void extractTokenFeatures(Document document) {
 75 | 		for (Token token : document.tokenList) {
 76 | 			tokenPersonFeatures.extract(token);
 77 | 			tokenOrganizationFeatures.extract(token);
 78 | 			tokenTitleFeatures.extract(token);
 79 | 			TokenLexicalFeatures.extract(token);
 80 | 
 81 | 			tokenListFeatures.extract(token);
 82 | 			tokenNounListFeatures.extract(token);
 83 | 			verbNetFeatures.extract(token);
 84 | 		}
 85 | 	}
 86 | 	
 87 | 
 88 | 	/**
 89 | 	 * Runs sentence-level feature extraction on the sentences in the document
 90 | 	 * @param document
 91 | 	 */
 92 | 	public void extractSentenceFeatures (Document document) {
 93 | 		for (Sentence sentence : document.sentenceList) {
 94 | 			SentenceIndicatorFeatures.extract(sentence);
 95 | 			if (StaticConfig.dependencyFeatures) SentenceDependencyFeatures.extract(sentence);
 96 | 			if (StaticConfig.constituentFeatures) SentenceConstituentFeatures.extract(sentence);
 97 | 			SentenceFeaturesDerivedFromListCue.extract(sentence);
 98 | 		}
 99 | 	}
100 | 
101 | 
102 | 	public void setUpFeatureSets(Document doc) {
103 | 		for (Token token : doc.tokenList)
104 | 			if (token.boundaryFeatureSet == null)
105 | 				token.boundaryFeatureSet = new FeatureIntSet();
106 | 	}
107 | 
108 | 
109 | 	/**
110 | 	 * Runs feature extraction on a single document
111 | 	 * @param document
112 | 	 */
113 | 	public void extractAllFeatures (Document document) {
114 | 		// initialize empty feature sets
115 | 		setUpFeatureSets(document);
116 | 		
117 | 		// Token features & sentence features
118 | 		extractTokenFeatures(document);
119 | 		extractSentenceFeatures(document);
120 | 
121 | 		// quotation mark features
122 | 		if (StaticConfig.documentQuotationFeature)
123 | 			DocumentQuotationFeature.extract(document);
124 | 
125 | 		// offset conjunction
126 | 		if (StaticConfig.documentOffsetConjunction)
127 | 			documentOffsetConjunction.extract(document);
128 | 
129 | 		// additional features
130 | 		BoundaryFeatures.additionalBoundaryFeatures(document);
131 | 	}
132 | 
133 | 
134 | }
135 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureIndexMap.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features;
19 | 
20 | import java.util.ArrayList;
21 | import java.util.HashMap;
22 | import java.util.List;
23 | import java.util.Map;
24 | 
/**
 * Automatically counting string to int mapping for feature sets.
 * Indices are handed out consecutively starting at 0, in the order in which
 * previously unseen strings are passed to {@link #getIndex(String)}.
 * Created by scheibcn on 6/1/16.
 */
public class FeatureIndexMap {
    // feature string -> index
    Map<String, Integer> f2i;
    // index -> feature string; position i holds the string that was assigned index i
    List<String> i2f;

    // largest index handed out so far; -1 while the map is empty
    int maxIndex = -1;

    FeatureIndexMap () {
        f2i = new HashMap<>();
        i2f = new ArrayList<>();
    }

    /**
     * Translate string to index. If the string is unknown, it is assigned a new index.
     * @param feature feature string to look up
     * @return the index already stored for the string, or the newly assigned one
     */
    public int getIndex(String feature) {
        // stored values are never null, so a null result means "not registered yet"
        Integer known = f2i.get(feature);
        if (known != null) {
            return known;
        }

        maxIndex++;
        f2i.put(feature, maxIndex);
        i2f.add(feature);
        return maxIndex;
    }

    /**
     * Translate index to string.
     * @param index index previously returned by {@link #getIndex(String)}
     * @return the feature string that was assigned this index
     * @throws Error with message "Lookup error" if the index was never handed out,
     *         including negative indices
     */
    public String getFeature(int index) {
        // negative indices are rejected here as well, so that every invalid index
        // is reported the same way instead of failing inside the list access
        if (index < 0 || index > maxIndex) {
            throw new Error("Lookup error");
        }
        return i2f.get(index);
    }
}
69 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureIntSet.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features;
 19 | 
 20 | import java.util.Collection;
 21 | import java.util.HashSet;
 22 | import java.util.Iterator;
 23 | import java.util.Set;
 24 | 
 25 | /**
 26 |  * A feature set storing features as integers.
 27 |  * Created by scheibcn on 6/1/16.
 28 |  */
 29 | public class FeatureIntSet implements FeatureSet {
 30 | 
 31 |     // internal mapping from feature strings to integers
 32 |     static FeatureIndexMap featureIndexMap = new FeatureIndexMap(); // a static map across all feature sets
 33 |     Set<Integer> featureIndices =  new HashSet<>();
 34 | 
 35 |     @Override
 36 |     public int size() {
 37 |         return featureIndices.size();
 38 |     }
 39 | 
 40 |     @Override
 41 |     public boolean isEmpty() {
 42 |         return featureIndices.isEmpty();
 43 |     }
 44 | 
 45 | 
 46 | 
 47 |     @Override
 48 |     public boolean add(String s) {
 49 |         int index = featureIndexMap.getIndex(s);
 50 |         featureIndices.add(index);
 51 |         return true;
 52 |     }
 53 | 
 54 |     @Override
 55 |     public Iterator<String> iterator() { return new StringIterator(); }
 56 | 
 57 |     @Override
 58 |     public boolean addAll(Collection<? extends String> c) {
 59 |         if (c instanceof FeatureIntSet) {
 60 |             // just call addAll on the index sets
 61 |             featureIndices.addAll(((FeatureIntSet) c).featureIndices);
 62 |         } else if (c instanceof Collection) {
 63 |             for (String s : c) this.add(s);
 64 |         } else {
 65 |             throw new Error("Incompatible types");
 66 |         }
 67 | 
 68 |         return true;
 69 |     }
 70 | 
 71 | 
 72 |     @Override
 73 |     public boolean contains(Object o) {
 74 |         int targetIndex = featureIndexMap.getIndex((String) o);
 75 |         return featureIndices.contains(targetIndex);
 76 |     }
 77 | 
 78 |     @Override
 79 |     public void clear() { featureIndices.clear(); }
 80 | 
 81 | 
 82 |     /**
 83 |      * Iterator that automatically maps the stored indices to strings
 84 |      */
 85 |     class StringIterator implements Iterator<String> {
 86 | 
 87 |         Iterator<Integer> featureIndexIter;
 88 | 
 89 |         StringIterator () { featureIndexIter = featureIndices.iterator(); }
 90 | 
 91 |         @Override
 92 |         public boolean hasNext() {
 93 |             return featureIndexIter.hasNext();
 94 |         }
 95 | 
 96 |         @Override
 97 |         public String next() {
 98 |             int index = featureIndexIter.next();
 99 |             return featureIndexMap.getFeature(index);
100 |         }
101 | 
102 |         @Override
103 |         public void remove() {
104 |             featureIndexIter.remove();
105 |         }
106 |     }
107 | 
108 | 
109 | 
110 |     // NOTE: for compatibility, FeatureSets are collections
111 |     // BELOW: interfaces inherited from collection that we do not need to implement
112 | 
113 |     @Override
114 |     public Object[] toArray() { throw new Error("Not implemented"); }
115 | 
116 |     @Override
117 |     public <T> T[] toArray(T[] a) { throw new Error("Not implemented"); }
118 | 
119 |     @Override
120 |     public boolean remove(Object o) { throw new Error("Not implemented"); }
121 | 
122 |     @Override
123 |     public boolean containsAll(Collection<?> c) { throw new Error("Not implemented"); }
124 | 
125 |     @Override
126 |     public boolean removeAll(Collection<?> c) { throw new Error("Not implemented"); }
127 | 
128 |     @Override
129 |     public boolean retainAll(Collection<?> c) { throw new Error("Not implemented"); }
130 | 
131 | }
132 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureSet.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features;
19 | 
20 | import java.util.Collection;
21 | 
/**
 * Common type of all feature sets. At present a feature set is nothing more than a
 * {@link Collection} of feature strings; the interface declares no members of its own.
 * Created by scheibcn on 6/1/16.
 */
public interface FeatureSet extends Collection<String> { }
28 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/FeatureStringSet.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features;
19 | 
20 | import java.util.HashSet;
21 | import java.util.Iterator;
22 | 
23 | /**
24 |  * A feature set that stores features as strings internally. Essentially just a HashSet.
25 |  */
26 | public class FeatureStringSet extends HashSet<String> implements FeatureSet {
27 | 	public FeatureStringSet(FeatureStringSet f) {
28 | 		super(f);
29 | 	}
30 | 	
31 | 	public FeatureStringSet() {
32 | 		super();
33 | 	}
34 | 
35 | 	public FeatureStringSet(int size) {
36 | 		super(size);
37 | 	}
38 | 	
39 | 	@Override
40 | 	public boolean add(String e) { return super.add(e);	}
41 | 
42 | 	@Override
43 | 	public Iterator<String> iterator() {
44 | 		return super.iterator();
45 | 	}
46 | 
47 | 
48 | }
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/DocumentOffsetConjunction.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
 20 | import java.util.Arrays;
 21 | import java.util.LinkedList;
 22 | import java.util.List;
 23 | 
 24 | import ims.cs.lingdata.Document;
 25 | import ims.cs.lingdata.Token;
 26 | import ims.cs.qsample.features.FeatureSet;
 27 | import org.apache.commons.lang3.StringUtils;
 28 | 
 29 | /**
 30 |  * Offset conjunction over a selection of features.
 31 |  * Idea here: enumerate all possible patterns of feature conjunctions. Then test for each feature set whether it
 32 |  * contains each of the conjunctions. If so, add the conjunction.
 33 |  */
 34 | public class DocumentOffsetConjunction {
 35 | 
 36 | 	// features subject to conjunction
 37 | 	private static final String[] features = new String[] {"SENT:QUOT", "SENT:NE", "SENT:PRO", "SENT:HASCUE", "CUE-DEP", "IS-LEFTMOST", "SENT-BEGIN-WIN", "SENT-END-WIN"};
 38 | 
 39 | 	private List<String[]> patternList;
 40 | 	
 41 | 
 42 | 	public DocumentOffsetConjunction() {
 43 | 		patternList = new LinkedList<>();
 44 | 		
 45 | 		// add empty entry to start
 46 | 		patternList.add(new String[] {});
 47 | 		
 48 | 		
 49 | 		for (String s : features) {
 50 | 			List<String[]> newPatterns = new LinkedList<String[]>();
 51 | 			for (String[] pattern : patternList) {
 52 | 				String[] concat = append(pattern, s);
 53 | 				newPatterns.add(concat);
 54 | 			}
 55 | 			
 56 | 			patternList.addAll(newPatterns);
 57 | 		}
 58 | 		
 59 | 		// remove the empty entry
 60 | 		patternList.remove(0);
 61 | 	}
 62 | 
 63 | 	/**
 64 | 	 * Add feature conjunctions to all tokens in the document
 65 | 	 * @param document
 66 | 	 */
 67 | 	public void extract (Document document) {
 68 | 		List<Token> tokenList = document.getTokenList();
 69 | 		
 70 | 		for (Token token : tokenList) {
 71 | 			FeatureSet fs = token.boundaryFeatureSet;
 72 | 
 73 | 			// check for each pattern whether the feature set satisfies it
 74 | 			for (String[] features : patternList) {
 75 | 				boolean matches = true;
 76 | 				for (String feature: features) {
 77 | 					if (!fs.contains(feature)) {
 78 | 						matches = false;
 79 | 						break;
 80 | 					}
 81 | 				}
 82 | 
 83 | 				// if the pattern is satisfied, add the conjunction
 84 | 				if (matches) {
 85 | 					fs.add("CONJUNCTION:" + StringUtils.join(",", features));
 86 | 				}
 87 | 			}
 88 | 		}
 89 | 	}
 90 | 	
 91 | 	
 92 | 	public static String[] append (String[] a1, String s) {
 93 | 		String[] ret = new String[a1.length + 1];
 94 | 		System.arraycopy(a1, 0, ret, 0, a1.length);
 95 | 		ret[ret.length-1] = s;
 96 | 		return ret;
 97 | 	}
 98 | 	
 99 | 	public void printPatterns() {
100 | 		for(String[] p : patternList) {
101 | 			System.out.println(Arrays.toString(p));
102 | 		}
103 | 	}
104 | 
105 | 
106 | }
107 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/DocumentQuotationFeature.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features.components;
19 | 
20 | import java.util.List;
21 | 
22 | import ims.cs.lingdata.Document;
23 | import ims.cs.lingdata.Token;
24 | import ims.cs.corenlp.Helper;
25 | 
26 | /**
27 |  * Check for each token whether it is enclosed in quotation marks
28 |  */
29 | public abstract class DocumentQuotationFeature {
30 | 	
31 | 	public static final String INQ_PREFIX = "DOC:INQ";
32 | 	public static final String NOTINQ_PREFIX = "DOC:NOTINQ";
33 | 	public static final String OPEN_PREFIX = "DOC:Q-OPENS";
34 | 	public static final String CLOSE_PREFIX = "DOC:Q-CLOSES";
35 | 
36 | 	public static void extract(Document document) {
37 | 		boolean inQuote = false;
38 | 
39 | 		List<Token> tokenList = document.getTokenList();
40 | 			
41 | 		for (Token token : tokenList) {
42 | 			// check if token is a quotation mark and is not to be ignored
43 | 			// (paragraph-initial tokens may be marked to be ignored)
44 | 			if (Helper.isQuote(token) && !token.ignoreQuote) {
45 | 
46 | 				// add respective feature ...
47 | 				if (inQuote)
48 | 					token.boundaryFeatureSet.add(CLOSE_PREFIX);
49 | 				else
50 | 					token.boundaryFeatureSet.add(OPEN_PREFIX);
51 | 
52 | 				// toggle in-quote state
53 | 				inQuote = !inQuote;
54 | 				token.boundaryFeatureSet.add(INQ_PREFIX);
55 | 			} else if (inQuote) {   /* currently in quote */
56 | 				token.boundaryFeatureSet.add(INQ_PREFIX);
57 | 			} else {   /* currently not in quote */
58 | 				token.boundaryFeatureSet.add(NOTINQ_PREFIX);
59 | 			}
60 | 		}
61 | 	}
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceConstituentFeatures.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
 20 | import java.util.LinkedList;
 21 | import java.util.List;
 22 | 
 23 | import ims.cs.lingdata.Sentence;
 24 | import ims.cs.lingdata.Token;
 25 | import ims.cs.qsample.features.FeatureSet;
 26 | import edu.stanford.nlp.trees.Tree;
 27 | import ims.cs.util.StaticConfig;
 28 | 
 29 | /**
 30 |  * Token features derived from the constituency parse of a sentence
 31 |  */
 32 | public abstract class SentenceConstituentFeatures {
 33 | 
 34 | 	// feature names
 35 | 	private static final String LEVEL_FEATURE = "LVL";
 36 | 	private static final String LEFTMOST_FEATURE = "IS-LEFTMOST";
 37 | 	private static final String GOV_FEATURE = "GOV:";
 38 | 	private static final String AL_FEATURE = "AL:";
 39 | 	private static final String PARENT_FEATURE = "PARENT:";
 40 | 
 41 | 	public static void extract(Sentence s) {
 42 | 		addTreeFeatures(s, s.tree);
 43 | 	}
 44 | 
 45 | 	/**
 46 | 	 * Class for keeping track of node-level pairs
 47 | 	 */
 48 | 	private static class NodeFeatures {
 49 | 		String label;
 50 | 		Integer level;
 51 | 		
 52 | 		public NodeFeatures(String label, int depth) {
 53 | 			this.label = label;
 54 | 			this.level = depth;
 55 | 		}
 56 | 	}
 57 | 
 58 | 	/**
 59 | 	 * Add tree features recursively
 60 | 	 * @param s
 61 | 	 * @param t
 62 | 	 */
 63 | 	private static void addTreeFeatures(Sentence s, Tree t) {
 64 | 		addTreeFeatures(s, t, 0, new LinkedList<NodeFeatures>(), null, true, null);
 65 | 	}
 66 | 
 67 | 	/**
 68 | 	 * Recursion step for tree features
 69 | 	 * @param sentence
 70 | 	 * @param t complete tree
 71 | 	 * @param level current level
 72 | 	 * @param governingLabels list of governing labels
 73 | 	 * @param parent information about direct parent
 74 | 	 * @param isLeftmost is the node the leftmost one in the constituent specified by ancestorWhereLeftmost
 75 | 	 * @param ancestorWhereLeftmost
 76 | 	 */
 77 | 	private static void addTreeFeatures(Sentence sentence, Tree t, int level, List<NodeFeatures> governingLabels, NodeFeatures parent, boolean isLeftmost, NodeFeatures ancestorWhereLeftmost) {
 78 | 
 79 | 
 80 | 		if (t.isLeaf()) {   /* terminal nodes */
 81 | 			// get the current token represented by this subtree
 82 | 			Token pToken = sentence.treeLookup.get(t);
 83 | 
 84 | 			// check if token is null. this can happen if the token was unaligned previously (e.g., because of
 85 | 			// a parser error)
 86 | 			if (pToken == null) {
 87 | 				if (StaticConfig.verbose)
 88 | 					System.err.println(sentence.sentenceId + " Dropping tree without associated token: " + t + " ");
 89 | 				return;
 90 | 			}
 91 | 
 92 | 			FeatureSet fs = pToken.boundaryFeatureSet;
 93 | 
 94 | 			// leftmost feature (see Pareti paper for description)
 95 | 			if (StaticConfig.constituentLeftmost && isLeftmost)
 96 | 				fs.add(LEFTMOST_FEATURE);
 97 | 
 98 | 			// level in tree
 99 | 			if (StaticConfig.constituentLevel) {
100 | 				fs.add(LEVEL_FEATURE + level);
101 | 				addLevelBinHeuristic(pToken, LEVEL_FEATURE, level);
102 | 			}
103 | 
104 | 			// leftmost feature label
105 | 			if (StaticConfig.constituentAncestorL) {
106 | 				fs.add(AL_FEATURE + "LBL:" + ancestorWhereLeftmost.label);
107 | 				fs.add(AL_FEATURE + "LVL:" + ancestorWhereLeftmost.level);
108 | 				
109 | 				addLevelBinHeuristic(pToken, AL_FEATURE + "LVL", ancestorWhereLeftmost.level);
110 | 			}
111 | 
112 | 			// parent in constituent tree
113 | 			if (StaticConfig.constituentParent) {
114 | 				fs.add(PARENT_FEATURE + "LBL:" + parent.label);
115 | 			}
116 | 
117 | 			// labels of all ancestors
118 | 			if (StaticConfig.constituentGoverning) {   /* "Ancestor" features in the paper */
119 | 				for (NodeFeatures nf: governingLabels) {
120 | 					// label with and without depth
121 | 					fs.add(GOV_FEATURE + nf.label + "@" + nf.level);   /* ambiguous in paper */
122 | 					fs.add(GOV_FEATURE + nf.label);
123 | 					fs.add(GOV_FEATURE + nf.label + "@-" + (level - nf.level));   /* ambiguous in paper */
124 | 
125 | 					addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@", nf.level);
126 | 					addLevelBinHeuristic(pToken, GOV_FEATURE + nf.label + "@-", (level - nf.level));
127 | 				}
128 | 			}
129 | 		} else {  // non-terminal node
130 | 			List<Tree> childList = t.getChildrenAsList();
131 | 			String label = t.label().toString();
132 | 
133 | 			// copy governing node features for next recursion step
134 | 			List<NodeFeatures> governingLabelsUpdate = new LinkedList<NodeFeatures>(governingLabels);
135 | 			governingLabelsUpdate.add(new NodeFeatures(label, level));
136 | 
137 | 			// set leftmost ancestor
138 | 			if (ancestorWhereLeftmost == null) {
139 | 				ancestorWhereLeftmost = new NodeFeatures(label, level);
140 | 			}
141 | 
142 | 			// check for pre-terminals -- otherwise, set the leftmost flag for the first constituent
143 | 			if (childList.size() > 1) {
144 | 				isLeftmost = true;
145 | 			}
146 | 
147 | 			// call function for all children
148 | 			for (Tree child : childList) {
149 | 				addTreeFeatures(sentence, child, level + 1, governingLabelsUpdate, new NodeFeatures(label, level), isLeftmost, ancestorWhereLeftmost);
150 | 				isLeftmost = false;
151 | 				ancestorWhereLeftmost = null;
152 | 			}
153 | 		}
154 | 	}
155 | 
156 | 	/**
157 | 	 * Binning for levels
158 | 	 * @param mToken
159 | 	 * @param feature
160 | 	 * @param value
161 | 	 */
162 | 	private static void addLevelBinHeuristic(Token mToken, String feature, int value) {
163 | 		if (!StaticConfig.constituentBinning) return;
164 | 		
165 | 		FeatureSet fs = mToken.boundaryFeatureSet;
166 | 
167 | 		int[] bins = new int[] {0, 1, 2, 3, 5, 7, 10, 13, 16, 20, 25, 40, 1000  };
168 | 		
169 | 		for (int i=0; i < bins.length - 1; i++) {
170 | 			int threshLower = bins[i];
171 | 			int threshUpper = bins[i + 1];
172 | 
173 | 			// threshold satisfied? add bin feature!
174 | 			if (value <= threshUpper) {
175 | 				if (StaticConfig.constituentBinningStacked) {
176 | 					fs.add(feature + "(<=)" + threshLower);
177 | 					if (value >= threshLower)
178 | 						fs.add(feature + "(>=)" + threshLower);
179 | 				} else if (value > threshLower) {
180 | 					fs.add(feature + "(EXACT)" + threshLower);
181 | 				}
182 | 			}
183 | 		}
184 | 	}
185 | 
186 | 	
187 | }
188 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceDependencyFeatures.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features.components;
19 | 
20 | import java.util.List;
21 | 
22 | import ims.cs.lingdata.Sentence;
23 | import ims.cs.lingdata.Token;
24 | import ims.cs.corenlp.Helper;
25 | import ims.cs.qsample.features.FeatureSet;
26 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
27 | import ims.cs.util.StaticConfig;
28 | 
29 | /**
30 |  * Token features derived from the dependency parse of a sentence
31 |  */
32 | public abstract class SentenceDependencyFeatures {
33 | 
34 | 	// feature names
35 | 	private static final String PARENT_REL_PREFIX = "PARENT-REL";
36 | 	private static final String PARENT_RELHEAD_PREFIX = "PARENT-REL+HD";
37 | 	private static final String CHILD_REL_PREFIX = "CHILD-REL";
38 | 	private static final String CHILD_RELHEAD_PREFIX = "CHILD-REL+HD";
39 | 
40 | 	/**
41 | 	 * Extract dependency features for all tokens in this sentence
42 | 	 * @param sentence
43 | 	 */
44 | 	public static void extract (Sentence sentence) {
45 | 		for (Token pToken : sentence.tokenList) {
46 | 			if (StaticConfig.dependencyParentRel || StaticConfig.dependencyParentRelHead) addParentFeature(pToken);
47 | 			if (StaticConfig.dependencyChildRel || StaticConfig.dependencyChildRelHead) addChildFeatures(pToken);
48 | 		}
49 | 	}
50 | 
51 | 	/**
52 | 	 * Add features about the parent of the token
53 | 	 * @param token
54 | 	 */
55 | 	private static void addParentFeature(Token token) {
56 | 		SemanticGraphEdge parentEdge = Helper.getDependencyParentRel(token);
57 | 		
58 | 		FeatureSet fs = token.boundaryFeatureSet;
59 | 		
60 | 		if (parentEdge != null) {
61 | 			// plain parent
62 | 			if (StaticConfig.dependencyParentRel)
63 | 				fs.add(PARENT_REL_PREFIX + "=" + parentEdge.getRelation());
64 | 
65 | 			// parent and relation label
66 | 			if (StaticConfig.dependencyParentRelHead)
67 | 				fs.add(PARENT_RELHEAD_PREFIX + "=" + parentEdge.getRelation() + "," + parentEdge.getGovernor().lemma());
68 | 		}
69 | 	}
70 | 
71 | 	/**
72 | 	 * Add features about the child of a token
73 | 	 * @param pcToken
74 | 	 */
75 | 	private static void addChildFeatures(Token pcToken) {
76 | 		List<SemanticGraphEdge> childEdgeList = Helper.getDependencyChildrenRels(pcToken);
77 | 		FeatureSet fs = pcToken.boundaryFeatureSet;
78 | 
79 | 		if (childEdgeList != null) {
80 | 			for (SemanticGraphEdge childEdge : childEdgeList) {
81 | 				// plain child
82 | 				if (StaticConfig.dependencyChildRel)
83 | 					fs.add(CHILD_REL_PREFIX + "=" + childEdge.getRelation());
84 | 
85 | 				// child and relation label
86 | 				if (StaticConfig.dependencyChildRelHead)
87 | 					fs.add(CHILD_RELHEAD_PREFIX + "=" + childEdge.getRelation() + "," + childEdge.getDependent().lemma());
88 | 			}
89 | 		}
90 | 	}
91 | 
92 | 
93 | }
94 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceFeaturesDerivedFromListCue.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
import ims.cs.lingdata.Sentence;
import ims.cs.lingdata.Token;
import ims.cs.corenlp.Helper;
import ims.cs.util.StaticConfig;

import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
 27 | 
 28 | /**
 29 |  * Token features based on cue information from the noun cue list.
 30 |  */
 31 | public abstract class SentenceFeaturesDerivedFromListCue {
 32 | 
 33 | 	private static final String CUE_DEP_PREFIX = "CUE-DEP:NOUNCUE";
 34 | 	private static final String CUE_PREFIX = "SENT:HASCUE:NOUNCUE";
 35 | 
 36 | 	/**
 37 | 	 * Extract features for all tokens in the sentence
 38 | 	 * @param sentence
 39 | 	 */
 40 | 	public static void extract (Sentence sentence) {
 41 | 		boolean sentenceHasCueFeature = sentenceHasCue(sentence.tokenList);
 42 | 
 43 | 		// check each token for noun-cue-ness, push features to its dependents (transitively)
 44 | 		for (Token pToken : sentence.tokenList) {
 45 | 			if (StaticConfig.dependencyCueDependent) {
 46 | 				// token is in noun cue list
 47 | 				if (pToken.boundaryFeatureSet.contains("NOUNCUELIST"))
 48 | 					addCueDependentFeature("LIST", pToken, sentence);
 49 | 
 50 | 				// token is "according to"
 51 | 				if (pToken.predText.toLowerCase().equals("according")
 52 | 						&& pToken.nextToken != null
 53 | 						&& pToken.nextToken.predText.equals("to"))
 54 | 					addCueDependentFeature("ACCORDINGTO", pToken, sentence);
 55 | 			}
 56 | 
 57 | 			SentenceIndicatorFeatures.addFeaturePositiveAndNegative(CUE_PREFIX, sentenceHasCueFeature, pToken);
 58 | 		}
 59 | 	}
 60 | 
 61 | 	/**
 62 | 	 * Push features to all dependents of a cue
 63 | 	 * @param type
 64 | 	 * @param token
 65 | 	 * @param sentence
 66 | 	 */
 67 | 	private static void addCueDependentFeature(String type, Token token, Sentence sentence) {
 68 | 		List<Token> stack = new LinkedList<Token>();
 69 | 		stack.add(token);
 70 | 
 71 | 		// recursively iterate over all children (and their children ...)
 72 | 		while (stack.size() > 0) {
 73 | 			Token current = stack.remove(0);
 74 | 			current.boundaryFeatureSet.add(CUE_DEP_PREFIX + "-" + type);
 75 | 			
 76 | 			List<Token> children = Helper.getDependencyChildren(current);
 77 | 			
 78 | 			if (children == null) continue;
 79 | 
 80 | 			for (Token c : children) {
 81 | 				if (c != null)	stack.add(c);
 82 | 			}
 83 | 		}
 84 | 	}
 85 | 
 86 | 	/**
 87 | 	 * Check whether the sentence has any noun cues
 88 | 	 * @param data
 89 | 	 * @return
 90 | 	 */
 91 | 	private static boolean sentenceHasCue(List<Token> data) {
 92 | 		for (Token token: data) {
 93 | 			if (token.boundaryFeatureSet.contains("NOUNCUELIST")) {
 94 | 				return true;
 95 | 			}
 96 | 		}
 97 | 		return false;
 98 | 	}
 99 | 
100 | }
101 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/SentenceIndicatorFeatures.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
 20 | import ims.cs.lingdata.Sentence;
 21 | import ims.cs.lingdata.Token;
 22 | import ims.cs.corenlp.Helper;
 23 | import ims.cs.qsample.features.FeatureSet;
 24 | import ims.cs.util.StaticConfig;
 25 | 
 26 | /**
 27 |  * Add sentence-level indicator features to each token
 28 |  */
 29 | public abstract class SentenceIndicatorFeatures {
 30 | 
 31 | 	// feature names
 32 | 	private static final String QUOT_PREFIX = "SENT:QUOT";
 33 | 	private static final String NE_PREFIX = "SENT:NE";
 34 | 	private static final String PRO_PREFIX = "SENT:PRO";
 35 | 	private static final String SL_PREFIX = "SL=";
 36 | 	private static final String SL_LT_PREFIX = "SL<=";
 37 | 	private static final String SL_GT_PREFIX = "SL>=";
 38 | 	private static final String SL_EXACT_PREFIX = "SL-EXACT-BIN=";
 39 | 	private static final String SENT_BEGIN_WINDOW = "SENT-BEGIN-WIN";
 40 | 	private static final String SENT_END_WINDOW = "SENT-END-WIN";
 41 | 	private static final String INVERT_PREFIX = "NOT:";
 42 | 
 43 | 	/**
 44 | 	 * Extract indicator features for all tokens in this sentence
 45 | 	 * @param sentence
 46 | 	 */
 47 | 	public static void extract (Sentence sentence) {
 48 | 		// pre-compute features
 49 | 		boolean sentenceHasQuotFeature = sentenceHasQuotationMark(sentence);
 50 | 		boolean sentenceHasProFeature = sentenceHasPro(sentence);
 51 | 		boolean sentenceHasNeFeature = sentenceHasNe(sentence);
 52 | 		int sentenceLength = sentence.tokenList.size();
 53 | 
 54 | 		// distance to sentence boundaries
 55 | 		sentenceBoundDistance(sentence);
 56 | 
 57 | 		// now add pre-computed features to token list
 58 | 		for (Token mToken : sentence.tokenList) {
 59 | 			if (StaticConfig.sentenceHasQuote) addFeaturePositiveAndNegative(QUOT_PREFIX, sentenceHasQuotFeature, mToken);
 60 | 			if (StaticConfig.sentenceHasPronoun) addFeaturePositiveAndNegative(PRO_PREFIX, sentenceHasProFeature, mToken);
 61 | 
 62 | 			if (StaticConfig.sentenceHasNe) addFeaturePositiveAndNegative(NE_PREFIX, sentenceHasNeFeature, mToken);
 63 | 			if (StaticConfig.sentenceLength) {
 64 | 				addLengthLogBinHeuristic(mToken, sentenceLength);
 65 | 				mToken.boundaryFeatureSet.add(SL_PREFIX + sentenceLength);
 66 | 			}
 67 | 		}
 68 | 	}
 69 | 
 70 | 	/**
 71 | 	 * Add positive or negative version of a feature (i.e., also explicitly mark the absence of a feature)
 72 | 	 * @param featureName
 73 | 	 * @param featureOn
 74 | 	 * @param token
 75 | 	 */
 76 | 	public static void addFeaturePositiveAndNegative(String featureName, boolean featureOn, Token token) {
 77 | 		if (featureOn)
 78 | 			token.boundaryFeatureSet.add(featureName);
 79 | 		else
 80 | 			token.boundaryFeatureSet.add(INVERT_PREFIX + featureName);
 81 | 	}
 82 | 
 83 | 	/**
 84 | 	 * Binning for lengths, exponential bin spacing
 85 | 	 * @param pToken
 86 | 	 * @param length
 87 | 	 */
 88 | 	private static void addLengthLogBinHeuristic(Token pToken, int length) {
 89 | 		if (!StaticConfig.sentenceLengthBinning) return;
 90 | 		
 91 | 		FeatureSet fs = pToken.boundaryFeatureSet;
 92 | 		
 93 | 		int[] bins = new int[] {0, 2, 4, 8, 16, 32, 64, 1000};
 94 | 		
 95 | 		for (int i=0; i < bins.length - 1; i++) {
 96 | 			int threshLower = bins[i];
 97 | 			int threshUpper = bins[i+1];
 98 | 			
 99 | 			if (length <= threshUpper) {
100 | 				if (StaticConfig.sentenceLengthBinningStacked) {
101 | 					fs.add(SL_LT_PREFIX + "STACKED-" + threshLower);
102 | 				} else if (length > threshLower) {
103 | 					fs.add(SL_EXACT_PREFIX + threshLower);
104 | 				}
105 | 			} 
106 | 			
107 | 			if ((length >= threshLower) && StaticConfig.sentenceLengthBinningStacked) {
108 | 				fs.add(SL_GT_PREFIX + threshLower);
109 | 			}
110 | 		}
111 | 		
112 | 	}
113 | 
114 | 	/**
115 | 	 * Add features about the distance of each token to the sentence boundary
116 | 	 * @param sentence
117 | 	 */
118 | 	private static void sentenceBoundDistance(Sentence sentence) {
119 | 		int pos = 0;
120 | 		int sl = sentence.tokenList.size();
121 | 
122 | 		for (Token token : sentence.tokenList) {
123 | 			// compute distance to end
124 | 			int endDist = sl - pos - 1;
125 | 
126 | 			// if distance to either boundary is within a window of 5, add respective feature
127 | 			if (pos < 5) token.boundaryFeatureSet.add(SENT_BEGIN_WINDOW);
128 | 			if (endDist < 5) token.boundaryFeatureSet.add(SENT_END_WINDOW);
129 | 
130 | 			pos++;
131 | 		}
132 | 	}
133 | 
134 | 	/**
135 | 	 * Determines whether a sentence contains a quotation mark
136 | 	 * @param sentence
137 | 	 * @return
138 | 	 */
139 | 	private static boolean sentenceHasQuotationMark(Sentence sentence) {
140 | 		for (Token token: sentence.tokenList) {
141 | 			if (Helper.isQuote(token)) {
142 | 				return true;
143 | 			}
144 | 		}
145 | 		return false;
146 | 	}
147 | 
148 | 	/**
149 | 	 * Determines whether a sentence contains a pronoun
150 | 	 * @param sentence
151 | 	 * @return
152 | 	 */
153 | 	private static boolean sentenceHasPro(Sentence sentence) {
154 | 		for (Token token: sentence.tokenList) {
155 | 			if (token.predPosTag.startsWith("PR")) {
156 | 				return true;
157 | 			}
158 | 		}
159 | 		return false;
160 | 	}
161 | 
162 | 	/**
163 | 	 * Determines whether a sentence contains a named entity
164 | 	 * @param sentence
165 | 	 * @return
166 | 	 */
167 | 	private static boolean sentenceHasNe(Sentence sentence) {
168 | 		for (Token token: sentence.tokenList) {
169 | 			if ((token.predNer.startsWith("PERSON")) || (token.predNer.startsWith("ORGANIZATION"))) {
170 | 				return true;
171 | 			}
172 | 		}
173 | 		return false;
174 | 	}
175 | 
176 | 	
177 | 
178 | 
179 | }
180 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenDictFeatures.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.features.components;
19 | 
20 | import ims.cs.lingdata.Token;
21 | 
22 | import java.io.BufferedReader;
23 | import java.io.FileReader;
24 | import java.io.IOException;
25 | import java.util.HashMap;
26 | import java.util.HashSet;
27 | import java.util.Map;
28 | import java.util.Set;
29 | 
30 | /**
31 |  * Feature extractor that extracts information about a token from a dictionary (read from a tab-separated file)
32 |  */
33 | public class TokenDictFeatures {
34 | 
35 | 	private String featureName = "VERBDICT";
36 | 	private String listFileName;
37 | 	private Map<String, Set<String>> wordMap;
38 | 	public String posStart = null;
39 | 
40 | 
41 | 	/**
42 | 	 * Set up the feature extractor
43 | 	 * @param listFileName name of the dictionary file (tab-separated)
44 | 	 * @param featureName name of the feature that will be extracted
45 | 	 * @throws IOException
46 | 	 */
47 | 	public TokenDictFeatures(String listFileName, String featureName) throws IOException {
48 | 		this.listFileName = listFileName;
49 | 		this.featureName = featureName;
50 | 		loadDictionary();
51 | 	}
52 | 
53 | 	/**
54 | 	 * Extract dictionary information for the token t
55 | 	 * @param t
56 | 	 */
57 | 	public void extract(Token t) {
58 | 		// check if the token's lemma is in the dictionary
59 | 		if (wordMap.containsKey(t.predLemma)) {
60 | 			// check for POS restriction if necessary
61 | 			if (posStart == null || t.predPosTag.startsWith(posStart)) {
62 | 				for (String vclass : wordMap.get(t.predLemma))
63 | 					t.boundaryFeatureSet.add(featureName + "=" + vclass);
64 | 			}
65 | 		}
66 | 	}
67 | 
68 | 	/**
69 | 	 * Load dictionary from a tab-separated file
70 | 	 * @throws IOException
71 | 	 */
72 | 	private void loadDictionary() throws IOException {
73 | 		wordMap = new HashMap<>();
74 | 		
75 | 	    BufferedReader br = new BufferedReader(new FileReader(listFileName));
76 | 	    String line;
77 | 	    
78 | 	    while ((line = br.readLine()) != null) {
79 | 	    	line = line.trim();
80 | 			String[] tokens = line.split("\\s+");
81 | 			String word = tokens[0];
82 | 			String wordClass = tokens[1];
83 | 
84 | 			if (!wordMap.containsKey(word)) {
85 | 				wordMap.put(word, new HashSet<String>());
86 | 			}
87 | 
88 | 	    	wordMap.get(word).add(wordClass);
89 | 	    }
90 | 	    
91 | 	    br.close();
92 | 
93 | 	}
94 | 
95 | 	
96 | }
97 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenLexicalFeatures.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
 20 | import ims.cs.lingdata.Token;
 21 | import ims.cs.qsample.features.FeatureSet;
 22 | import ims.cs.util.StaticConfig;
 23 | 
 24 | /**
 25 |  * Extracts lexical information about a token (e.g., word, lemma, POS)
 26 |  */
 27 | public abstract class TokenLexicalFeatures {
 28 | 
 29 | 	private static final String TOK_PREFIX = "TOK";
 30 | 	private static final String LEMMA_PREFIX = "LEMMA";
 31 | 	private static final String POS_PREFIX = "POS";
 32 | 	private static final String BG_PREFIX = "BG";
 33 | 	private static final String NE_PREFIX = "NE";
 34 | 	private static final String PARBEGIN_PREFIX = "PAR-BEGINS";
 35 | 	private static final String PAREND_PREFIX = "PAR-ENDS";
 36 | 
 37 | 
 38 | 	/**
 39 | 	 * Extract lexical features about a single token t
 40 | 	 * @param t
 41 | 	 */
 42 | 	public static void extract(Token t) {
 43 | 
 44 | 		if (StaticConfig.lexicalPos ||
 45 | 				StaticConfig.lexicalLemma ||
 46 | 				StaticConfig.lexicalToken)
 47 | 			addWindowFeatures(t);
 48 | 
 49 | 		if (StaticConfig.lexicalBigram) addBigramFeature(t);
 50 | 		addNeFeature(t);
 51 | 		addDocStructureFeature(t);
 52 | 	}
 53 | 
 54 | 	/**
 55 | 	 * Adds paragraph begin and end features
 56 | 	 * @param token
 57 | 	 */
 58 | 	private static void addDocStructureFeature(Token token) {
 59 | 		if (token.paragraphBegins) token.boundaryFeatureSet.add(PARBEGIN_PREFIX);
 60 | 		if (token.nextToken == null || token.nextToken.paragraphBegins) token.boundaryFeatureSet.add(PAREND_PREFIX);
 61 | 	}
 62 | 
 63 | 	/**
 64 | 	 * Adds features about whether the token is part of a named entity
 65 | 	 * @param token
 66 | 	 */
 67 | 	private static void addNeFeature(Token token) {
 68 | 		if (!token.predNer.equals("?") && !token.predNer.equals("O")) {
 69 | 			token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE");
 70 | 			token.boundaryFeatureSet.add(NE_PREFIX+"-IS-NE-" + token.predNer);
 71 | 		}
 72 | 	}
 73 | 
 74 | 	/**
 75 | 	 * Adds bigram features with the previous and next token
 76 | 	 * @param token
 77 | 	 */
 78 | 	private static void addBigramFeature(Token token) {
 79 | 		String prevWordForm;
 80 | 		String prevLemma;
 81 | 
 82 | 		String nextWordForm;
 83 | 		String nextLemma;
 84 | 
 85 | 		// find previous token
 86 | 		if (token.previousToken == null) {
 87 | 			prevWordForm = "null";
 88 | 			prevLemma = "null";
 89 | 		} else {
 90 | 			Token prevToken = token.previousToken;
 91 | 			prevWordForm = prevToken.predText;
 92 | 			prevLemma = prevToken.predLemma;
 93 | 		}
 94 | 
 95 | 		// find next token
 96 | 		if (token.nextToken == null) {
 97 | 			nextWordForm = "null";
 98 | 			nextLemma = "null";
 99 | 		} else {
100 | 			Token nextToken = token.nextToken;
101 | 			nextWordForm = nextToken.predText;
102 | 			nextLemma = nextToken.predLemma;
103 | 		}
104 | 
105 | 		// add features of word and lemma bigrams
106 | 		FeatureSet fs = token.boundaryFeatureSet;
107 | 
108 | 		fs.add(BG_PREFIX + prevWordForm + "<--" + token.predText);
109 | 		fs.add(BG_PREFIX + "(LEMMA)" + prevLemma + "<--" + token.predLemma);
110 | 
111 | 		fs.add(BG_PREFIX + nextWordForm + "-->" + token.predText);
112 | 		fs.add(BG_PREFIX + "(LEMMA)" + nextLemma + "-->" + token.predLemma);
113 | 	}
114 | 
115 | 
116 | 	/**
117 | 	 * Adds features from other tokens within a window
118 | 	 * @param pToken
119 | 	 */
120 | 	private static void addWindowFeatures(Token pToken) {
121 | 		// current POS tag
122 | 		FeatureSet fs = pToken.boundaryFeatureSet;
123 | 		
124 | 		if (StaticConfig.lexicalPos)   fs.add(POS_PREFIX + "-0=" + pToken.predPosTag);
125 | 		if (StaticConfig.lexicalToken) fs.add(TOK_PREFIX + "-0=" + pToken.predText);
126 | 		if (StaticConfig.lexicalLemma) fs.add(LEMMA_PREFIX + "-0=" + pToken.predLemma);
127 | 
128 | 		
129 | 		// previous tokens
130 | 		Token currentToken = pToken;
131 | 		for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) {
132 | 			String leftPos;
133 | 			String leftTok;
134 | 			String leftLemma;
135 | 
136 | 			Token prevToken = currentToken.previousToken;
137 | 			if (prevToken != null) {
138 | 				leftPos = prevToken.predPosTag;
139 | 				leftTok = prevToken.predText;
140 | 				leftLemma = prevToken.predLemma;
141 | 				currentToken = prevToken;
142 | 			} else {
143 | 				leftPos = "NONE";
144 | 				leftLemma = "NONE";
145 | 				leftTok = "NONE";
146 | 			}
147 | 
148 | 			if (StaticConfig.lexicalPos)	fs.add("WIN_" + POS_PREFIX + "-" + i + "=" + leftPos);
149 | 			if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "-" + i + "=" + leftTok);
150 | 			if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "-" + i + "=" + leftLemma);
151 | 		}
152 | 
153 | 		// subsequent tokens
154 | 		currentToken = pToken;
155 | 		for (int i = 1; i <= StaticConfig.lexicalWindowSize; i++) {
156 | 			String rightPos;
157 | 			String rightTok;
158 | 			String rightLemma;
159 | 
160 | 			Token nextToken = currentToken.nextToken;
161 | 			if (nextToken != null) {
162 | 				rightPos = nextToken.predPosTag;
163 | 				rightTok = nextToken.predText;
164 | 				rightLemma = nextToken.predLemma;
165 | 				currentToken = nextToken;
166 | 
167 | 			} else {
168 | 				rightPos = "NONE";
169 | 				rightLemma = "NONE";
170 | 				rightTok = "NONE";
171 | 				
172 | 			}
173 | 
174 | 			if (StaticConfig.lexicalPos)   fs.add("WIN_" + POS_PREFIX + "+" + i + "=" + rightPos);
175 | 			if (StaticConfig.lexicalToken) fs.add("WIN_" + TOK_PREFIX + "+" + i + "=" + rightTok);
176 | 			if (StaticConfig.lexicalLemma) fs.add("WIN_" + LEMMA_PREFIX + "+" + i + "=" + rightLemma);
177 | 
178 | 		}
179 | 	}
180 | 
181 | 
182 | 	
183 | }
184 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/features/components/TokenListFeatures.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.features.components;
 19 | 
 20 | import java.io.BufferedReader;
 21 | import java.io.FileReader;
 22 | import java.io.IOException;
 23 | import java.util.HashSet;
 24 | import java.util.Set;
 25 | 
 26 | import ims.cs.lingdata.Token;
 27 | 
 28 | 
 29 | /**
 * Feature extractor that checks whether a token is in a list (specified in a file)
 31 |  */
 32 | public class TokenListFeatures {
 33 | 
 34 | 	private String featureName = "VERBLIST";
 35 | 	private String listFileName;
 36 | 	private Set<String> wordSet;
 37 | 	private int window = 5;
 38 | 	public String posStart = null;
 39 | 
 40 | 
 41 | 	/**
 42 | 	 * Set up the feature extractor
 43 | 	 * @param listFileName list of words (one word per line)
 44 | 	 * @param featureName
 45 | 	 * @throws IOException
 46 | 	 */
 47 | 	public TokenListFeatures(String listFileName, String featureName) throws IOException {
 48 | 		this.listFileName = listFileName;
 49 | 		this.featureName = featureName;
 50 | 		loadWordList();
 51 | 	}
 52 | 
 53 | 	/**
 54 | 	 * Extract list feature for the token t
 55 | 	 * @param t
 56 | 	 */
 57 | 	public void extract(Token t) {
 58 | 		// current token
 59 | 		if ((posStart == null || t.predPosTag.startsWith(posStart)) && wordSet.contains(t.predLemma)) {
 60 | 				t.boundaryFeatureSet.add(featureName);
 61 | 		}
 62 | 
 63 | 		// window before the token
 64 | 		Token prevToken = t;
 65 | 		for (int i = 0; i < window; i++) {
 66 | 			prevToken = prevToken.previousToken;
 67 | 			if (prevToken == null) break;
 68 | 			if (wordSet.contains(prevToken.predLemma)) {
 69 | 				t.boundaryFeatureSet.add("WIN_-" + (i+1) + "-" + featureName);
 70 | 			}
 71 | 		}
 72 | 
 73 | 		// window after the token
 74 | 		Token nextToken = t;
 75 | 		for (int i = 0; i < window; i++) {
 76 | 			nextToken = nextToken.nextToken;
 77 | 			if (nextToken == null) break;
 78 | 			if (wordSet.contains(nextToken.predLemma)) {
 79 | 				t.boundaryFeatureSet.add("WIN_+" + (i+1) + "-" + featureName);
 80 | 			}
 81 | 		}
 82 | 	}
 83 | 
 84 | 	/**
 85 | 	 * Loads the word list (one word per line)
 86 | 	 * @throws IOException
 87 | 	 */
 88 | 	private void loadWordList() throws IOException {
 89 | 		wordSet = new HashSet<>();
 90 | 		
 91 | 	    BufferedReader br = new BufferedReader(new FileReader(listFileName));
 92 | 	    String line;
 93 | 
 94 | 	    while ((line = br.readLine()) != null) {
 95 | 	    	line = line.trim();
 96 | 	    	wordSet.add(line);
 97 | 	    }
 98 | 	    
 99 | 	    br.close();
100 | 
101 | 	}
102 | 
103 | 	
104 | }
105 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/greedysample/HasScore.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.greedysample;
19 | 
/**
 * Contract for any object that can report a numeric score.
 * Created by scheibcn on 11/5/15.
 */
public interface HasScore {

    /**
     * @return the score associated with this object
     */
    double getScore();

}
27 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/greedysample/Sampling.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | package ims.cs.qsample.greedysample;
18 | 
19 | import java.util.List;
20 | import java.util.Random;
21 | 
22 | /**
23 |  * Sample an element from a list of elements with a score.
24 |  * Created by scheibcn on 11/5/15.
25 |  */
26 | public class Sampling {
27 | 
28 | 
29 |     Random random;
30 |     public boolean doExp = true;
31 | 
32 |     public Sampling(Random random) {
33 |         this.random = random;
34 |     }
35 | 
36 | 
37 |     /**
38 |      * Sample an element proportionally to sigmoid-transformed scores
39 |      * @param items
40 |      */
41 |     public int sampleOne(List<HasScore> items, double temperature, double bias) {
42 |         double[] values = new double[items.size()];
43 |         double sum = 0;
44 | 
45 |         // first compute scores and normalize
46 |         for (int i = 0; i < values.length; i++) {
47 |             double score = items.get(i).getScore();
48 |             values[i] = (score + bias) / temperature;
49 | 
50 |             if (doExp) {
51 |                 values[i] = 1/(1+Math.exp(-values[i]));
52 |             }
53 |             sum += values[i];
54 |         }
55 | 
56 |         // then sample proportionally
57 |         double sumNorm = 0;
58 |         double r = random.nextDouble();
59 |         int resultPosition = 0;
60 | 
61 |         for (int i = 0; i < values.length; i++) {
62 |             values[i] /= sum;
63 |             sumNorm += values[i];
64 |             if (sumNorm > r) {
65 |                 resultPosition = i;
66 |                 break;
67 |             }
68 |         }
69 | 
70 |         return resultPosition;
71 |     }
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/models/HigherSpanModel.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.models;
 19 | 
 20 | import ims.cs.qsample.features.FeatureSet;
 21 | import ims.cs.qsample.perceptron.Perceptron;
 22 | import ims.cs.qsample.spans.Span;
 23 | 
 24 | import java.io.FileNotFoundException;
 25 | import java.io.FileOutputStream;
 26 | import java.io.PrintStream;
 27 | import java.io.Serializable;
 28 | 
 29 | /**
 30 |  * A model for scoring a whole span (rather than just begin and end information)
 31 |  * Created by scheibcn on 3/5/16.
 32 |  */
 33 | public class HigherSpanModel implements Serializable {
 34 | 
 35 |     private static final long serialVersionUID = 3509778136938744648L;
 36 | 
 37 |     // We actually make separate models for begin, end, and span-level information.
 38 |     // This makes feature management easier, among other things.
 39 |     Perceptron beginPerceptron;
 40 |     Perceptron endPerceptron;
 41 |     Perceptron higherOrderPerceptron;
 42 | 
 43 |     public HigherSpanModel() {
 44 |         this.beginPerceptron = new Perceptron();
 45 |         this.endPerceptron = new Perceptron();
 46 |         this.higherOrderPerceptron = new Perceptron();
 47 |     }
 48 | 
 49 |     /**
 50 |      * Computes the current score of a span according to the model
 51 |      * @param span
 52 |      * @param average use averaged perceptron?
 53 |      * @return
 54 |      */
 55 |     public double score(Span span, boolean average) {
 56 |         // we handle the begin, end, and span features separately
 57 |         FeatureSet beginFeatures = span.first().boundaryFeatureSet;
 58 |         FeatureSet endFeatures = span.last().boundaryFeatureSet;
 59 |         FeatureSet spanFeatures = span.featureSet;
 60 | 
 61 |         // ... then, we can compute three individual scores
 62 |         double score = 0;
 63 |         score += beginPerceptron.score(beginFeatures, average);
 64 |         score += endPerceptron.score(endFeatures, average);
 65 |         score += higherOrderPerceptron.score(spanFeatures, average);
 66 | 
 67 |         return score;
 68 |     }
 69 | 
 70 |     /**
 71 |      * Train the model using a given span, updating with a specified learning rate
 72 |      * @param span
 73 |      * @param isPositive Has the example been correctly classified?
 74 |      * @param rate learning rate
 75 |      */
 76 |     public void train(Span span, boolean isPositive, double rate) {
 77 |         FeatureSet leftFeatures = span.first().boundaryFeatureSet;
 78 |         FeatureSet rightFeatures = span.last().boundaryFeatureSet;
 79 |         FeatureSet spanFeatures = span.featureSet;
 80 | 
 81 |         // negate the learning rate if the example was wrong
 82 |         double effectiveRate = rate;
 83 |         if (!isPositive) effectiveRate = -effectiveRate;
 84 | 
 85 |         // update the three models separately
 86 |         //   (use the update function directly as the train function would first check the score, which is nonsensical
 87 |         //   for the individual models)
 88 |         beginPerceptron.update(leftFeatures, effectiveRate);
 89 |         endPerceptron.update(rightFeatures, effectiveRate);
 90 |         higherOrderPerceptron.update(spanFeatures, effectiveRate);
 91 |     }
 92 | 
 93 | 
 94 |     /**
 95 |      * Writes the current feature weights to a file
 96 |      * @param fileName
 97 |      * @throws FileNotFoundException
 98 |      */
 99 |     public void printWeights(String fileName) throws FileNotFoundException {
100 |         FileOutputStream fos = new FileOutputStream(fileName);
101 |         PrintStream ps = new PrintStream(fos);
102 |         beginPerceptron.printWeights(ps, "BEGIN");
103 |         endPerceptron.printWeights(ps, "END");
104 |         higherOrderPerceptron.printWeights(ps, "HIGHER");
105 |         ps.close();
106 |     }
107 | 
108 | }
109 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/perceptron/Perceptron.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.perceptron;
 19 | 
 20 | import ims.cs.qsample.features.FeatureSet;
 21 | import ims.cs.qsample.spans.Span;
 22 | 
 23 | import java.io.FileNotFoundException;
 24 | import java.io.FileOutputStream;
 25 | import java.io.PrintStream;
 26 | import java.io.Serializable;
 27 | import java.util.Map;
 28 | 
 29 | /**
 30 |  * Implementation of perceptron model
 31 |  * Created by scheibcn on 11/5/15.
 32 |  */
 33 | public class Perceptron  implements Serializable {
 34 |     private static final long serialVersionUID = 3436601656314837271L;
 35 | 
 36 |     // this model can actually also do logistic regression
 37 |     public enum UpdateType {PERCEPTRON, LR};
 38 | 
 39 |     // .. default is perceptron
 40 |     public UpdateType updateType = UpdateType.PERCEPTRON;
 41 | 
 42 |     public Weights weights = new Weights();
 43 | 
 44 |     // parameters
 45 |     public double fixedBias = 0;       /* optional bias that can be manually adjusted */
 46 |     public double marginPositive = 1;  /* margin for positive class */
 47 |     public double marginNegative = 1;  /* margin for negative class */
 48 | 
 49 |     // some debugging data
 50 |     public int numUpdates = 0;
 51 | 
 52 | 
 53 |     public Perceptron() {
 54 |         weights.weightMap.put("BIAS", 0.0);
 55 |     }
 56 | 
 57 | 
 58 |     /**
 59 |      * Score a feature set
 60 |      * @param featureSet
 61 |      * @return
 62 |      */
 63 |     public double score(FeatureSet featureSet, boolean average) {
 64 |         double score = 0;
 65 | 
 66 |         // first, add bias
 67 |         if (average) {
 68 |             score += weights.getAvg("BIAS");
 69 |             score += fixedBias;
 70 |         } else {
 71 |             score += weights.get("BIAS");
 72 |         }
 73 | 
 74 |         // then, score all features in the data
 75 |         for (String feature: featureSet) {
 76 |             if (average) {
 77 |                 score += weights.getAvg(feature);
 78 |             } else {
 79 |                 score += weights.get(feature);
 80 |             }
 81 |         }
 82 | 
 83 |         return score;
 84 |     }
 85 | 
 86 |     /**
 87 |      * Perform an update with a given training example
 88 |      * @param featureSet
 89 |      * @param isPositive is this example a positive one?
 90 |      * @param rate
 91 |      */
 92 |     public void train(FeatureSet featureSet, boolean isPositive, double rate) {
 93 |         if (updateType == UpdateType.PERCEPTRON)
 94 |             trainPerceptron(featureSet, isPositive, rate);   /* perceptron update */
 95 |         else if (updateType == UpdateType.LR)
 96 |             trainLr(featureSet, isPositive, rate);           /* logistic regression update */
 97 |     }
 98 | 
 99 |     /**
100 |      * Perform a perceptron-style update
101 |      * @param featureSet
102 |      * @param isPositive
103 |      * @param rate
104 |      */
105 |     public void trainPerceptron(FeatureSet featureSet, boolean isPositive, double rate) {
106 |         double predScore = score(featureSet, false);
107 | 
108 |         if (isPositive && predScore - marginPositive <= 0) {  /* positive example and negative margin violation */
109 |             update(featureSet, rate);
110 |         } else if (!isPositive && predScore + marginNegative > 0) { /* negative example and positive margin violation */
111 |             update(featureSet, -rate);
112 |         }
113 |     }
114 | 
115 |     /**
116 |      * Perform a logistic regression update
117 |      * @param featureSet
118 |      * @param isPositive
119 |      * @param rate
120 |      */
121 |     public void trainLr(FeatureSet featureSet, boolean isPositive, double rate) {
122 |         double predScore = score(featureSet, false);
123 | 
124 |         // true probability of the example?
125 |         int trueProb;
126 |         if (isPositive) trueProb = 1;
127 |         else trueProb = 0;
128 | 
129 |         // learning rate times LR gradient
130 |         double step = rate * (trueProb - sigmoid(predScore));
131 | 
132 |         update(featureSet, step);
133 |     }
134 | 
135 | 
136 |     /**
137 |      * Update the weights for each feature by the given rate
138 |      * @param featureSet
139 |      * @param rate
140 |      */
141 |     public void update(FeatureSet featureSet, double rate) {
142 |         // bias
143 |         weights.update("BIAS", rate);
144 | 
145 |         // features
146 |         for (String feature : featureSet) {
147 |             weights.update(feature, rate);
148 |         }
149 | 
150 |         numUpdates++;
151 |     }
152 | 
153 |     /**
154 |      * Print the weights for the features of the span to debug
155 |      * @param span
156 |      * @param prefix
157 |      */
158 |     public void printInfo(Span span, String prefix) {
159 |         for (String feature: span.featureSet) {
160 |             double weight = weights.get(feature);
161 |             System.out.println(prefix + feature + "   " + weight);
162 |         }
163 |     }
164 | 
165 |     /**
166 |      * Write the current feature weights to a file
167 |      * @param fileName
168 |      * @throws FileNotFoundException
169 |      */
170 |     public void printWeights(String fileName) throws FileNotFoundException {
171 |         FileOutputStream fos = new FileOutputStream(fileName);
172 |         PrintStream ps = new PrintStream(fos);
173 |         printWeights(ps, "");
174 |     }
175 | 
176 |     /**
177 |      * Print the current feature weights to stdout
178 |      */
179 |     public void printWeights() {
180 |         printWeights(System.out, "");
181 |     }
182 | 
183 |     /**
184 |      * Write the current feature weights to a stream, prepend each line with the specified prefix
185 |      * @param out
186 |      * @param prefix
187 |      */
188 |     public void printWeights(PrintStream out, String prefix) {
189 |         for (Map.Entry entry : weights.weightMap.entrySet()) {
190 |             out.println(prefix + "-->" + entry.getKey() + "\t" + entry.getValue());
191 |         }
192 |     }
193 | 
194 | 
195 |     /**
196 |      * Calculate the sigmoid of x
197 |      * @param x
198 |      * @return
199 |      */
200 |     public static double sigmoid(double x) {
201 |         return 1/(1+Math.exp(-x));
202 |     }
203 | 
204 | }
205 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/perceptron/Weights.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.perceptron;
 19 | 
 20 | import java.io.Serializable;
 21 | import java.util.HashMap;
 22 | import java.util.Map;
 23 | 
 24 | /**
 25 |  * Store a set of weights associated to strings
 26 |  * Created by scheibcn on 11/5/15.
 27 |  */
 28 | public class Weights implements Serializable {
 29 |     private static final long serialVersionUID = 2945274488514737545L;
 30 | 
 31 |     // map for holding weights
 32 |     Map<String, Double> weightMap;
 33 | 
 34 |     // map for storing the weight history for averaging
 35 |     // for a clean description of the algorithm,
 36 |     // see for example Chapter 3 in Hal Daume's "A Course in Machine Learning"
 37 |     Map<String, Double> weightCacheMap;
 38 | 
 39 |     public boolean doAveraging = true;
 40 |     int averagingCoefficient = 0;
 41 | 
 42 |     public Weights() {
 43 |         // allocate some large maps
 44 |         weightMap = new HashMap<String, Double>(100000);
 45 |         weightCacheMap = new HashMap<String, Double>(100000);
 46 |     }
 47 | 
 48 |     /**
 49 |      * Resets all weights to 0
 50 |      */
 51 |     public void resetWeights() {
 52 |         averagingCoefficient = 0;
 53 |         weightMap.clear();
 54 |         weightCacheMap.clear();
 55 |     }
 56 | 
 57 |     /**
 58 |      * Get the most recent weight of a feature. Returns 0 if the feature is unknown.
 59 |      * @param feature
 60 |      * @return
 61 |      */
 62 |     public double get(String feature) {
 63 |         if (weightMap.containsKey(feature)) {
 64 |             return weightMap.get(feature);
 65 |         } else {
 66 |             return 0;
 67 |         }
 68 |     }
 69 | 
 70 |     /**
 71 |      * Get the averaged weight of a feature. Returns 0 if the feature is unknown.
 72 |      * @param feature
 73 |      * @return
 74 |      */
 75 |     public double getAvg(String feature) {
 76 |         if (weightMap.containsKey(feature)) {
 77 |             Double cache = weightCacheMap.get(feature);
 78 |             if (cache == null) cache = 0.0;
 79 |             return weightMap.get(feature) - (cache/averagingCoefficient);
 80 |         } else {
 81 |             return 0;
 82 |         }
 83 |     }
 84 | 
 85 |     /**
 86 |      * Update the weight of a feature by value
 87 |      * @param feature
 88 |      * @param value
 89 |      */
 90 |     public void update(String feature, double value) {
 91 |         // update the weight of the feature
 92 |         if (!weightMap.containsKey(feature)) {
 93 |             weightMap.put(feature, value);
 94 |         } else {
 95 |             weightMap.put(feature, weightMap.get(feature) + value);
 96 |         }
 97 | 
 98 |         // also add to averaging map if averaging is on
 99 |         if (doAveraging) {
100 |             if (!weightCacheMap.containsKey(feature)) {
101 |                 weightCacheMap.put(feature, value * averagingCoefficient);
102 |             } else {
103 |                 weightCacheMap.put(feature, weightCacheMap.get(feature) + value * averagingCoefficient);
104 |             }
105 | 
106 |             averagingCoefficient++;
107 |         }
108 |     }
109 | }
110 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/Common.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.run;
 19 | 
 20 | import ims.cs.lingdata.Document;
 21 | import ims.cs.parc.ProcessedCorpus;
 22 | import ims.cs.qsample.features.SpanFeatures;
 23 | import ims.cs.qsample.models.QuotationPerceptrons;
 24 | import ims.cs.qsample.spans.Span;
 25 | import ims.cs.util.StaticConfig;
 26 | 
 27 | import java.io.*;
 28 | import java.util.List;
 29 | import java.util.zip.GZIPInputStream;
 30 | import java.util.zip.GZIPOutputStream;
 31 | 
 32 | /**
 33 |  * Some common functions
 34 |  * Created by scheibcn on 3/5/16.
 35 |  */
 36 | public abstract class Common {
 37 |     /**
 38 |      * Writes the predictions to a file in BIO format
 39 |      * @param trainDocs
 40 |      * @param testDocs
 41 |      * @param valDocs
 42 |      * @param resDocs
 43 |      */
 44 |     public static void writePredictionsToFile(List<Document> trainDocs, List<Document> testDocs, List<Document> valDocs, List<Document> resDocs) {
 45 |         // if in text mode, write empty line after sentence ends and write cues
 46 |         boolean writeNewLineAfterSentence = StaticConfig.cliMode == StaticConfig.CliMode.TEXT;
 47 |         boolean writeCues = StaticConfig.cliMode == StaticConfig.CliMode.TEXT;
 48 | 
 49 |         // try to write predictions
 50 |         try {
 51 |             if (trainDocs != null) ProcessedCorpus.savePredictionsToFile(trainDocs, "train-final", writeNewLineAfterSentence, writeCues);
 52 |             if (testDocs != null) ProcessedCorpus.savePredictionsToFile(testDocs, "test-final", writeNewLineAfterSentence, writeCues);
 53 |             if (valDocs != null) ProcessedCorpus.savePredictionsToFile(valDocs, "val-final", writeNewLineAfterSentence, writeCues);
 54 |             if (resDocs != null) ProcessedCorpus.savePredictionsToFile(resDocs, "res-final", writeNewLineAfterSentence, writeCues);
 55 |         } catch (IOException e) {
 56 |             e.printStackTrace();
 57 |             System.out.println("Unable to write results to file");
 58 |         }
 59 | 
 60 | 
 61 |     }
 62 | 
 63 |     /**
 64 |      * Writes out all perceptron models
 65 |      * @param perceptrons
 66 |      * @param fileName
 67 |      * @throws IOException
 68 |      */
 69 |     public static void serializeModels(QuotationPerceptrons perceptrons, String fileName) throws IOException {
 70 |         System.out.println("Writing perceptron model to " + fileName);
 71 |         ObjectOutputStream outputStream = new ObjectOutputStream (new GZIPOutputStream(new FileOutputStream(fileName)));
 72 |         outputStream.writeObject(perceptrons);
 73 |     }
 74 | 
 75 |     /**
 76 |      * Reads all perceptron models from a file
 77 |      * @param fileName
 78 |      * @return
 79 |      * @throws IOException
 80 |      * @throws ClassNotFoundException
 81 |      */
 82 |     public static QuotationPerceptrons deserializeModels(String fileName) throws IOException, ClassNotFoundException {
 83 |         System.out.println("Loading perceptron model from " + fileName);
 84 |         ObjectInputStream inputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(fileName)));
 85 |         return (QuotationPerceptrons) inputStream.readObject();
 86 |     }
 87 | 
 88 |     /**
 89 |      * Adds features to gold spans
 90 |      * @param documents
 91 |      */
 92 |     public static void addFeaturesToGoldSpans(List<Document> documents) {
 93 |         for (Document document : documents) {
 94 |             for (Span goldSpan : document.goldSpanSet) {
 95 |                 SpanFeatures.addAllSpanFeatures(goldSpan);
 96 |             }
 97 |         }
 98 |     }
 99 | 
100 |     public static String pathConcat (String path, String subDir) {
101 |         return new File(new File(path), subDir).toString();
102 |     }
103 | 
104 | 
105 | }
106 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/PlainTextCorpusReader.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.run;
 19 | 
 20 | import ims.cs.lingdata.*;
 21 | import ims.cs.parc.ProcessedCorpus;
 22 | import ims.cs.util.StaticConfig;
 23 | import org.xml.sax.SAXException;
 24 | 
 25 | import javax.xml.parsers.ParserConfigurationException;
 26 | import java.io.*;
 27 | import java.util.ArrayList;
 28 | import java.util.Arrays;
 29 | import java.util.List;
 30 | 
 31 | /**
 32 |  * Created by scheibcn on 6/1/16.
 33 |  */
 34 | public class PlainTextCorpusReader {
 35 | 
 36 |     /**
 37 |      * Read document, one sentence per line
 38 |      * @param file
 39 |      * @return
 40 |      */
 41 |     public static Document readDocument(File file) throws IOException {
 42 |         StringBuilder sb = new StringBuilder();
 43 |         BufferedReader reader = new BufferedReader(new FileReader(file));
 44 | 
 45 |         // read all text from file
 46 |         String line;
 47 |         while ((line = reader.readLine()) != null) {
 48 |             sb.append(line);
 49 |             sb.append('\n');
 50 |         }
 51 | 
 52 |         // build a document with some bogus structure
 53 |         String text = sb.toString();
 54 | 
 55 |         Document d = new Document();
 56 |         Sentence s = new Sentence();
 57 |         Token t = new Token();
 58 | 
 59 |         // add text and set byte count
 60 |         t.goldText = text;
 61 |         t.goldByteCount = new ByteCount(0, t.goldText.length());
 62 | 
 63 |         // bookkeeping
 64 |         s.tokenList = new ArrayList<>();
 65 |         s.tokenList.add(t);
 66 | 
 67 |         d.sentenceList = new ArrayList<>();
 68 |         d.sentenceList.add(s);
 69 | 
 70 |         d.tokenList = new ArrayList<>();
 71 |         d.tokenList.add(t);
 72 |         d.text = text;
 73 | 
 74 |         // build a document id from the file and directory names
 75 |         d.docId = new PlainTextDocId(file.getParentFile().getName(), file.getName());
 76 | 
 77 |         reader.close();
 78 | 
 79 |         return d;
 80 |     }
 81 | 
 82 |     public static ProcessedCorpus readDocuments(String directory) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
 83 |         List<Document> documentList = new ArrayList<>();
 84 | 
 85 |         // import all files in the directory
 86 |         File dir = new File(directory);
 87 |         File[] files = dir.listFiles();
 88 |         Arrays.sort(files);
 89 | 
 90 |         for (File file : files) {
 91 |             if (StaticConfig.verbose) System.out.println(file);
 92 |             Document document = readDocument(file);
 93 |             documentList.add(document);
 94 |         }
 95 | 
 96 |         PlainTextCorpus corpus = new PlainTextCorpus(documentList);
 97 | 
 98 |         return new ProcessedCorpus(corpus);
 99 |     }
100 | 
101 | 
102 |     public static void pipeline() {
103 | 
104 |     }
105 | 
106 |     public static Document dummyDocument () {
107 |         Document d = new Document();
108 |         Sentence s = new Sentence();
109 |         Token t = new Token();
110 | 
111 |         t.goldText = "\"I am very disappointed,\" said Dr. Miller.\n Futher, he reported that everything was fine.";
112 |         t.goldByteCount = new ByteCount(0, t.goldText.length());
113 | 
114 |         s.tokenList = new ArrayList<>();
115 |         s.tokenList.add(t);
116 | 
117 |         d.sentenceList = new ArrayList<>();
118 |         d.sentenceList.add(s);
119 | 
120 |         d.tokenList = new ArrayList<>();
121 |         d.tokenList.add(t);
122 |         d.text = t.goldText;
123 | 
124 |         d.docId = new PlainTextDocId("dummyTestDirectory1", "dummyTestFile1");
125 | 
126 |         return d;
127 |     }
128 | }
129 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/RunCrf.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.run;
 19 | 
 20 | import ims.cs.lingdata.Document;
 21 | import ims.cs.parc.PARCCorpus;
 22 | import ims.cs.parc.ProcessedCorpus;
 23 | import ims.cs.qsample.evaluate.EvaluateSpan;
 24 | import ims.cs.qsample.models.CrfClassifier;
 25 | import ims.cs.qsample.models.QuotationPerceptrons;
 26 | import ims.cs.qsample.perceptron.PerceptronTrainer;
 27 | import ims.cs.util.MultiOutputStream;
 28 | import ims.cs.util.NewStaticPrinter;
 29 | import ims.cs.util.StaticConfig;
 30 | import org.xml.sax.SAXException;
 31 | 
 32 | import javax.xml.parsers.ParserConfigurationException;
 33 | import java.io.IOException;
 34 | import java.util.List;
 35 | 
 36 | /**
 37 |  * Run an experiment with a CRF model
 38 |  * Created by scheibcn on 3/3/16.
 39 |  */
 40 | public class RunCrf {
 41 | 
 42 | 
 43 |     /**
 44 |      * Run the full CRF training and testing pipeline
 45 |      * @param trainDocs training documents
 46 |      * @param testDocs test documents (may be null)
 47 |      * @param valDocs validation documents (may be null)
 48 |      * @param resDocs resubstitution documents (may be null)
 49 |      * @param beginMargin positive margin for begin perceptron
 50 |      * @param endMargin positive margin for end perceptron
 51 |      * @param cueMargin positive margin for cue perceptron
 52 |      * @param numIter number of epochs for training
 53 |      * @param perceptrons optionally: specify some pre-trained perceptrons
 54 |      * @param crfClassifier optionally: specify a pre-trained CRF
 55 |      * @return final CRF model
 56 |      * @throws IOException
 57 |      * @throws ClassNotFoundException
 58 |      */
 59 |     public static CrfClassifier runCrfPipeline(List<Document> trainDocs, List<Document> testDocs, List<Document> valDocs, List<Document> resDocs,
 60 |                                             double beginMargin, double endMargin, double cueMargin,
 61 |                                             int numIter, QuotationPerceptrons perceptrons, CrfClassifier crfClassifier) throws IOException, ClassNotFoundException {
 62 | 
 63 |         // train a cue model if necessary, then predict
 64 |         if (perceptrons == null) {
 65 |             PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, true, 10, 10);
 66 |         } else {
 67 |             perceptrons.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs);
 68 |             perceptrons.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs);
 69 |         }
 70 | 
 71 |         // train CRF
 72 |         if (crfClassifier == null) {
 73 |             crfClassifier = new CrfClassifier();
 74 |             crfClassifier.numIter = numIter;
 75 |             crfClassifier.train(trainDocs, testDocs, valDocs, resDocs);
 76 |         }
 77 | 
 78 |         // apply CRF
 79 |         System.out.println("Applying CRF to test data");
 80 |         crfClassifier.test(trainDocs, testDocs, valDocs, resDocs);
 81 | 
 82 |         // evaluate
 83 |         EvaluateSpan.evaluateAndPrint("", "|", trainDocs, testDocs, valDocs, resDocs);
 84 | 
 85 |         // save predictions
 86 |         Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs);
 87 | 
 88 |         // output feature weights
 89 |         // this takes a lot of time, so it's deactivated right now
 90 |         if (false) crfClassifier.print();
 91 | 
 92 |         return crfClassifier;
 93 |     }
 94 | 
 95 |     /**
 96 |      * This runs the full experimental pipeline w/ training and testing
 97 |      * @return
 98 |      * @throws ClassNotFoundException
 99 |      * @throws SAXException
100 |      * @throws ParserConfigurationException
101 |      * @throws IOException
102 |      */
103 |     public static CrfClassifier fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
104 |         ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance());
105 |         List<Document> trainDocs = pc.getTrain();
106 |         List<Document> testDocs = pc.getTest();
107 |         List<Document> valDocs = pc.getDev();
108 |         List<Document> resDocs = pc.getTrainSample(10);
109 | 
110 |         return runCrfPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, 500, null, null);
111 |     }
112 | 
113 |     /**
114 |      * Running this program will train the CRF model as described in the paper
115 |      * @param args
116 |      * @throws ClassNotFoundException
117 |      * @throws SAXException
118 |      * @throws ParserConfigurationException
119 |      * @throws IOException
120 |      */
121 |     public static void main(String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
122 |         String logFileName = NewStaticPrinter.getLogFileName(Common.pathConcat(StaticConfig.outputDirectory, "crf-"));
123 |         NewStaticPrinter.init(logFileName);
124 |         MultiOutputStream.init(logFileName);
125 | 
126 |         CrfClassifier crf = fullExperiment();
127 |         crf.saveCrf(logFileName + ".crfmodel");
128 |     }
129 | }
130 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/run/RunHeuristicTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.qsample.run;
 19 | 
 20 | import ims.cs.lingdata.Document;
 21 | import ims.cs.parc.PARCCorpus;
 22 | import ims.cs.parc.ProcessedCorpus;
 23 | import ims.cs.qsample.evaluate.EvaluateSpan;
 24 | import ims.cs.qsample.greedysample.HeuristicSampler;
 25 | import ims.cs.qsample.models.QuotationPerceptrons;
 26 | import ims.cs.qsample.perceptron.PerceptronTrainer;
 27 | import ims.cs.util.MultiOutputStream;
 28 | import ims.cs.util.NewStaticPrinter;
 29 | import ims.cs.util.StaticConfig;
 30 | import org.xml.sax.SAXException;
 31 | 
 32 | import javax.xml.parsers.ParserConfigurationException;
 33 | import java.io.IOException;
 34 | import java.util.List;
 35 | 
 36 | /**
 37 |  * Run an experiment with the greedy heuristic model
 38 |  * Created by scheibcn on 11/5/15.
 39 |  */
 40 | public class RunHeuristicTest {
 41 | 
 42 |     // whether to shuffle tokens during prediction
 43 |     static boolean doShuffleTokens = false;
 44 |     static boolean incrementalPrediction = false;
 45 | 
 46 |     /**
 47 |      * Run the full greedy heuristic training and testing pipeline
 48 |      * @param trainDocs training documents
 49 |      * @param testDocs test documents (may be null)
 50 |      * @param valDocs validation documents (may be null)
 51 |      * @param resDocs resubstitution documents (may be null)
 52 |      * @param beginMargin positive margin for begin perceptron
 53 |      * @param endMargin positive margin for end perceptron
 54 |      * @param cueMargin positive margin for cue perceptron
 55 |      * @param model optionally: specify some pre-trained perceptrons
 56 |      * @return final perceptron models
 57 |      */
 58 |     public static QuotationPerceptrons runHeuristicPipeline(List<Document> trainDocs, List<Document> testDocs, List<Document> valDocs, List<Document> resDocs,
 59 |                                             double beginMargin, double endMargin, double cueMargin, QuotationPerceptrons model) {
 60 | 
 61 |         // train model or predict
 62 |         if (model == null) {
 63 |             model = PerceptronTrainer.trainAllPerceptronsAndApply(trainDocs, testDocs, valDocs, resDocs, beginMargin, endMargin, cueMargin, false, 10, 10);
 64 |         } else {
 65 |             model.predictionPipelineCue(trainDocs, testDocs, valDocs, resDocs);
 66 |             model.predictionPipelineBoundary(trainDocs, testDocs, valDocs, resDocs);
 67 |         }
 68 | 
 69 | 
 70 |         // debug output
 71 |         for (Document document: testDocs) NewStaticPrinter.printPerceptronPrediction(document, "PP");
 72 |         NewStaticPrinter.printN("-", 80);
 73 | 
 74 |         // SAMPLING
 75 |         HeuristicSampler sampler = new HeuristicSampler();
 76 |         sampler.doShuffleTokens = doShuffleTokens;
 77 | 
 78 | 
 79 |         int[] maxDistances;
 80 |         int[] maxLengths;
 81 | 
 82 |         if (incrementalPrediction) {   /* version 1: incremental prediction -- performs slightly worse */
 83 |             maxDistances = new int[]{5, 10, 20, 30};
 84 |             maxLengths = new int[]{50, 50, 50, 50};
 85 |         } else {                       /* version 2: full prediction immediately */
 86 |             maxDistances = new int[]{30};
 87 |             maxLengths = new int[]{50};
 88 |         }
 89 | 
 90 |         for (int i = 0; i < maxDistances.length; i++) {
 91 |             // sample
 92 |             int maxDistance = maxDistances[i];
 93 |             int maxLength = maxLengths[i];
 94 | 
 95 |             if (trainDocs != null) sampler.sampleGreedy(trainDocs, maxDistance, maxLength);
 96 |             if (testDocs != null) sampler.sampleGreedy(testDocs, maxDistance, maxLength);
 97 |             if (valDocs != null) sampler.sampleGreedy(valDocs, maxDistance, maxLength);
 98 |             if (resDocs != null) sampler.sampleGreedy(resDocs, maxDistance, maxLength);
 99 | 
100 |             // evaluate
101 |             EvaluateSpan.evaluateAndPrint("" + maxDistance + " ", "|", trainDocs, testDocs, valDocs, resDocs);
102 |         }
103 | 
104 |         // save predictions
105 |         Common.writePredictionsToFile(trainDocs, testDocs, valDocs, resDocs);
106 | 
107 |         return model;
108 |     }
109 | 
110 |     /**
111 |      * This runs the full experimental pipeline w/ training and testing
112 |      * @throws ClassNotFoundException
113 |      * @throws SAXException
114 |      * @throws ParserConfigurationException
115 |      * @throws IOException
116 |      */
117 |     public static void fullExperiment() throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
118 |         ProcessedCorpus pc = new ProcessedCorpus(PARCCorpus.getInstance());
119 |         List<Document> trainDocs = pc.getTrain();
120 |         List<Document> testDocs = pc.getTest();
121 |         List<Document> valDocs = pc.getDev();
122 |         List<Document> resDocs = pc.getTrainSample(10);
123 | 
124 |         runHeuristicPipeline(trainDocs, testDocs, valDocs, resDocs, StaticConfig.beginMargin, StaticConfig.endMargin, StaticConfig.cueMargin, null);
125 |     }
126 | 
127 |     /**
128 |      * Run this to train a model without going through QSample.main()
129 |      * @param args
130 |      * @throws ClassNotFoundException
131 |      * @throws SAXException
132 |      * @throws ParserConfigurationException
133 |      * @throws IOException
134 |      */
135 |     public static void main (String[] args) throws ClassNotFoundException, SAXException, ParserConfigurationException, IOException {
136 |         String logFileName = NewStaticPrinter.getLogFileName("/home/users1/scheibcn/quotations/results/txt/joint-first-run/heuristic-");
137 |         MultiOutputStream.init(logFileName);
138 |         NewStaticPrinter.init(logFileName);
139 | 
140 |         fullExperiment();
141 |     }
142 | }
143 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/spans/SpanBegin.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.spans;
19 | 
20 | import ims.cs.qsample.greedysample.HasScore;
21 | 
22 | /**
23 |  * Representation of a span begin. It is useful to have this as a separate class since this makes sampling easier.
24 |  * Created by scheibcn on 11/5/15.
25 |  */
26 | public class SpanBegin implements HasScore {
27 |     // each begin has a position and a score
28 |     public int position;
29 |     public Double score = null;
30 | 
31 |     public SpanBegin(int position, double score) {
32 |         this.position = position;
33 |         this.score = score;
34 |     }
35 | 
36 |     public SpanBegin(int position) {
37 |         this.position = position;
38 |     }
39 | 
40 |     public double getScore() {
41 |         return score;
42 |     }
43 | 
44 |     @Override
45 |     public String toString() {
46 |         return "SpanBegin(pos=" + position + ",score=" + score + ")";
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/qsample/spans/SpanEnd.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of QSample.
 3 |  * QSample is free software: you can redistribute it and/or modify
 4 |  * it under the terms of the GNU General Public License as published by
 5 |  * the Free Software Foundation, either version 3 of the License, or
 6 |  * (at your option) any later version.
 7 |  *
 8 |  * QSample is distributed in the hope that it will be useful,
 9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 |  * GNU General Public License for more details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License
14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
15 |  */
16 | 
17 | 
18 | package ims.cs.qsample.spans;
19 | 
20 | import ims.cs.qsample.greedysample.HasScore;
21 | 
22 | /**
23 |  * Representation of a span end. It is useful to have this as a separate class since this makes sampling easier.
24 |  * Created by scheibcn on 11/5/15.
25 |  */
26 | public class SpanEnd implements HasScore {
27 |     // each end has a position and a score
28 |     public int position;
29 |     public Double score = null;
30 | 
31 |     public SpanEnd(int position, double score) {
32 |         this.position = position;
33 |         this.score = score;
34 |     }
35 | 
36 |     public SpanEnd(int position) {
37 |         this.position = position;
38 |     }
39 | 
40 |     public double getScore() {
41 |         return score;
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/util/MultiOutputStream.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.util;
 19 | 
 20 | import java.io.*;
 21 | import java.text.DateFormat;
 22 | import java.text.SimpleDateFormat;
 23 | import java.util.Date;
 24 | 
 25 | /**
 26 |  * An extension of the default output stream that provides functionality to write to multiple streams at once.
 27 |  * We can use this to "tee" standard out and standard error into a file, which makes for a cheap and somewhat dirty
 28 |  * logging alternative.
 29 |  *
 30 |  * Adapted from http://www.codeproject.com/Tips/315892/A-quick-and-easy-way-to-direct-Java-System-out-to
 31 |  */
 32 | public class MultiOutputStream extends OutputStream
 33 | {
 34 | 
 35 |     OutputStream[] outputStreams;
 36 | 
 37 |     public MultiOutputStream(OutputStream... outputStreams)
 38 |     {
 39 |         this.outputStreams= outputStreams;
 40 |     }
 41 | 
 42 |     @Override
 43 |     public void write(int b) throws IOException
 44 |     {
 45 |         for (OutputStream out: outputStreams)
 46 |             out.write(b);
 47 |     }
 48 | 
 49 |     @Override
 50 |     public void write(byte[] b) throws IOException
 51 |     {
 52 |         for (OutputStream out: outputStreams)
 53 |             out.write(b);
 54 |     }
 55 | 
 56 |     @Override
 57 |     public void write(byte[] b, int off, int len) throws IOException
 58 |     {
 59 |         for (OutputStream out: outputStreams)
 60 |             out.write(b, off, len);
 61 |     }
 62 | 
 63 |     @Override
 64 |     public void flush() throws IOException
 65 |     {
 66 |         for (OutputStream out: outputStreams)
 67 |             out.flush();
 68 |     }
 69 | 
 70 |     @Override
 71 |     public void close() throws IOException
 72 |     {
 73 |         for (OutputStream out: outputStreams)
 74 |             out.close();
 75 |     }
 76 | 
 77 |     /**
 78 |      * Write stdout and stderr to two separate files
 79 |      * @param fnOut
 80 |      * @param fnErr
 81 |      */
 82 |     public static void init(String fnOut, String fnErr) {
 83 |         System.out.println("Logging stdout to: " + fnOut);
 84 |         System.out.println("Logging stdout to: " + fnErr);
 85 | 
 86 |         try
 87 |         {
 88 |             FileOutputStream fout= new FileOutputStream(fnOut);
 89 |             FileOutputStream ferr= new FileOutputStream(fnErr);
 90 | 
 91 |             MultiOutputStream multiOut= new MultiOutputStream(System.out, fout);
 92 |             MultiOutputStream multiErr= new MultiOutputStream(System.err, ferr);
 93 | 
 94 |             PrintStream stdout= new PrintStream(multiOut);
 95 |             PrintStream stderr= new PrintStream(multiErr);
 96 | 
 97 |             System.setOut(stdout);
 98 |             System.setErr(stderr);
 99 |         }
100 |          catch (FileNotFoundException e) {
101 |             e.printStackTrace();
102 |         }
103 | 
104 |     }
105 | 
106 |     /**
107 |      * Write stdout and stderr into the same file
108 |      * @param fnOutAndErr
109 |      */
110 |     public static void init(String fnOutAndErr) {
111 |         System.out.println("Logging all output to: " + fnOutAndErr);
112 | 
113 |         try
114 |         {
115 |             FileOutputStream fout= new FileOutputStream(fnOutAndErr);
116 | 
117 |             MultiOutputStream multiOut= new MultiOutputStream(System.out, fout);
118 | 
119 |             PrintStream stdout= new PrintStream(multiOut);
120 | 
121 |             System.setOut(stdout);
122 |             System.setErr(stdout);
123 |         }
124 |         catch (FileNotFoundException e) {
125 |             e.printStackTrace();
126 |         }
127 | 
128 |     }
129 | 
130 | }
131 | 


--------------------------------------------------------------------------------
/src/main/java/ims/cs/util/NewStaticPrinter.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is part of QSample.
  3 |  * QSample is free software: you can redistribute it and/or modify
  4 |  * it under the terms of the GNU General Public License as published by
  5 |  * the Free Software Foundation, either version 3 of the License, or
  6 |  * (at your option) any later version.
  7 |  *
  8 |  * QSample is distributed in the hope that it will be useful,
  9 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  * GNU General Public License for more details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License
 14 |  * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 15 |  */
 16 | 
 17 | 
 18 | package ims.cs.util;
 19 | 
 20 | import ims.cs.lingdata.Document;
 21 | import ims.cs.lingdata.Token;
 22 | import ims.cs.qsample.spans.Span;
 23 | 
 24 | import java.io.FileNotFoundException;
 25 | import java.io.PrintWriter;
 26 | import java.text.DateFormat;
 27 | import java.text.SimpleDateFormat;
 28 | import java.util.Date;
 29 | 
 30 | /**
 31 |  * A static printer to easily log output. Used mostly for debugging purposes.
 32 |  * Created by scheibcn on 3/3/16.
 33 |  */
 34 | public class NewStaticPrinter {
 35 |     // printer may be turned off
 36 |     public static boolean isOn = true;
 37 | 
 38 |     public static String fileRoot;
 39 |     public static String fileName;
 40 |     static PrintWriter writer;
 41 | 
 42 |     /**
 43 |      * Pass function to do nothing.
 44 |      */
 45 |     public static void pass() {}
 46 | 
 47 |     /**
 48 |      * Sets a log file name from the specified log file root
 49 |      * @param logFileName
 50 |      * @throws FileNotFoundException
 51 |      */
 52 |     public static void init(String logFileName) throws FileNotFoundException {
 53 |         fileRoot = logFileName;
 54 |         fileName = logFileName + ".debug";
 55 |         if (isOn) writer = new PrintWriter(fileName);
 56 |     }
 57 | 
 58 | 
 59 |     /**
 60 |      * Generates a log file name from the specified log file root
 61 |      * @param prefix
 62 |      * @return
 63 |      */
 64 |     public static String getLogFileName (String prefix) {
 65 |         DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd-HH:mm:ss");
 66 |         Date date = new Date();
 67 | 
 68 |         return prefix + dateFormat.format(date) + ".log";
 69 |     }
 70 | 
 71 |     /**
 72 |      * Print line to log file
 73 |      * @param s
 74 |      */
 75 |     public static void println(String s) {
 76 |         if (isOn) {
 77 |             writer.write(s);
 78 |             writer.write("\n");
 79 |         }
 80 |     }
 81 | 
 82 |     /**
 83 |      * Print to log file
 84 |      * @param s
 85 |      */
 86 |     public static void print(String s) {
 87 |         if (isOn) {
 88 |             writer.write(s);
 89 |         }
 90 |     }
 91 | 
 92 |     /**
 93 |      * Print n copies of s to the log file
 94 |      * @param s
 95 |      * @param n
 96 |      */
 97 |     public static void printN(String s, int n) {
 98 |         for (int i = 0; i < n; i++) print(s);
 99 |         println("");
100 |     }
101 | 
102 | 
103 |     /**
104 |      * Print the perceptron predictions for the given document to the log file
105 |      * @param document
106 |      * @param prefix string to prepend for each line
107 |      */
108 |     public static void printPerceptronPrediction (Document document, String prefix) {
109 |         for (Token token : document.getTokenList()) {
110 |             StringBuilder line = new StringBuilder();
111 | 
112 |             // prefix
113 |             line.append(prefix);
114 |             line.append("\t");
115 | 
116 |             // add token information
117 |             line.append(token.predText);
118 |             line.append("\t");
119 | 
120 | 
121 |             // gold information
122 |             boolean goldBegin = token.startsGoldContentSpan();
123 |             boolean goldEnd = token.endsGoldContentSpan();
124 |             boolean goldCue = token.isGoldCue();
125 | 
126 |             if (goldBegin) line.append('B');
127 |             else line.append('_');
128 | 
129 |             if (goldEnd) line.append('E');
130 |             else line.append('_');
131 | 
132 |             if (goldCue) line.append('C');
133 |             else line.append('_');
134 | 
135 |             line.append('\t');
136 | 
137 |             // predicted information
138 |             if (token.perceptronBeginScore > 0) line.append('B');
139 |             else line.append('_');
140 | 
141 |             if (token.perceptronEndScore > 0) line.append('E');
142 |             else line.append('_');
143 | 
144 |             if (token.isPredictedCue) line.append('C');
145 |             else line.append('_');
146 | 
147 |             line.append('\t');
148 | 
149 |             // scores
150 |             line.append(token.perceptronBeginScore); line.append('\t');
151 |             line.append(token.perceptronEndScore); line.append('\t');
152 |             line.append(token.perceptronCueScore); line.append('\t');
153 |             line.append('\t');
154 | 
155 |             // scores
156 |             line.append(token.numTimesSampledBegin); line.append('\t');
157 |             line.append(token.numTimesSampledEnd); line.append('\t');
158 |             line.append(token.numTimesSampledCue); line.append('\t');
159 | 
160 | 
161 |             println(line.toString());
162 |         }
163 |     }
164 | 
165 |     /**
166 |      * Print document predictions and gold information using SGML-style tags
167 |      * @param doc
168 |      */
169 |     public static void printAnnotatedDocument(Document doc) {
170 |         StringBuilder sb = new StringBuilder();
171 |         for (int i = 0; i < doc.tokenList.size(); i++) {
172 |             if (Span.anyBeginsAt(doc.goldSpanSet, i)) sb.append("<GOLD>");
173 |             if (Span.anyBeginsAt(doc.predictedSpanSet, i)) sb.append("<pred>");
174 |             sb.append(doc.tokenList.get(i).predText);
175 |             if (Span.anyEndsAt(doc.predictedSpanSet, i)) sb.append("</pred>");
176 |             if (Span.anyEndsAt(doc.goldSpanSet, i)) sb.append("</GOLD>");
177 |             sb.append(" ");
178 |         }
179 | 
180 |         println(sb.toString());
181 |     }
182 | 
183 | }
184 | 


--------------------------------------------------------------------------------