├── .circleci └── config.yml ├── .gitignore ├── .gitmodules ├── LICENSE.txt ├── README.md ├── build.sbt ├── cli ├── .gitignore ├── README.md ├── build.sbt └── src │ └── main │ └── scala │ └── org │ └── allenai │ └── scienceparse │ └── RunSP.scala ├── core ├── .gitignore ├── README.md ├── build.sbt ├── scripts │ ├── .gitignore │ ├── cleanGoldData.py │ └── findPRErrorPairs.py └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── allenai │ │ │ └── scienceparse │ │ │ ├── BibRecord.java │ │ │ ├── CRFBibRecordParser.java │ │ │ ├── CheckReferences.java │ │ │ ├── CitationRecord.java │ │ │ ├── DirectoryPaperSource.java │ │ │ ├── ExtractReferences.java │ │ │ ├── ExtractedMetadata.java │ │ │ ├── FallbackPaperSource.java │ │ │ ├── GazetteerFeatures.java │ │ │ ├── MarkableFileInputStream.java │ │ │ ├── PDFDocToPartitionedText.java │ │ │ ├── PDFPredicateExtractor.java │ │ │ ├── PDFToCRFInput.java │ │ │ ├── PaperSource.java │ │ │ ├── PaperToken.java │ │ │ ├── Parser.java │ │ │ ├── ParserGroundTruth.java │ │ │ ├── ParserLMFeatures.java │ │ │ ├── ReferencesPredicateExtractor.java │ │ │ ├── RegexWithTimeout.java │ │ │ ├── RetryPaperSource.java │ │ │ ├── ScholarBucketPaperSource.java │ │ │ ├── Section.java │ │ │ ├── StringLongHash.java │ │ │ ├── WordVectorCache.java │ │ │ └── pdfapi │ │ │ ├── PDFDoc.java │ │ │ ├── PDFExtractor.java │ │ │ ├── PDFFontMetrics.java │ │ │ ├── PDFLine.java │ │ │ ├── PDFMetadata.java │ │ │ ├── PDFPage.java │ │ │ ├── PDFToken.java │ │ │ └── PdfDocExtractionResult.java │ ├── resources │ │ ├── golddata │ │ │ ├── dblp │ │ │ │ ├── authorFullName.tsv │ │ │ │ └── title.tsv │ │ │ └── isaac │ │ │ │ ├── abstracts.tsv │ │ │ │ ├── bib-authors.tsv │ │ │ │ ├── bib-titles.tsv │ │ │ │ ├── bib-venues.tsv │ │ │ │ ├── bib-years.tsv │ │ │ │ ├── bibliographies.tsv │ │ │ │ ├── bibs-to-tsv.py │ │ │ │ ├── import_bib_gold.py │ │ │ │ ├── mentions.tsv │ │ │ │ └── tsv-to-gold.py │ │ ├── opennlp │ │ │ └── tools │ │ │ │ └── tokenize │ │ │ │ └── en-token.bin │ │ └── org │ │ │ └── allenai │ │ │ └── scienceparse │ │ │ └── pipeline │ │ │ └── highfreq.tsv │ └── scala │ │ └── org │ │ └── allenai │ │ └── scienceparse │ │ ├── BibTraining.scala │ │ ├── CachedGrobidServer.scala │ │ ├── Evaluation.scala │ │ ├── GazetteerFromPMC.scala │ │ ├── GrobidParser.scala │ │ ├── InterleavingIterator.scala │ │ ├── JsonProtocol.scala │ │ ├── LabeledData.scala │ │ ├── LabeledDataEvaluation.scala │ │ ├── PrintCRFInput.scala │ │ ├── PrintFeaturizedCRFInput.scala │ │ ├── S2PaperSource.scala │ │ ├── StringUtils.scala │ │ ├── Training.scala │ │ ├── Utilities.scala │ │ └── pipeline │ │ ├── Bucketizers.scala │ │ ├── Normalizers.scala │ │ └── SimilarityMeasures.scala │ └── test │ ├── java │ └── org │ │ └── allenai │ │ └── scienceparse │ │ ├── CRFBibRecordParserTest.java │ │ ├── CheckReferencesTest.java │ │ ├── ExtractReferencesTest.java │ │ ├── GazetteerFeaturesTest.java │ │ ├── HeaderIntegrationTest.java │ │ ├── PDFPredicateExtractorTest.java │ │ ├── PDFToCRFInputTest.java │ │ ├── ParserLMFeaturesTest.java │ │ ├── ParserTest.java │ │ └── pdfapi │ │ └── PDFExtractorTest.java │ ├── resources │ ├── 2a774230b5328df3f8125da9b84a82d92b46a240.pdf │ ├── 403b61d52192d6cf23c92a3da68ba08f03a954e4.pdf │ ├── 6c46de8a4399840548a056d13d38e1f54da2.pdf │ ├── P07-1088-labels.txt │ ├── P07-1088.pdf │ ├── P14-1059-labels.txt │ ├── P14-1059.extraction.json │ ├── P14-1059.pdf │ ├── a7c25298c607d5bf32e3301b6b209431e2a7f830.pdf │ ├── agarwal11.extraction.json │ ├── agarwal11.pdf │ ├── aimag10.extraction.json │ ├── aimag10.pdf │ ├── bagnell11.extraction.json │ ├── 
bagnell11.pdf │ ├── bohnet09.extraction.json │ ├── bohnet09.pdf │ ├── bunescu-acl07.pdf │ ├── bunescu-acl07.txt │ ├── c0690a1d74ab781bd54f9fa7e67267cce656.pdf │ ├── c921a74c209e720534939dfa191d639e647dd242.pdf │ ├── coordinate_calibrator.pdf │ ├── coratest.txt │ ├── ding11.extraction.json │ ├── ding11.pdf │ ├── dyer12.extraction.json │ ├── dyer12.pdf │ ├── e4faf2c1d76b9bf8f8b4524dfb8c5c6b93be5f35.pdf │ ├── fader11.extraction.json │ ├── fader11.pdf │ ├── gazetteer-test │ │ ├── education.university.small.txt │ │ └── names.male.txt │ ├── groundTruth.json │ ├── id-titles.txt │ ├── kermittest.txt │ ├── logback-test.xml │ ├── map-reduce.extraction.json │ ├── map-reduce.pdf │ ├── model-bib-crf-test.dat │ ├── mono04.extraction.json │ ├── mono04.pdf │ ├── mooney05.extraction.json │ ├── mooney05.pdf │ ├── papers-parsebugs.json │ ├── pedersen04.extraction.json │ ├── pedersen04.pdf │ ├── proto06.extraction.json │ ├── proto06.pdf │ ├── roark13.extraction.json │ ├── roark13.pdf │ ├── senellart10.extraction.json │ ├── senellart10.pdf │ ├── seung08.extraction.json │ ├── seung08.pdf │ ├── smith07.extraction.json │ ├── smith07.pdf │ ├── smola10.extraction.json │ ├── smola10.pdf │ ├── superscripttest.pdf │ ├── tagged_references.txt │ ├── testng.xml │ ├── umasstest.txt │ ├── zolotov04.extraction.json │ └── zolotov04.pdf │ └── scala │ └── org │ └── allenai │ └── scienceparse │ ├── CoraExtractionSpec.scala │ ├── JavaTestSuite.scala │ ├── JsonProtocolSpec.scala │ ├── MetaEvalSpec.scala │ └── StringUtilsSpec.scala ├── project ├── build.properties └── plugins.sbt └── server ├── .gitignore ├── README.md ├── build.sbt └── src └── main ├── resources ├── application.conf └── logback.xml └── scala └── org └── allenai └── scienceparse ├── FeedbackStore.scala └── SPServer.scala /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | working_directory: ~/science-parse 5 | docker: 6 | - image: openjdk:8 7 | environment: 8 | SBT_VERSION: 1.2.8 9 | steps: 10 | - run: echo 'export ARTIFACT_BUILD=$CIRCLE_PROJECT_REPONAME-$CIRCLE_BUILD_NUM.zip' >> $BASH_ENV 11 | - run: 12 | name: Get sbt binary 13 | command: | 14 | apt update && apt install -y curl 15 | curl -L -o sbt-$SBT_VERSION.deb https://dl.bintray.com/sbt/debian/sbt-$SBT_VERSION.deb 16 | dpkg -i sbt-$SBT_VERSION.deb 17 | rm sbt-$SBT_VERSION.deb 18 | apt-get update && apt-get clean && apt-get autoclean 19 | - checkout 20 | - restore_cache: 21 | # Read about caching dependencies: https://circleci.com/docs/2.0/caching/ 22 | key: sbt-cache 23 | - run: 24 | name: Clean package 25 | command: cat /dev/null | sbt clean 26 | - run: 27 | name: Test package 28 | command: cat /dev/null | sbt test 29 | - store_artifacts: # for display in Artifacts: https://circleci.com/docs/2.0/artifacts/ 30 | path: target/universal/science-parse.zip 31 | destination: science-parse 32 | - save_cache: 33 | key: sbt-cache 34 | paths: 35 | - "~/.ivy2/cache" 36 | - "~/.sbt" 37 | - "~/.m2" 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # gradle 2 | .gradle/** 3 | build/** 4 | gradle/** 5 | out/** 6 | gradlew.* 7 | gradlew 8 | 9 | # intellij 10 | *.ipr 11 | *.iws 12 | *.iml 13 | 14 | # emacs 15 | .*~ 16 | 17 | # sbt 18 | target/** 19 | project/target/** 20 | project/project/** 21 | 22 | EvalErrors.log 23 | 24 | !/cli/ 25 | !/core/ 26 | !/server/ 27 | !/project/ 28 | 
-------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext/ml"] 2 | path = ext/ml 3 | url = git@github.com:allenai/ml.git 4 | branch = mr-patch -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Science Parse 2 | 3 | Science Parse parses scientific papers (in PDF form) and returns them in structured form. As of today, it supports these fields: 4 | * Title 5 | * Authors 6 | * Abstract 7 | * Sections (each with heading and body text) 8 | * Bibliography, each with 9 | * Title 10 | * Authors 11 | * Venue 12 | * Year 13 | * Mentions, i.e., places in the paper where bibliography entries are mentioned 14 | 15 | In JSON format, the [output looks like this](http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f) (or like [this, if you want sections](http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f?skipFields=sections)). The easiest way to get started is to use the output from this server. 16 | 17 | ## New version: SPv2 18 | 19 | There is a new version of science-parse out that works in a completely different way. It has fewer 20 | features, but higher quality in the output. Check out the details at https://github.com/allenai/spv2. 21 | 22 | ## Get started 23 | 24 | There are three different ways to get started with SP. Each has its own document: 25 | 26 | * [Server](server/README.md): This contains the SP server. It's useful for PDF parsing as a service. It's also probably the easiest way to get going. 27 | * [CLI](cli/README.md): This contains the command line interface to SP. That's most useful for batch processing. 28 | * [Core](core/README.md): This contains SP as a library. It has all the extraction code, plus training and evaluation. Both server and CLI use this to do the actual work. 29 | 30 | ## How to include into your own project 31 | 32 | The current version is `3.0.0`. If you want to include it in your own project, use this: 33 | 34 | For SBT: 35 | ``` 36 | libraryDependencies += "org.allenai" %% "science-parse" % "3.0.0" 37 | ``` 38 | 39 | For Maven: 40 | ``` 41 | 42 | org.allenai 43 | science-parse_2.12 44 | 3.0.0 45 | 46 | ``` 47 | 48 | The first time you run it, SP will download some rather large model files. Don't be alarmed! The model files are cached, and startup is much faster the second time. 49 | 50 | For licensing reasons, SP does not include libraries for some image formats. Without these 51 | libraries, SP cannot process PDFs that contain images in these formats. If you have no 52 | licensing restrictions in your project, we recommend you add these additional dependencies to your 53 | project as well: 54 | ``` 55 | "com.github.jai-imageio" % "jai-imageio-core" % "1.2.1", 56 | "com.github.jai-imageio" % "jai-imageio-jpeg2000" % "1.3.0", // For handling jpeg2000 images 57 | "com.levigo.jbig2" % "levigo-jbig2-imageio" % "1.6.5", // For handling jbig2 images 58 | ``` 59 | 60 | ## Development 61 | 62 | This project is a hybrid between Java and Scala. The interaction between the languages is fairly seamless, and SP can be used as a library in any JVM-based language. 63 | 64 | Our build system is sbt. To build science-parse, you have to have sbt installed and working. You can 65 | find details about that at https://www.scala-sbt.org. 
66 | 67 | Once you have sbt set up, just start `sbt` in the main project folder to launch sbt's shell. There 68 | are many things you can do in the shell, but here are the most important ones: 69 | * `+test` runs all the tests in all the projects across Scala versions. 70 | * `cli/assembly` builds a runnable superjar (i.e., a jar with all dependencies bundled) for the 71 | project. You can run it (from bash, not from sbt) with `java -Xmx10g -jar `. 72 | * `server/assembly` builds a runnable superjar for the webserver. 73 | * `server/run` starts the server directly from the sbt shell. 74 | 75 | ### Lombok 76 | 77 | This project uses [Lombok](https://projectlombok.org) which requires you to enable annotation processing inside of an IDE. 78 | [Here](https://plugins.jetbrains.com/plugin/6317) is the IntelliJ plugin and you'll need to enable annotation processing (instructions [here](https://www.jetbrains.com/idea/help/configuring-annotation-processing.html)). 79 | 80 | Lombok has a lot of useful annotations that give you some of the nice things in Scala: 81 | 82 | * `val` is equivalent to `final` and the right-hand-side class. It gives you type-inference via some tricks 83 | * Check out [`@Data`](https://projectlombok.org/features/Data.html) 84 | 85 | ## Thanks 86 | 87 | Special thanks goes to @kermitt2, whose work on [kermitt2/grobid](https://github.com/kermitt2/grobid) inspired Science Parse, and helped us get started with some labeled data. 88 | 89 | Releasing new versions 90 | ---------------------- 91 | 92 | This project releases to BinTray. To make a release: 93 | 94 | 1. Pull the latest code on the master branch that you want to release 95 | 1. Tag the release `git tag -a vX.Y.Z -m "Release X.Y.Z"` replacing X.Y.Z with the correct version 96 | 1. Push the tag back to origin `git push origin vX.Y.Z` 97 | 1. Release the build on Bintray `sbt +publish` (the "+" is required to cross-compile) 98 | 1. Verify publication [on bintray.com](https://bintray.com/allenai/maven) 99 | 1. Bump the version in `build.sbt` on master (and push!) with X.Y.Z+1 (e.g., 2.5.1 after 100 | releasing 2.5.0) 101 | 102 | If you make a mistake you can rollback the release with `sbt bintrayUnpublish` and retag the 103 | version to a different commit as necessary. 
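## Minimal library usage

To complement the dependency snippets in the "How to include into your own project" section above, here is a minimal sketch of calling SP as a library. The `Parser.getInstance()` and `doParse()` calls are the ones documented in the core README; the class name and the PDF path are only illustrative.

```Java
import org.allenai.scienceparse.Parser;
import org.allenai.scienceparse.ExtractedMetadata;

import java.io.FileInputStream;
import java.io.InputStream;

public class ParseOnePdf {
  public static void main(final String[] args) throws Exception {
    // Downloads and caches the default model files on first use.
    final Parser parser = Parser.getInstance();
    try (final InputStream is = new FileInputStream("paper.pdf")) { // illustrative path
      final ExtractedMetadata em = parser.doParse(is);
      System.out.println(em.title);
      System.out.println(em.authors);
      System.out.println(em.abstractText);
    }
  }
}
```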
104 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | ivyLoggingLevel in ThisBuild := UpdateLogging.Quiet 2 | 3 | lazy val scala211 = "2.11.12" 4 | lazy val scala212 = "2.12.9" 5 | lazy val scala213 = "2.13.0" // Not supported yet (collections changes required) 6 | lazy val supportedScalaVersions = List(scala212, scala211) 7 | 8 | ThisBuild / organization := "org.allenai.scienceparse" 9 | ThisBuild / scalaVersion := scala212 10 | ThisBuild / name := "science-parse" 11 | ThisBuild / version := "3.0.1" 12 | 13 | lazy val commonSettings = Seq( 14 | crossScalaVersions := supportedScalaVersions, 15 | resolvers ++= Seq( 16 | Resolver.jcenterRepo, 17 | Resolver.bintrayRepo("allenai", "maven") 18 | ), 19 | javaOptions += s"-Dlogback.appname=${name.value}", 20 | // release settings 21 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0.html")), 22 | homepage := Some(url("https://github.com/allenai/science-parse")), 23 | scmInfo := Some(ScmInfo( 24 | url("https://github.com/allenai/science-parse"), 25 | "https://github.com/allenai/science-parse.git")), 26 | bintrayRepository := "maven", 27 | bintrayOrganization := Some("allenai"), 28 | publishMavenStyle := true, 29 | publishArtifact in Test := false, 30 | pomIncludeRepository := { _ => false }, 31 | pomExtra := 32 | 33 | 34 | allenai-dev-role 35 | Allen Institute for Artificial Intelligence 36 | dev-role@allenai.org 37 | 38 | 39 | ) 40 | 41 | lazy val root = (project in file(".")) 42 | .aggregate( 43 | core, 44 | cli, 45 | server 46 | ) 47 | .settings( 48 | crossScalaVersions := Nil, 49 | publish / skip := true, 50 | commonSettings 51 | ) 52 | 53 | lazy val core = (project in file("core")). 54 | settings( 55 | description := "Java to extract titles, authors, abstracts, body text, and bibliographies from scholarly documents", 56 | name := "science-parse-core", 57 | commonSettings 58 | ) 59 | 60 | lazy val cli = (project in file("cli")). 61 | settings( 62 | description := "Java CLI to extract titles, authors, abstracts, body text, and bibliographies from scholarly documents", 63 | name := "science-parse-cli", 64 | commonSettings 65 | ).dependsOn(core) 66 | 67 | lazy val server = (project in file("server")). 68 | settings( 69 | description := "Java server to extract titles, authors, abstracts, body text, and bibliographies from scholarly documents", 70 | name := "science-parse-server", 71 | commonSettings 72 | ).dependsOn(core) 73 | -------------------------------------------------------------------------------- /cli/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /cli/README.md: -------------------------------------------------------------------------------- 1 | # Science Parse Command Line Interface 2 | 3 | Science Parse has a command line interface called "RunSP". To build it into a super-jar, run `sbt cli/assembly`. This will download all dependencies and make a bundled jar that contains SP itself, plus all of its dependencies. 4 | 5 | You can run it like this: 6 | ``` 7 | java -jar jarfile.jar --help 8 | ``` 9 | That will print all the command line options you can use with it. I won't describe all of them here, but there are a few important described below. 10 | 11 | ## Memory 12 | 13 | Science Parse needs quite a lot of memory. 
We recommend you run it with at least 6GB of heap, like this: 14 | ``` 15 | java -Xmx6g -jar jarfile.jar 18bc3569da037a6cb81fb081e2856b77b321c139 16 | ``` 17 | Note that some documents need more memory to parse than others. 18 | 19 | ## Specifying input 20 | 21 | `RunSP` can parse multiple files at the same time. You can parse thousands of PDFs like this. It will try to parse as many of them in parallel as your computer allows. 22 | 23 | `RunSP` takes input as positional parameters. Input can be any of the following: 24 | * S2 Paper ID (example: `java -Xmx6g -jar jarfile.jar 18bc3569da037a6cb81fb081e2856b77b321c139`). This will download the paper with the given ID from S2, and parse it. 25 | * PDF File (example: `java -Xmx6g -jar jarfile.jar paper.pdf`). This will parse the given PDF. 26 | * Directory (example: `java -Xmx6g -jar jarfile.jar my_directory/`). This will find all PDFs in that directory, and its subdirectories, and parse them. 27 | * Text file (example: `java -Xmx6g -jar jarfile.jar papers.txt`). Every line in the text file must be either an S2 Paper ID, a path to a PDF file, a path to a directory containing PDF files, or another text file that will be processed the same way. 28 | 29 | ## Specifying output 30 | 31 | By default, `RunSP` prints its output to standard out, in a prettyfied JSON format. This behavior can be changed with the `-o` and `-f` options. 32 | 33 | * `-o `: This option will write output JSON files into the specified directory, one per input document. 34 | * `-f `: This option will write output JSON into the specified file, one line per input document. 35 | 36 | If you specify both at the same time, it does both. 37 | 38 | If you specify none, it prints the output to stdout. 39 | -------------------------------------------------------------------------------- /cli/build.sbt: -------------------------------------------------------------------------------- 1 | javaOptions in run += s"-Xmx10G" 2 | 3 | fork := true 4 | 5 | mainClass in assembly := Some("org.allenai.scienceparse.RunSP") 6 | 7 | assemblyMergeStrategy in assembly := { 8 | case "logback.xml" => MergeStrategy.first 9 | case x => (assemblyMergeStrategy in assembly).value.apply(x) 10 | } 11 | 12 | libraryDependencies ++= Seq( 13 | "org.slf4j" % "jcl-over-slf4j" % "1.7.7", 14 | "com.fasterxml.jackson.core" % "jackson-core" % "2.7.9", 15 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.7.9", 16 | "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.7.9", 17 | "com.github.scopt" %% "scopt" % "3.4.0" 18 | ) 19 | -------------------------------------------------------------------------------- /core/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | /test-output/ 3 | -------------------------------------------------------------------------------- /core/README.md: -------------------------------------------------------------------------------- 1 | # Science Parse as a library 2 | 3 | The most flexible way to use SP, but also the most complicated, is to use it as a library. 4 | 5 | ## Parsing documents 6 | 7 | The main entry point into Science Parse is the [`Parser`](src/main/java/org/allenai/scienceparse/Parser.java) class. 
In Java, you can use it like this: 8 | ```Java 9 | import org.allenai.scienceparse.Parser; 10 | import org.allenai.scienceparse.ExtractedMetadata; 11 | 12 | final Parser parser = Parser.getInstance(); 13 | 14 | // Parse without timeout 15 | final ExtractedMetadata em = parser.doParse(inputStream); 16 | 17 | // Parse with timeout 18 | final ExtractedMetadata em = parser.doParseWithTimeout(inputStream, 30000); // 30 second timeout 19 | // This might throw ParsingTimeout, which is a type of RuntimeException. 20 | ``` 21 | 22 | This will attempt to parse the PDF in `inputStream` into an object of type [`ExtractedMetadata`](src/main/java/org/allenai/scienceparse/ExtractedMetadata.java), which looks like this:. 23 | 24 | ```Java 25 | public class ExtractedMetadata { 26 | public Source source; 27 | public String title; 28 | public List authors; 29 | public List
sections; 30 | public List references; 31 | public List referenceMentions; 32 | public int year; 33 | public String abstractText; 34 | public String creator; // program that created the PDF, i.e. LaTeX or PowerPoint or something else 35 | 36 | // ... 37 | } 38 | ``` 39 | 40 | For more detail, we recommend diving into the code, or asking a question (create an issue). 41 | 42 | ## Training models 43 | 44 | SP uses four model files: 45 | 1. The general CRF model for title and authors 46 | 2. The CRF model for bibliographies 47 | 3. The gazetteer 48 | 4. Word vectors 49 | 50 | We provide defaults for these. During normal startup, SP will download all four and cache them locally. 51 | 52 | You can also train your own. This README is too small to contain a full introduction to how to do this, but to get you started, check out these entry points: 53 | 1. The general CRF model is created by the executable class [`org.allenai.scienceparse.Training`](src/main/scala/org/allenai/scienceparse/Training.scala). Run it with `sbt "core/runMain org.allenai.scienceparse.Training --help"` to see all the options it supports. If you specify nothing but an output location, it will train itself with the same parameters that were used to create the default models. 54 | 2. The bibliography CRF model is created by the executable class [`org.allenai.scienceparse.BibTraining`](src/main/scala/org/allenai/scienceparse/BibTraining.scala). Run it with `sbt "core/runMain org.allenai.scienceparse.BibTraining --help"` to see all the options it supports. As with the general model, if you specify nothing but an output location, it will train itself with the same parameters that were used to create the default models. 55 | 3. You can specify a custom gazetteer as well, but the gazetteer is currently not used at runtime, only during training. To experiment with it, download the default gazetteer for a peek at the format. It's an uncomplicated JSON format. 56 | 4. We never experimented with different vectors. There is currently no way to change the ones we provide. 57 | 58 | The SP Server, the SP CLI, and the evaluation code (see below) can be instructed to load other models than the default ones from the command line. Run any of them with `--help` to see details. 59 | 60 | Abstract extraction is purely rule-based, and not part of any of these models. Section extraction comes from the `pdffigures2` project, which SP depends on. Both of these are unaffected by any changes to the models. 61 | 62 | ## Evaluating models 63 | 64 | There are several ways to evaluate changes to the models and rules. These are my two favorites: 65 | * `sbt "core/runMain org.allenai.scienceparse.LabeledDataEvaluation --compareAgainstGold"`: This evaluates SP against gold-annotated documents from the Computer Science domain. 66 | * `sbt "core/runMain org.allenai.scienceparse.LabeledDataEvaluation --compareAgainstPMC"`: This evaluates SP against documents from [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/). They are mostly from the medical domain. PMC documents are accompanied by a rich XML structure describing the contents of the PDFs in detail. It is a great source of labeled data for both evaluation and training. Note that this will download several gigabytes of PDFs before it begins the evaluation. As always, those gigabytes are cached and don't have to be downloaded a second time. 67 | 68 | To see other options, run sbt "core/runMain org.allenai.scienceparse.LabeledDataEvaluation --help". 
69 | 70 | PDF parsing is different from platform to platform, and can depend even on the locally installed packages. As such, we see different training and evaluation performance for the same code, when run on different machines. To keep things consistent, we do all our test runs on Ubuntu 14.04.5. 71 | -------------------------------------------------------------------------------- /core/build.sbt: -------------------------------------------------------------------------------- 1 | javaOptions in Test += s"-Xmx10G" 2 | 3 | fork in Test := true 4 | 5 | assemblyJarName in assembly := s"science-parse-${version.value}.jar" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.allenai.common" %% "common-core" % "2.0.0" excludeAll ( 9 | ExclusionRule(organization = "org.apache.common", name = "commons-math3") 10 | ), 11 | "org.apache.pdfbox" % "pdfbox" % "2.0.9" exclude ("commons-logging", "commons-logging"), 12 | "org.apache.pdfbox" % "fontbox" % "2.0.9" exclude ("commons-logging", "commons-logging"), 13 | "org.slf4j" % "jcl-over-slf4j" % "1.7.7", 14 | "org.allenai" % "ml" % "0.16" excludeAll ( 15 | ExclusionRule(organization = "args4j"), 16 | ExclusionRule(organization = "org.slf4j", name="slf4j-simple") 17 | ), 18 | "org.projectlombok" % "lombok" % "1.16.20", 19 | "com.goldmansachs" % "gs-collections" % "6.1.0", 20 | "org.scalatest" %% "scalatest" % "2.2.1" % Test, 21 | "org.testng" % "testng" % "6.8.1" % Test, 22 | "org.allenai.common" %% "common-testkit" % "2.0.0" % Test, 23 | "org.allenai.datastore" %% "datastore" % "2.0.0", 24 | "org.bouncycastle" % "bcprov-jdk15on" % "1.54", 25 | "org.bouncycastle" % "bcmail-jdk15on" % "1.54", 26 | "org.bouncycastle" % "bcpkix-jdk15on" % "1.54", 27 | "org.jsoup" % "jsoup" % "1.8.1", 28 | "org.apache.commons" % "commons-lang3" % "3.4", 29 | "commons-io" % "commons-io" % "2.4", 30 | "com.amazonaws" % "aws-java-sdk-s3" % "1.11.213" exclude ("commons-logging", "commons-logging"), 31 | "org.allenai.word2vec" %% "word2vecjava" % "2.0.0" 32 | exclude ("log4j", "log4j") 33 | exclude ("commons-logging", "commons-logging"), 34 | "com.google.guava" % "guava" % "18.0", 35 | "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0", 36 | "org.scala-lang.modules" %% "scala-xml" % "1.0.6", 37 | "org.scalaj" %% "scalaj-http" % "2.3.0", 38 | "org.allenai" %% "pdffigures2" % "0.1.0", 39 | "io.spray" %% "spray-json" % "1.3.5", 40 | "de.ruedigermoeller" % "fst" % "2.47", 41 | "org.apache.opennlp" % "opennlp-tools" % "1.7.2" 42 | 43 | // So SP can parse more image formats 44 | // These are disabled by default, because they are not licensed flexibly enough. 45 | //"com.github.jai-imageio" % "jai-imageio-core" % "1.2.1", 46 | //"com.github.jai-imageio" % "jai-imageio-jpeg2000" % "1.3.0", // For handling jpeg2000 images 47 | //"com.levigo.jbig2" % "levigo-jbig2-imageio" % "1.6.5" // For handling jbig2 images 48 | ) 49 | -------------------------------------------------------------------------------- /core/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | /*.pyc 2 | -------------------------------------------------------------------------------- /core/scripts/cleanGoldData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This script takes a file with error pairs (found with findPRErrorPairs.py and 4 | # then fixed by hand), and a file with gold data, and fixes the gold data 5 | # according to the file of error pairs. 
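# Typical invocation (the file names are illustrative, not fixed by the script):
#
#   python cleanGoldData.py errorPairs.tsv goldData.tsv > cleanedGoldData.tsv
#
# argv[1] is the hand-corrected error-pair file derived from findPRErrorPairs.py output
# (paperId <tab> our extraction <tab> gold), argv[2] is the tab-separated gold data file,
# and the cleaned gold data is written to stdout.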
6 | 7 | import sys 8 | 9 | # read replacements 10 | paper2replacements = {} 11 | for line in open(sys.argv[1]): 12 | (paperId, ourExtraction, goldData) = [x.strip() for x in line.strip().split("\t")] 13 | replacements = paper2replacements.get(paperId, {}) 14 | replacements[goldData] = ourExtraction 15 | paper2replacements[paperId] = replacements 16 | 17 | # read gold data, write new file to stdout 18 | for line in open(sys.argv[2]): 19 | line = [x.strip() for x in line.strip().split("\t")] 20 | 21 | paperId = line[0] 22 | sys.stdout.write(paperId) 23 | replacements = paper2replacements.get(paperId, {}) 24 | 25 | names = line[1:] 26 | for name in names: 27 | replacement = replacements.get(name, name) 28 | sys.stdout.write("\t" + replacement) 29 | 30 | sys.stdout.write("\n") 31 | -------------------------------------------------------------------------------- /core/scripts/findPRErrorPairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This script goes through the logs from an evaluation run, and identifies 4 | # pairs of precision and recall errors. The intuition is that usually when we 5 | # get a name wrong, it shows up as a recall error (because we missed the gold 6 | # name), and also as a precision error (because we extracted a name not in the 7 | # gold data). Usually these names are very similar, and usually these are not 8 | # really errors. So this script identifies likely pairs of precision and recall 9 | # errors, and outputs them into a table of pairs of names. These pairs are 10 | # likely different forms of the same name, and should probably be treated equal 11 | # by downstream tools. 12 | # 13 | # Examples of error pairs that aren't really errors: 14 | # Extracted: Michael Schumacher Gold: Mike Schumacher 15 | # Extracted: Alfred Neuman Gold: Alfred E. Neuman 16 | # Extracted: H G Wells Gold: H. 
Wells 17 | 18 | def ngrams(s, n): 19 | for i in xrange(len(s) - n + 1): 20 | yield s[i:i+n] 21 | 22 | def main(): 23 | import sys 24 | evalLog = [line.strip().split("\t")[1:] for line in open(sys.argv[1]) if line.startswith("authorFullNameNormalized\t")] 25 | paperId2errors = {} 26 | for (pr, paperId, normalized, unnormalized) in evalLog: 27 | errors = paperId2errors.get(paperId, []) 28 | errors.append((pr, normalized, unnormalized)) 29 | paperId2errors[paperId] = errors 30 | 31 | for (paperId, errors) in paperId2errors.iteritems(): 32 | precisionErrors = [e[1:] for e in errors if e[0] == "precision"] 33 | recallErrors = [e[1:] for e in errors if e[0] == "recall"] 34 | for (pNormalized, pUnnormalized) in precisionErrors: 35 | pNgrams = set(ngrams(pNormalized, 2)) | set(ngrams(pNormalized, 3)) 36 | for (rNormalized, rUnnormalized) in recallErrors: 37 | rNgrams = set(ngrams(rNormalized, 2)) | set(ngrams(rNormalized, 3)) 38 | score = float(len(pNgrams & rNgrams)) / max(len(pNgrams), len(rNgrams)) 39 | if score >= 0.5: 40 | print "%s\t%s\t%s" % (paperId, pUnnormalized, rUnnormalized) 41 | 42 | if __name__=="__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/BibRecord.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import lombok.Data; 4 | import lombok.EqualsAndHashCode; 5 | 6 | import java.util.Calendar; 7 | import java.util.List; 8 | import java.util.regex.Pattern; 9 | import java.util.stream.Collectors; 10 | 11 | @Data @EqualsAndHashCode(exclude={"citeRegEx", "shortCiteRegEx"}) 12 | public class BibRecord { 13 | public static final int MINYEAR = 1800; 14 | public static final int MAXYEAR = Calendar.getInstance().get(Calendar.YEAR) + 10; 15 | 16 | // Something is wrong with sbt, lombok, and Scala/Java interop, making this unconstructable from 17 | // Scala if you don't write this custom constructor. 18 | public BibRecord( 19 | final String title, 20 | final List author, 21 | final String venue, 22 | final Pattern citeRegEx, 23 | final Pattern shortCiteRegEx, 24 | final int year 25 | ) { 26 | this.title = title; 27 | this.author = author; 28 | this.venue = venue; 29 | this.citeRegEx = citeRegEx; 30 | this.shortCiteRegEx = shortCiteRegEx; 31 | this.year = year; 32 | } 33 | 34 | static private String normalizeInitialsInAuthor(String author) { 35 | author = author.trim(); 36 | author = author.replaceAll("(\\p{Lu}\\.) (\\p{Lu}\\.)", "$1$2"); 37 | author = author.replaceAll("(\\p{Lu}\\.) (\\p{Lu}\\.)", "$1$2"); //twice to catch three-initial seq. 38 | return author; 39 | } 40 | 41 | public BibRecord withNormalizedAuthors() { 42 | return new BibRecord( 43 | title, 44 | author.stream(). 45 | map(BibRecord::normalizeInitialsInAuthor). 46 | filter(s -> !StringUtils.normalize(s.toLowerCase()).equals("et al")). 
47 | collect(Collectors.toList()), 48 | venue, 49 | citeRegEx, 50 | shortCiteRegEx, 51 | year); 52 | } 53 | 54 | public BibRecord withTitle(final String newTitle) { 55 | return new BibRecord( 56 | newTitle, 57 | author, 58 | venue, 59 | citeRegEx, 60 | shortCiteRegEx, 61 | year); 62 | } 63 | 64 | private static String stripSuperscriptTags(final String s) { 65 | if(s == null) 66 | return null; 67 | else 68 | return s.replaceAll("[⍗⍐]", ""); 69 | } 70 | 71 | public BibRecord withoutSuperscripts() { 72 | return new BibRecord( 73 | stripSuperscriptTags(title), 74 | author.stream().map(BibRecord::stripSuperscriptTags).collect(Collectors.toList()), 75 | stripSuperscriptTags(venue), 76 | citeRegEx, 77 | shortCiteRegEx, 78 | year); 79 | } 80 | 81 | public String title; 82 | public final List author; 83 | public final String venue; 84 | public final Pattern citeRegEx; 85 | public final Pattern shortCiteRegEx; 86 | public final int year; 87 | } 88 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/CheckReferences.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.gs.collections.impl.set.mutable.primitive.LongHashSet; 4 | import org.allenai.scienceparse.ParserGroundTruth.Paper; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.Serializable; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | public class CheckReferences implements Serializable { 14 | private LongHashSet paperHashCodes = new LongHashSet(); 15 | 16 | public CheckReferences(String jsonFile) throws IOException { 17 | addPapers(new ParserGroundTruth(jsonFile).papers); 18 | } 19 | 20 | public CheckReferences(final InputStream is) throws IOException { 21 | addPapers(new ParserGroundTruth(is).papers); 22 | } 23 | 24 | public int getHashSize() { 25 | return paperHashCodes.size(); 26 | } 27 | 28 | public void addPaper(String title, List authors, int year, String venue) { 29 | paperHashCodes.add(getHashCode(title, authors, year, venue)); 30 | } 31 | 32 | public boolean hasPaper(String title, List authors, int year, String venue) { 33 | return paperHashCodes.contains(getHashCode(title, authors, year, venue)); 34 | } 35 | 36 | public long getHashCode(String title, List authors, int year, String venue) { 37 | title = Parser.processTitle(title); 38 | authors = Parser.lastNames(authors); 39 | if(title==null) 40 | title = ""; 41 | if(authors==null) 42 | authors = new ArrayList(); 43 | long hashCode = ((long) authors.hashCode()) * ((long) Integer.MAX_VALUE) + ((long) title.hashCode()) 44 | + ((long) Integer.hashCode(year)); 45 | return hashCode; 46 | } 47 | 48 | public void addPapers(List papers) { 49 | for (Paper p : papers) { 50 | addPaper(p.title, Arrays.asList(p.authors), p.year, p.venue); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/CitationRecord.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import lombok.Data; 4 | import lombok.experimental.Wither; 5 | 6 | @Data 7 | public class CitationRecord { 8 | public final int referenceID; 9 | 10 | @Wither 11 | public final String context; 12 | public final int startOffset; 13 | public final int endOffset; 14 | 15 | CitationRecord( 16 | final int referenceID, 17 | 
final String context, 18 | final int startOffset, 19 | final int endOffset 20 | ) { 21 | // Looks like we have to have an explicit constructor because otherwise, Scala freaks out when 22 | // using this class. 23 | this.referenceID = referenceID; 24 | this.context = context; 25 | this.startOffset = startOffset; 26 | this.endOffset = endOffset; 27 | } 28 | 29 | CitationRecord withConvertedSuperscriptTags() { 30 | final String newContext = context. 31 | replace('⍐', '('). 32 | replace('⍗', ')'); 33 | return this.withContext(newContext); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/DirectoryPaperSource.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import java.io.BufferedInputStream; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.InputStream; 8 | 9 | /** 10 | * A paper source that gets papers from a directory in the file system 11 | */ 12 | public class DirectoryPaperSource extends PaperSource { 13 | private final File dir; 14 | 15 | public DirectoryPaperSource(final File dir) { 16 | this.dir = dir; 17 | } 18 | 19 | @Override 20 | public InputStream getPdf(final String paperId) throws FileNotFoundException { 21 | final File file = new File(dir, paperId + ".pdf"); 22 | return new BufferedInputStream(new FileInputStream(file)); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/ExtractedMetadata.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.gs.collections.api.tuple.Pair; 4 | import com.gs.collections.impl.tuple.Tuples; 5 | import lombok.Data; 6 | import lombok.RequiredArgsConstructor; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.allenai.scienceparse.ParserGroundTruth.Paper; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.Calendar; 13 | import java.util.Date; 14 | import java.util.List; 15 | import java.util.regex.Pattern; 16 | 17 | 18 | /** 19 | * Simple container for extracted metadata. 20 | * 21 | * @author dcdowney 22 | */ 23 | @Data 24 | @Slf4j 25 | public class ExtractedMetadata { 26 | public static final String titleTag = "T"; //label used in labeled data 27 | public static final String authorTag = "A"; //label used in labeled data 28 | public static final String yearTag = "Y"; //label used in labeled data (bibliography only) 29 | public static final String venueTag = "V"; //label used in labeled data (bibliography only) 30 | 31 | transient private static Pattern emailDelimitersRegex = Pattern.compile(",|\\||;"); 32 | 33 | public enum Source { 34 | INVALID, 35 | CRF, 36 | META 37 | } 38 | 39 | public Source source; 40 | public String title; 41 | public List authors; 42 | public List emails; //extracted by special (non-CRF) heuristic process 43 | public List
sections; 44 | public List references; 45 | public List referenceMentions; 46 | public int year; 47 | public String abstractText; 48 | public String creator; // program that created the PDF, i.e. LaTeX or PowerPoint or something else 49 | 50 | /** 51 | * Constructs ExtractedMetadata from given text and labels 52 | * 53 | * @param toks 54 | * @param labels 55 | */ 56 | public ExtractedMetadata(List toks, List labels) { 57 | List lss = getSpans(labels); 58 | authors = new ArrayList(); 59 | for (LabelSpan ls : lss) { 60 | if (title == null && ls.tag.equals(titleTag)) { 61 | title = PDFToCRFInput.stringAt(toks, ls.loc); 62 | } else if (ls.tag.equals(authorTag)) { 63 | authors.add(PDFToCRFInput.stringAt(toks, ls.loc)); 64 | } 65 | } 66 | emails = getEmails(toks); 67 | } 68 | 69 | public ExtractedMetadata(String sTitle, List sAuthors, Date cDate) { 70 | title = sTitle; 71 | authors = sAuthors; 72 | if (cDate != null) { 73 | Calendar cal = Calendar.getInstance(); 74 | cal.setTime(cDate); 75 | year = cal.get(Calendar.YEAR); 76 | } 77 | emails = new ArrayList(); 78 | } 79 | 80 | public ExtractedMetadata(Paper p) { 81 | title = p.title; 82 | authors = Arrays.asList(p.authors); 83 | year = p.year; 84 | emails = new ArrayList(); 85 | } 86 | 87 | //assumes token contains @ 88 | public static List tokToMail(String tok) { 89 | ArrayList out = new ArrayList<>(); 90 | if (!tok.contains("@")) { 91 | return null; 92 | } 93 | tok = tok.replaceAll("\\P{Print}", ""); 94 | if (tok.contains(":")) { 95 | if (tok.split(":").length > 1) 96 | tok = tok.split(":")[1]; 97 | } 98 | 99 | String[] parts = tok.split("@"); 100 | 101 | if (parts.length == 2) { 102 | String domain = parts[1]; 103 | String emailStrings = parts[0]; 104 | String[] emails = new String[1]; 105 | if ((emailStrings.startsWith("{") && emailStrings.endsWith("}")) 106 | || (emailStrings.startsWith("[") && emailStrings.endsWith 107 | ("]")) || emailStrings.contains(",") || emailStrings.contains("|")) { 108 | emailStrings = emailStrings.replaceAll("\\{|\\}|\\[|\\]", ""); 109 | emails = emailStrings.split(emailDelimitersRegex.pattern()); 110 | } else { 111 | emails[0] = emailStrings; 112 | } 113 | for (String email : emails) { 114 | out.add(email.trim() + "@" + domain); 115 | } 116 | } else { 117 | log.debug("e-mail parts not 2"); 118 | } 119 | return out; 120 | } 121 | 122 | public static List getEmails(List toks) { 123 | ArrayList out = new ArrayList<>(); 124 | for (PaperToken t : toks) { 125 | if (t.getPdfToken() != null) { 126 | String stT = t.getPdfToken().token; 127 | if (stT != null && stT.contains("@")) 128 | out.addAll(tokToMail(stT)); 129 | } 130 | } 131 | return out; 132 | } 133 | 134 | public static List getSpans(List labels) { 135 | ArrayList out = new ArrayList(); 136 | int st = -1; 137 | String curTag = ""; 138 | for (int i = 0; i < labels.size(); i++) { 139 | String lab = labels.get(i); 140 | if (lab.equals("O")) { 141 | st = -1; 142 | } else if (lab.startsWith("B_")) { 143 | st = i; 144 | curTag = lab.substring(2); 145 | } else if (lab.startsWith("I_")) { 146 | String t = lab.substring(2); 147 | if (!curTag.equals(t)) { //mis-matched tags, do not extract 148 | st = -1; 149 | } 150 | } else if (lab.startsWith("E_")) { 151 | String t = lab.substring(2); 152 | if (curTag.equals(t) && st >= 0) { 153 | LabelSpan ls = new LabelSpan(curTag, (Pair) Tuples.pair(st, i + 1)); 154 | out.add(ls); 155 | st = -1; 156 | } 157 | } else if (lab.startsWith("W_")) { 158 | String t = lab.substring(2); 159 | LabelSpan ls = new LabelSpan(t, (Pair) 
Tuples.pair(i, i + 1)); 160 | out.add(ls); 161 | st = -1; 162 | } 163 | } 164 | return out; 165 | } 166 | 167 | public void setYearFromDate(Date cDate) { 168 | Calendar cal = Calendar.getInstance(); 169 | cal.setTime(cDate); 170 | year = cal.get(Calendar.YEAR); 171 | } 172 | 173 | public String toString() { 174 | StringBuffer out = new StringBuffer("T: " + title + "\r\n"); 175 | authors.forEach((String a) -> out.append("A: " + a + "\r\n")); 176 | emails.forEach((String a) -> out.append("E: " + a + "\r\n")); 177 | return out.toString(); 178 | } 179 | 180 | @RequiredArgsConstructor 181 | public static class LabelSpan { 182 | public final String tag; 183 | public final Pair loc; //(inclusive, exclusive) 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/FallbackPaperSource.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | /** 10 | * A paper source that uses other paper sources, one after the other, to try to locate a paper. 11 | */ 12 | public class FallbackPaperSource extends PaperSource { 13 | private final static Logger logger = 14 | LoggerFactory.getLogger(FallbackPaperSource.class); 15 | 16 | private final PaperSource[] sources; 17 | public FallbackPaperSource(final PaperSource... sources) { 18 | this.sources = sources; 19 | } 20 | 21 | @Override 22 | public InputStream getPdf(final String paperId) throws IOException { 23 | // Try all but the last source. 24 | for(int i = 0; i < sources.length - 1; ++i) { 25 | final PaperSource source = sources[i]; 26 | try { 27 | return source.getPdf(paperId); 28 | } catch (final Exception e) { 29 | logger.info( 30 | "Getting paper {} from source {} failed, {} more sources to try", 31 | paperId, 32 | i, 33 | sources.length - i - 1); 34 | } 35 | } 36 | 37 | // Try the last source. 38 | final PaperSource source = sources[sources.length - 1]; 39 | return source.getPdf(paperId); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/GazetteerFeatures.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.io.Serializable; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | import org.allenai.scienceparse.ExtractedMetadata.LabelSpan; 14 | 15 | import com.gs.collections.impl.map.mutable.primitive.ObjectIntHashMap; 16 | import com.gs.collections.impl.set.mutable.primitive.LongHashSet; 17 | import com.gs.collections.impl.tuple.Tuples; 18 | 19 | import lombok.extern.slf4j.Slf4j; 20 | 21 | /** 22 | * Holds gazetteers of journal names, person names, countries, etc. 23 | * Note: only retains gazetteer entries with length at most MAXLENGTH. 
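 * Membership is tested with 64-bit string hashes (see StringLongHash) kept in one LongHashSet
 * per gazetteer file, rather than by storing the entries themselves.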
24 | * 25 | */ 26 | 27 | @Slf4j 28 | public class GazetteerFeatures implements Serializable { 29 | private static final long serialVersionUID = 1L; 30 | 31 | private LongHashSet [] hashSets; //each element represents a gazetteer, the long hashcodes of contained strings 32 | 33 | private String [] hashNames; 34 | 35 | private static int MAXLENGTH = 7; //maximum length (in words) of any gazetteer entry 36 | 37 | /** 38 | * Reads in string gazetteers, assumed to be one entry per line, one gazetteer per file in given directory. 39 | * @param inDir 40 | * @throws Exception 41 | */ 42 | public GazetteerFeatures(String inDir) throws IOException { 43 | File [] files = (new File(inDir)).listFiles(); 44 | hashSets = new LongHashSet[files.length]; 45 | hashNames = new String[files.length]; 46 | for(int i=0; i=0) { 62 | if(++ct == MAXLENGTH) { 63 | return false; 64 | } 65 | idx = s.indexOf(" ", idx+1); 66 | } 67 | return true; 68 | } 69 | 70 | private LongHashSet readGazetteer(File f) throws IOException { 71 | BufferedReader brIn = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); 72 | String sLine; 73 | LongHashSet out = new LongHashSet(); 74 | while((sLine = brIn.readLine())!=null) { 75 | if(sLine.startsWith("#")||sLine.trim().length()==0) 76 | continue; 77 | if(!withinLength(sLine)) 78 | continue; 79 | out.add(StringLongHash.hash(t(sLine))); 80 | } 81 | brIn.close(); 82 | return out; 83 | } 84 | 85 | public int size() { 86 | return hashSets.length; 87 | } 88 | 89 | public int sizeOfSet(int set) { 90 | return hashSets[set].size(); 91 | } 92 | 93 | public boolean inSet(String s, int i) { 94 | long hc = StringLongHash.hash(t(s)); 95 | return hashSets[i].contains(hc); 96 | } 97 | 98 | //returns whether a string is in each gazetteer 99 | public boolean [] inSet(String s) { 100 | long hc = StringLongHash.hash(t(s)); 101 | boolean [] out = new boolean[hashSets.length]; 102 | Arrays.fill(out, false); 103 | for(int i=0; i ws, int start, int length) { 120 | StringBuffer sb = new StringBuffer(); 121 | for(int i=start;i getSpansForGaz(List ws, int gn) { 128 | ArrayList out = new ArrayList<>(); 129 | for(int i=0; i getSpans(List ws) { 142 | ArrayList out = new ArrayList<>(); 143 | for(int i=0; i papers; 22 | public HashMap lookup = new HashMap<>(); 23 | 24 | private void buildLookup() { 25 | for (int i = 0; i < papers.size(); i++) { 26 | lookup.put(papers.get(i).id.substring(4), i); 27 | } 28 | } 29 | 30 | public ParserGroundTruth(List papers) throws IOException { 31 | this.papers = papers; 32 | buildLookup(); 33 | } 34 | 35 | public ParserGroundTruth(final InputStream is) throws IOException { 36 | try(final BufferedReader reader = 37 | new BufferedReader( 38 | new InputStreamReader(is, "UTF-8"))) { 39 | 40 | ObjectMapper om = new ObjectMapper(); 41 | ObjectReader r = om.reader().forType(new TypeReference() {}); 42 | 43 | papers = new ArrayList(); 44 | while (true) { 45 | final String line = reader.readLine(); 46 | if (line == null) 47 | break; 48 | papers.add(r.readValue(line)); 49 | } 50 | } 51 | 52 | log.info("Read " + papers.size() + " papers."); 53 | 54 | buildLookup(); 55 | papers.forEach((Paper p) -> { 56 | for (int i = 0; i < p.authors.length; i++) 57 | p.authors[i] = invertAroundComma(p.authors[i]); 58 | }); 59 | } 60 | 61 | public ParserGroundTruth(String jsonFile) throws IOException { 62 | this(new FileInputStream(jsonFile)); 63 | } 64 | 65 | public static String invertAroundComma(String in) { 66 | String[] fields = in.split(","); 67 | if (fields.length == 2) 68 | return 
(fields[1] + " " + fields[0]).trim(); 69 | else 70 | return in; 71 | } 72 | 73 | public Paper forKey(String key) { 74 | if (!lookup.containsKey(key)) { 75 | log.info("key not found: " + key); 76 | return null; 77 | } 78 | return papers.get(lookup.get(key)); 79 | } 80 | 81 | @Data 82 | public static class Paper { 83 | String id; 84 | String url; 85 | String title; 86 | String[] authors; 87 | int year; 88 | String venue; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/RegexWithTimeout.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import lombok.NonNull; 4 | 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | 8 | final class RegexWithTimeout { 9 | public static class RegexTimeout extends RuntimeException { } 10 | 11 | public static Matcher matcher(final Pattern pattern, final CharSequence string) { 12 | final long timeout = 1500; //ms 13 | 14 | class TimeoutCharSequence implements CharSequence { 15 | private CharSequence inner; 16 | private long abortTime; 17 | 18 | public TimeoutCharSequence(final CharSequence inner, final long abortTime) { 19 | super(); 20 | this.inner = inner; 21 | this.abortTime = abortTime; 22 | } 23 | 24 | public char charAt(int index) { 25 | if(System.currentTimeMillis() >= abortTime) 26 | throw new RegexTimeout(); 27 | 28 | return inner.charAt(index); 29 | } 30 | 31 | public int length() { 32 | return inner.length(); 33 | } 34 | 35 | public CharSequence subSequence(int start, int end) { 36 | return new TimeoutCharSequence(inner.subSequence(start, end), abortTime); 37 | } 38 | 39 | @NonNull 40 | public String toString() { 41 | return inner.toString(); 42 | } 43 | } 44 | 45 | return pattern.matcher(new TimeoutCharSequence(string, System.currentTimeMillis() + timeout)); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/RetryPaperSource.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | 8 | /** 9 | * An instance of PaperSource that wraps another, and retries a bunch of times to get the paper 10 | */ 11 | @Slf4j 12 | public class RetryPaperSource extends PaperSource { 13 | private final PaperSource inner; 14 | private final int tries; 15 | 16 | public RetryPaperSource(final PaperSource inner) { 17 | this(inner, 3); 18 | } 19 | 20 | public RetryPaperSource(final PaperSource inner, final int tries) { 21 | this.inner = inner; 22 | this.tries = tries; 23 | } 24 | 25 | private void wait(int seconds) { 26 | final long endTime = System.currentTimeMillis() + 1000 * seconds; 27 | while(System.currentTimeMillis() < endTime) { 28 | try { 29 | Thread.sleep(Math.max(endTime - System.currentTimeMillis() + 1, 1)); 30 | } catch(final InterruptedException e) { 31 | // do nothing 32 | } 33 | } 34 | } 35 | 36 | @Override 37 | public InputStream getPdf(final String paperId) throws IOException { 38 | int triesLeft = tries; 39 | int previousWait = 1; 40 | int nextWait = 1; 41 | 42 | while(true) { 43 | triesLeft -= 1; 44 | try { 45 | return inner.getPdf(paperId); 46 | } catch(final IOException e) { 47 | log.warn( 48 | "{} while downloading paper {}, {} tries left", 49 | e.getClass().getSimpleName(), 50 | paperId, 
51 | triesLeft); 52 | if(triesLeft <= 0) { 53 | throw e; 54 | } else { 55 | wait(nextWait); 56 | final int waited = nextWait; 57 | nextWait += previousWait; 58 | previousWait = waited; 59 | } 60 | } 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/ScholarBucketPaperSource.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.amazonaws.services.s3.AmazonS3; 4 | import com.amazonaws.services.s3.AmazonS3Client; 5 | import com.amazonaws.services.s3.model.AmazonS3Exception; 6 | import com.amazonaws.services.s3.model.S3Object; 7 | 8 | import java.io.BufferedInputStream; 9 | import java.io.FileInputStream; 10 | import java.io.IOException; 11 | import java.io.InputStream; 12 | import java.nio.file.CopyOption; 13 | import java.nio.file.Files; 14 | import java.nio.file.Path; 15 | import java.nio.file.StandardCopyOption; 16 | 17 | /** 18 | * Gets papers from the ai2-s2-pdfs bucket 19 | */ 20 | public class ScholarBucketPaperSource extends PaperSource { 21 | // make this a singleton 22 | private static ScholarBucketPaperSource instance = new ScholarBucketPaperSource(); 23 | protected ScholarBucketPaperSource() { } 24 | public static ScholarBucketPaperSource getInstance() { return instance; } 25 | 26 | private final static String bucket = "ai2-s2-pdfs"; 27 | private final static String privateBucket = "ai2-s2-pdfs-private"; 28 | private final static String[] buckets = {bucket, privateBucket}; 29 | private final AmazonS3 s3 = new AmazonS3Client(); 30 | 31 | private S3Object getS3Object(final String paperId) { 32 | final String key = paperId.substring(0, 4) + "/" + paperId.substring(4) + ".pdf"; 33 | 34 | for(int bucketIndex = 0; bucketIndex < buckets.length; ++bucketIndex) { 35 | try { 36 | return s3.getObject(buckets[bucketIndex], key); 37 | } catch (final AmazonS3Exception e) { 38 | if(bucketIndex < buckets.length - 1 && e.getStatusCode() == 404) 39 | continue; // Try again with the next bucket. 40 | 41 | final AmazonS3Exception rethrown = 42 | new AmazonS3Exception( 43 | String.format( 44 | "Error for key s3://%s/%s", 45 | bucket, 46 | key), 47 | e); 48 | rethrown.setExtendedRequestId(e.getExtendedRequestId()); 49 | rethrown.setErrorCode(e.getErrorCode()); 50 | rethrown.setErrorType(e.getErrorType()); 51 | rethrown.setRequestId(e.getRequestId()); 52 | rethrown.setServiceName(e.getServiceName()); 53 | rethrown.setStatusCode(e.getStatusCode()); 54 | throw rethrown; 55 | } 56 | } 57 | 58 | throw new IllegalStateException("We should never get here."); 59 | } 60 | 61 | @Override 62 | public InputStream getPdf(final String paperId) throws IOException { 63 | // We download to a temp file first. If we gave out an InputStream that comes directly from 64 | // S3, it would time out if the caller of this function reads the stream too slowly. 
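    // Note: the temp file is unlinked in the finally block below while the returned stream is
    // still open. This assumes POSIX-style delete semantics, where the underlying bytes remain
    // readable until the stream is closed.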
65 | final S3Object object = getS3Object(paperId); 66 | final Path tempFile = Files.createTempFile(paperId + ".", ".paper.pdf"); 67 | try { 68 | Files.copy(object.getObjectContent(), tempFile, StandardCopyOption.REPLACE_EXISTING); 69 | return new BufferedInputStream(Files.newInputStream(tempFile)); 70 | } finally { 71 | Files.deleteIfExists(tempFile); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/Section.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import lombok.Data; 4 | 5 | @Data 6 | public class Section { 7 | public Section(final String heading, final String text) { 8 | this.heading = heading; 9 | this.text = text; 10 | } 11 | 12 | public String heading; 13 | public String text; 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/StringLongHash.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | /** 4 | * Taken from @sfussenegger 5 | * http://stackoverflow.com/questions/1660501/what-is-a-good-64bit-hash-function-in-java-for-textual-strings 6 | */ 7 | 8 | public class StringLongHash { 9 | 10 | //adapted from String.hashCode() 11 | public static long hash(String string) { 12 | long h = 1125899906842597L; // prime 13 | int len = string.length(); 14 | 15 | for (int i = 0; i < len; i++) { 16 | h = 31*h + string.charAt(i); 17 | } 18 | return h; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/WordVectorCache.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import org.allenai.word2vec.Searcher; 4 | import org.allenai.word2vec.Word2VecModel; 5 | 6 | import java.io.IOException; 7 | import java.nio.file.Path; 8 | import java.util.Map; 9 | import java.util.TreeMap; 10 | 11 | public class WordVectorCache { 12 | private static final Map path2searchers = new TreeMap(); 13 | 14 | public static Searcher searcherForPath(final Path path) throws IOException { 15 | synchronized (path2searchers) { 16 | Searcher result = path2searchers.get(path); 17 | if(result != null) 18 | return result; 19 | 20 | final Word2VecModel word2VecModel = Word2VecModel.fromBinFile(path.toFile()); 21 | result = word2VecModel.forSearch(); 22 | path2searchers.put(path, result); 23 | return result; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFDoc.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import lombok.Builder; 4 | import lombok.Data; 5 | import lombok.experimental.Wither; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | @Data 11 | @Builder 12 | public class PDFDoc { 13 | /** 14 | * Index in the lines of the first page which is the stop (one beyond the last) 15 | * line that makes the header of the document (the title, authors, etc.) 16 | *
17 | * This is < 0 if we can't find an appropriate header/main cut. 18 | */ 19 | @Wither public final List pages; 20 | public final PDFMetadata meta; 21 | 22 | public PDFDoc withoutSuperscripts() { 23 | final List newPages = new ArrayList<>(pages.size()); 24 | for(PDFPage page : pages) 25 | newPages.add(page.withoutSuperscripts()); 26 | return this.withPages(newPages); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFFontMetrics.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import lombok.Data; 4 | import lombok.val; 5 | 6 | import java.util.concurrent.ConcurrentHashMap; 7 | 8 | @Data 9 | public class PDFFontMetrics { 10 | private static final ConcurrentHashMap canonical 11 | = new ConcurrentHashMap<>(); 12 | /** 13 | * The special value for when the underlying font didn't have 14 | * an extractable family name. 15 | */ 16 | public static String UNKNWON_FONT_FAMILY = "*UNKNOWN*"; 17 | public final String name; 18 | public final float ptSize; 19 | public final float spaceWidth; 20 | 21 | /** 22 | * Ensures one font object per unique font name 23 | * 24 | * @param name 25 | * @param ptSize 26 | * @param spaceWidth 27 | * @return 28 | */ 29 | public static PDFFontMetrics of(String name, float ptSize, float spaceWidth) { 30 | val fontMetrics = new PDFFontMetrics(name, ptSize, spaceWidth); 31 | val curValue = canonical.putIfAbsent(name, fontMetrics); 32 | return curValue != null ? curValue : fontMetrics; 33 | } 34 | 35 | public String stringRepresentation() { 36 | return String.format("%s-%f", name, ptSize); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFLine.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import com.gs.collections.api.list.primitive.FloatList; 4 | import com.gs.collections.impl.list.mutable.primitive.FloatArrayList; 5 | import lombok.Builder; 6 | import lombok.Data; 7 | import lombok.experimental.Wither; 8 | import lombok.val; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.regex.Pattern; 13 | import java.util.stream.Collectors; 14 | import java.util.stream.DoubleStream; 15 | 16 | /** 17 | * Immutable value class representing a single contiguous line of a PDF. A contiguous line means 18 | * a sequence of tokens/glyphs which are intended to be read sequentially. For instance, a two column 19 | * paper might have two lines at the same y-position. 20 | */ 21 | @Builder 22 | @Data 23 | public class PDFLine { 24 | @Wither public final List tokens; 25 | 26 | private DoubleStream projectCoord(int dim) { 27 | return tokens.stream().mapToDouble(t -> t.bounds.get(dim)); 28 | } 29 | 30 | /** 31 | * (0,0) origin bounds [x0,y0, x1, y1] for the entire line. 
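   * (In the implementation below, this is the per-dimension min/max over the bounds of all
   * tokens on the line.)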
32 | * Should 33 | */ 34 | public FloatList bounds() { 35 | float x0 = (float) projectCoord(0).min().getAsDouble(); 36 | float y0 = (float) projectCoord(1).min().getAsDouble(); 37 | float x1 = (float) projectCoord(2).max().getAsDouble(); 38 | float y1 = (float) projectCoord(3).max().getAsDouble(); 39 | return FloatArrayList.newListWith(x0, y0, x1, y1); 40 | } 41 | 42 | public float height() { 43 | val bs = bounds(); 44 | return bs.get(3) - bs.get(1); 45 | } 46 | 47 | public String lineText() { 48 | return tokens.stream().map(PDFToken::getToken).collect(Collectors.joining(" ")); 49 | } 50 | 51 | public double avgFontSize() { 52 | return tokens.stream().mapToDouble(t -> t.getFontMetrics().getPtSize()).average().orElse(0.0); 53 | } 54 | 55 | public PDFLine withoutSuperscripts() { 56 | final List newTokens = new ArrayList<>(tokens.size()); 57 | for(PDFToken token : tokens) { 58 | final String newTokenText = token.token.replaceAll("⍐[^⍗]*⍗", ""); 59 | if(!newTokenText.isEmpty()) 60 | newTokens.add(token.withToken(newTokenText)); 61 | } 62 | return this.withTokens(newTokens); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFMetadata.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.fasterxml.jackson.databind.ObjectWriter; 5 | import lombok.Builder; 6 | import lombok.Data; 7 | import lombok.SneakyThrows; 8 | import lombok.val; 9 | 10 | import java.io.FileInputStream; 11 | import java.io.InputStream; 12 | import java.util.Date; 13 | import java.util.List; 14 | 15 | /** 16 | * Immutable class representing information obtained from scanning for PDF 17 | * meta-data. Many pdf creation programs (like pdflatex) will actuallly output 18 | * information like these fields which substantially aids downstream extraction. 
19 | */ 20 | @Builder 21 | @Data 22 | public class PDFMetadata { 23 | public final String title; 24 | public final List authors; 25 | public final List keywords; 26 | public final Date createDate; 27 | public final Date lastModifiedDate; 28 | public final String creator; 29 | 30 | // HACK(aria42) For external testing purpose 31 | @SneakyThrows 32 | public static void main(String[] args) { 33 | val extractor = new PDFExtractor(); 34 | ObjectWriter ow = new ObjectMapper().writer(); 35 | if (args.length <= 1) 36 | ow = ow.withDefaultPrettyPrinter(); 37 | for (final String arg : args) { 38 | String prefix = ""; 39 | if (args.length > 1) 40 | prefix = arg + "\t"; 41 | try (InputStream pdfInputStream = new FileInputStream(arg)) { 42 | try { 43 | PDFMetadata meta = extractor.extractFromInputStream(pdfInputStream).getMeta(); 44 | String json = ow.writeValueAsString(meta); 45 | System.out.println(prefix + json); 46 | } catch (final Exception e) { 47 | System.out.println(prefix + "ERROR: " + e); 48 | } 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFPage.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import lombok.Builder; 4 | import lombok.Data; 5 | import lombok.experimental.Wither; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | @Builder 11 | @Data 12 | public class PDFPage { 13 | @Wither public final List lines; 14 | public final int pageNumber; 15 | public final int pageWidth; 16 | public final int pageHeight; 17 | 18 | public PDFPage withoutSuperscripts() { 19 | final List newLines = new ArrayList<>(lines.size()); 20 | for(PDFLine line : lines) { 21 | final PDFLine newLine = line.withoutSuperscripts(); 22 | if(!newLine.tokens.isEmpty()) 23 | newLines.add(newLine); 24 | } 25 | return this.withLines(newLines); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PDFToken.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import com.gs.collections.api.list.primitive.FloatList; 4 | import lombok.Builder; 5 | import lombok.Value; 6 | import lombok.experimental.Wither; 7 | 8 | @Builder 9 | @Value 10 | public class PDFToken { 11 | @Wither public final String token; 12 | public final PDFFontMetrics fontMetrics; 13 | /** 14 | * List of ints [x0, y0, x1, y1] where [0,0] is upper left 15 | */ 16 | public final FloatList bounds; 17 | } 18 | -------------------------------------------------------------------------------- /core/src/main/java/org/allenai/scienceparse/pdfapi/PdfDocExtractionResult.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pdfapi; 2 | 3 | import lombok.Builder; 4 | import lombok.Data; 5 | 6 | @Builder 7 | @Data 8 | public class PdfDocExtractionResult { 9 | public final PDFDoc document; 10 | public final boolean highPrecision; 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/resources/golddata/isaac/bib-years.tsv: -------------------------------------------------------------------------------- 1 | 15aa277b1054cdcdf7fc018e3a3abe2df7a1691b 2002 2008 2008 2007 2005 2004 1988 1998 2006 2005 2005 2008 2008 2 | 
ddf1ea128fbc14a203c3b3d44d0135bb4dc33ffe 2009 1996 2005 2005 1985 2000 1997 2005 2005 2004 1970 2001 2004 2009 1998 2000 1998 1964 2004 2003 1973 1991 2000 2007 1993 2004 1966 2002 3 | 97a6613da7eeb0ac4f70ae2e3b793364c794e6dc 1992 1997 2002 2000 2005 2006 2006 2001 1986 2003 1999 1999 1998 2004 2003 1999 1998 2006 0 2003 4 | 13050adb7aa8aaf2e1c38f2b2c7e3d070358d261 2000 1992 2005 2003 1996 1995 2007 2009 2009 1974 1989 1978 2010 2003 2007 2008 5 | d3ba6b48f62e2fe1802efb46c3799362572eeb1d 1960 1968 2001 1995 1998 2002 2000 2001 2005 1996 1991 1991 1988 2003 2002 2002 1995 2003 2002 1977 1963 2000 2000 1975 2000 1999 2001 1998 2000 1967 2003 2002 1999 2005 6 | fd906edf5833d0d506a220097ced14a03ff40a73 2009 2004 1999 2002 2009 1994 2003 1989 1997 1999 1977 2007 7 | 45705e7cf3337e2469b5dbcaa31579c28bde89d1 2014 2007 1950 2014 2011 2013 2015 2010 2006 2010 2012 2014 2013 2014 2014 2013 2014 1981 2012 2009 2005 2012 2014 2004 2012 8 | 41b3a5272d0c8f98b73e7275481cd917802ad8b7 2007 2009 2009 2007 2007 2012 2013 2009 2012 2006 2008 2005 2009 2012 1998 1985 9 | b5e6da04c35a586609a46bbbd7b1ad031a658b08 2004 2009 2008 2008 2010 1981 1984 1995 2001 1989 1970 2007 1937 2009 2007 2008 1969 2007 1999 2004 2010 2005 2005 2007 2007 2009 1970 1970 1970 2003 2009 1971 2010 2005 10 | 0eb7343cdd90265282bd261c0e48cf2fd73ea465 1976 2000 1999 2004 2001 1994 1995 2002 2003 1998 2005 2003 2001 1999 2003 2003 2000 2004 2004 2001 2001 2001 2003 2004 2002 2004 2002 1998 1986 2001 11 | 030cadedef2370bd296af07fc3324c6bb8409ba5 1997 1979 1993 1998 1994 1974 1994 1998 1961 1994 1997 1991 1995 2000 1984 1995 12 | 31368c6398a34b489f78708039177d858b171d13 2001 1990 2007 2007 2008 2007 2010 2010 2006 1998 2001 2002 2007 2007 2009 2009 2009 2007 2008 2002 2005 2009 2009 2006 2010 2003 2006 2007 2010 1992 1945 2002 2000 13 | 6ee7d70f2dbfc0d45fbf20485f82a9ed7e175725 2009 2012 2014 2011 0 2007 2012 2013 2013 2013 2011 2011 2011 2012 2009 2014 2013 1989 1998 2014 2013 2014 2014 2014 2014 2014 2014 2013 2006 2004 2001 2013 1999 2014 2013 2008 14 | 5268d3d7f15ffa9c6a904a138b2b2794263c856e 2002 2007 2005 1985 2002 2002 2002 2005 2001 1982 1997 2005 2002 0 1998 1983 1985 1975 15 | 3521f22e34fef8a53d55df180a76df5a7a4e7f87 2004 2000 2005 2006 1997 1997 2007 2002 2006 2007 2005 2006 2006 2004 2005 1998 1990 2004 2006 2006 2007 16 | bcae858633935c727739a73447b50b40b7c52794 2011 2010 2011 2011 2009 2010 2004 1977 2012 2006 2005 2012 2013 2005 2012 2012 2013 2013 2008 17 | 6f9167ddb392a43a7e36a2df1feefa184d82763e 2007 2007 2010 2010 0 2013 2011 0 2015 2012 2012 2014 2011 2009 2006 2010 2007 2010 2002 2013 2003 2009 2012 2013 2013 1979 2009 2001 2015 2015 2013 1953 2007 2011 1998 0 0 2011 2011 2014 2010 2008 2014 18 | 8ecd49c474b701f69d962f8337490c7f342266c3 2010 2011 2008 2010 2011 2010 19 | bdb1b4128730838eb2fed83829f46a9077eca9f7 2010 2011 2010 2012 1955 2011 2014 2007 2001 2001 2000 1969 1997 1950 1973 1988 1992 2008 2012 2012 2013 2013 2014 2009 20 | a98488969aed4d6add1115ce18c19c89b4826a92 1990 2009 2009 2003 2004 2009 2008 2011 2001 2012 2006 2009 2005 1997 1999 1996 1995 1998 2004 2008 2011 2011 2012 1999 2011 2009 2009 2004 2007 2001 1978 1979 1970 21 | 2942516df2695e73365e78a51e8bbe9ea1397f8f 2001 2003 2009 2007 2005 2008 2010 1994 2003 2000 2009 2004 1999 1980 2004 1991 1998 2008 2008 1997 1994 2003 1980 1995 1996 2005 22 | 9ba994ddf01d2431c6c8de129b3a5a7797e5a5e6 2010 2013 2000 2013 2014 2006 0 0 2015 2013 2010 2013 2008 2006 0 2012 1998 0 0 2004 2004 0 2006 2012 1986 1995 23 | a14feb1f5d1f35815eca17c91365a728a27ade94 2004 2001 
2008 2003 2003 2010 2010 1967 2003 2004 2008 2005 2005 2003 2005 2003 2012 2010 2012 2008 24 | 2e939ed3bb378ea966bf9f710fc1138f4e16ef38 2004 2009 2011 2001 2012 2014 2001 2002 1981 1973 2006 2012 2013 1990 1996 2009 2013 2001 2003 1998 2010 2008 2013 2014 2000 2011 2004 1998 2000 2012 2009 1996 25 | 2a2b8a525eae19087cc0248a45a8e17de44b021f 2000 2010 2008 2011 2010 2008 2008 2011 2010 2011 2012 2012 1995 2002 2005 2010 2007 2009 0 2009 2007 2009 1976 2003 26 | 765f6ca92d5c228847c2ceb37b756ecf980c95a4 1995 1991 2008 2010 2006 2006 1978 1980 1991 1996 2003 2007 1995 27 | 10a1e6233fce78a5c6bd3a40cca3e9298da55abe 1989 1992 1997 1998 2000 2002 2002 2003 1992 1999 2003 1999 1990 1977 2001 2000 1976 1985 1990 1999 1998 2001 2000 1993 2000 2000 2003 2001 1998 0 2001 1990 1997 2002 1998 2001 2003 1995 2001 1989 1997 2001 1992 1999 2002 2002 1992 1999 2002 1996 1997 1999 1990 1981 2003 28 | 1ad4974c4d79b00c890bd2dd1562600bd9c7e2bd 1998 1999 1995 2005 2005 1992 2005 2004 1960 1999 2000 1997 1998 2004 1993 2000 2001 1998 2004 1992 2005 1977 2003 1992 1990 2000 2005 2007 2003 2003 1980 2001 1994 1993 2005 2003 1999 2001 2002 2006 1980 2002 1989 1987 2006 2002 1995 1969 1979 2001 29 | 45099df43a2692bb4eea8ec12b331bb827403d57 1987 1991 1987 1992 1994 1989 1992 1993 1993 1990 1991 1991 30 | 9e59d03fbb534832ced523250ee429f41893ab39 2006 2007 2006 2006 1985 2006 2004 2004 2003 1976 1998 1994 1993 2007 2005 31 | 2774393ecb042926ba7fa6957841853ffff0396d 1997 1998 1996 1997 1994 1999 1997 1997 1997 1990 1988 1987 1995 1992 1997 1997 1997 1983 1997 1995 32 | aeb717fbb9aac3501236bce498cbf8b98f5d8926 0 0 0 0 0 0 0 0 0 0 0 0 1952 0 1979 0 0 0 0 0 1977 33 | 5f17cac51538fc860379e2a4887586757be6182e 2005 2003 2007 2007 2007 2002 2006 2005 2007 2008 2008 2008 2008 2010 2002 2002 1997 2003 2007 1989 34 | 520633c68777988873f5aa011df45a5289c04217 1978 1993 1996 1996 2003 1993 1977 2011 1986 2009 0 0 2008 1977 2008 1981 2011 2012 0 35 | 01ce0903206717ac40f9a26ce9478bdeff5c1262 2006 2008 2010 2010 2011 2010 2006 2007 36 | 2dd3138ebdaad92ace967fe7d2842937a5c729e4 2005 37 | 05d1cee851e2d900ab78b3542237af4db84590fd 1989 1991 1979 1992 1994 1995 1989 38 | d9f69149832fa4ceb0ab3b0285863ae2c8977d2e 1995 2009 1997 2009 2004 1991 1994 1988 2008 2002 2008 2006 2002 2007 2006 2009 1978 39 | 50fb3680934ae424e4c73c18e9928e9741171d5b 1998 2006 2006 1998 2000 2003 2003 1996 2006 2006 2004 2006 1990 1986 2002 1999 2000 2000 1998 1997 2001 2004 1998 2005 1998 40 | 246cd6090088e84f7b7ce85f94ad1969ad860118 0 2006 2005 2006 2003 2005 2002 1941 2002 2000 2006 41 | 152119fe816def89f825ac5b56e85b16530d0bbe 2009 1997 2011 2013 2013 2011 2001 1935 1999 2004 2007 1988 2000 1999 1949 2012 42 | 4699e2c09244f3496b1c202925618ccf732a617d 2002 2004 1991 1994 2009 1998 1992 2007 1990 1997 2006 0 2005 2005 2008 2008 2008 2008 1999 2000 2005 2009 2009 2010 2006 2006 2004 2003 1996 2001 1991 1979 1980 1994 43 | 7e2677bb7c4b1c1bcb7bd89c79cfc17a9d955f48 1985 1973 1999 1990 1994 1980 1838 1845 1980 2005 1972 1973 1994 1976 1953 1972 1975 1978 1978 1980 1995 2000 1991 2004 2003 2004 2006 1995 1996 2006 1997 2007 1979 2008 2014 2010 2011 1974 1992 1997 2005 44 | 4a59d7877aba1aa4f5a94758030f99989b2989dc 1996 2000 1996 2000 2012 2011 2011 2009 2004 2007 2005 2007 2009 2009 2005 2003 2010 2010 2010 2010 2010 2010 2009 1989 2002 2009 1996 2012 2012 1975 2012 2000 45 | 198dcf518298e0afe39d005da5320cfe840480c1 2013 1995 1982 1991 2013 1990 2012 2009 2011 2004 2008 2013 1997 2009 1971 2012 2008 2011 1973 1997 2013 1974 2010 2006 2003 1987 46 | 
1e1ea21612a781634ae21271372b77cd19d09212 1999 2003 1990 2007 1971 2008 2005 2008 2013 2003 1993 1993 47 | d7e464a1e466fb04e72a961c24d10ecf65c20890 1999 1999 1997 1952 2004 1997 2000 1995 1996 2002 2009 2000 2003 2002 2009 2007 2007 1997 2007 2001 1998 2006 1995 1965 2005 2002 1998 1996 1973 1997 2008 2001 1976 2010 1984 2008 2002 2003 1993 2006 2006 2008 2006 1992 2001 2005 1998 1999 2001 2003 2007 2005 1999 2008 2000 1975 2001 2008 2007 2005 2005 1999 1993 1997 1989 1994 1999 2007 1988 1985 1994 1991 2009 2009 1998 2003 2008 2002 1968 2007 1981 2007 1997 1997 1983 2002 2006 1994 1996 1997 1991 2001 2007 1948 1998 1991 1966 2000 1989 1969 1997 2000 2001 2002 2004 2007 2008 2004 1965 48 | b58e020f6c9c834ba83f07322adc75f3756e087f 2013 2013 2007 2000 2013 2013 49 | 5c7eebc8ba8fe8df00f54496ab743ede61314419 1763 1979 1952 1990 2005 1975 2007 2005 2006 1984 1993 1974 1983 2004 2001 2003 2003 2003 2004 2006 2003 1946 1961 1965 1998 1996 1812 1974 1997 1976 2004 1998 2006 2004 2006 1983 1989 2002 2002 2004 1964 1978 1999 1996 2005 1997 1970 50 | 8ab8bb8aa41151b85ab367f1152ff17cc17b87d1 1974 1994 2007 2008 2009 1980 1985 1999 1972 1967 2004 1996 1999 2003 1930 1999 2005 2005 2005 2005 1971 1982 1973 1996 2008 51 | 03a7d6e56d70018b355f4b3ee0f8c8a240bf89a2 2006 2005 2005 2005 2001 1998 2000 2001 2000 2006 2002 2000 2006 1997 2003 1998 2006 2004 2007 52 | 8c1c76248e128c7e5789c7f0a9a89fbba17e4c11 2005 2014 2006 2007 2009 2006 2013 2011 2010 2005 1998 2006 2010 2012 53 | 3717dc0dba9fdb13d1459ed4edf7955dce2e06b3 1991 1998 1996 1996 1977 1995 1977 1997 1997 1993 1999 1998 54 | -------------------------------------------------------------------------------- /core/src/main/resources/golddata/isaac/bibs-to-tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import csv 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='Convert bibliography data into an empty TSV file for annotators to fill ' 7 | 'in with citation mention data') 8 | # bibliographies.tsv is the file inside this very directory containing high quality human-annotated bibliography data 9 | # Note that bibliographies.tsv itself is generated from the scholar project: 10 | # scholar-project/pipeline/src/main/resources/ground-truths/bibliographies.json 11 | parser.add_argument('-i', metavar='INPUT.TSV', default='bibliographies.tsv', help='Filename for bibliography TSV data') 12 | parser.add_argument('-o', metavar='OUTPUT.TSV', default='mentions-blank.tsv', help='Filename for empty TSV') 13 | args = parser.parse_args() 14 | 15 | with open(args.i) as bibs, open(args.o, 'w') as mentions: 16 | bibs = csv.reader(bibs, delimiter='\t') 17 | mentions = csv.writer(mentions, delimiter='\t') 18 | mentions.writerow(["Paper ID", "URL", "Bib Entry", "Context", "Context Reference"]) 19 | for paper in bibs: 20 | if paper: 21 | id, entries = paper[0], paper[1:] 22 | for i, entry in enumerate(entries): 23 | title, year, venue, authors = entry.split('|') 24 | authors = authors.split(':') 25 | mentions.writerow([id,"https://www.semanticscholar.org/paper/{0}".format(id), "[{0}] {1}. 
{2}".format(i + 1, ', '.join(authors), year), "", ""]) 26 | -------------------------------------------------------------------------------- /core/src/main/resources/golddata/isaac/import_bib_gold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # format scholar-project/pipeline/src/main/resources/ground-truths/bibliographies.json as valid JSON before running this 3 | # script inside that directory 4 | 5 | from jsonsempai import magic 6 | import bibliographies 7 | 8 | papers = bibliographies.bibs 9 | 10 | def refToStr(ref): # edit as necessary to include only authors/years/venues/etc. 11 | return ref.title.text + "|" + str(ref.year) + "|" + ref.venue.name + "|" + ":".join([" ".join([a.firstName] + a.middleNames + [a.lastName]) for a in ref.authors]) 12 | 13 | def paperToStr(paper): 14 | return "\t".join([paper.id] + [refToStr(ref) for ref in paper.refs]) 15 | 16 | with open('bibliographies.tsv', 'w') as f: 17 | for paper in papers: 18 | f.write(paperToStr(paper).encode('utf-8') + "\n") 19 | -------------------------------------------------------------------------------- /core/src/main/resources/golddata/isaac/tsv-to-gold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import csv 4 | import re 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='After annotators fill in empty TSV file generated by bibs-to-tsv.py, ' 8 | 'this converts the filled-in TSV data into the format used for other TSV ' 9 | 'gold data') 10 | parser.add_argument('-i', metavar='INPUT.TSV', default='mentions-filled.tsv', help='Filename for filled-in TSV') 11 | parser.add_argument('-o', metavar='OUTPUT.TSV', default='mentions.tsv', help='Filename for final gold TSV') 12 | args = parser.parse_args() 13 | 14 | with open(args.i) as mentions, open(args.o, 'w') as gold_writer: 15 | mentions = csv.reader(mentions, delimiter='\t') 16 | next(mentions) # skip header row 17 | gold_writer = csv.writer(gold_writer, delimiter='\t') 18 | gold = {} 19 | for paper, _, _, context, mention in mentions: 20 | if paper not in gold: 21 | gold[paper] = [] 22 | cleaned_context = re.sub(r'\s+', ' ', context.strip()) 23 | if not cleaned_context: 24 | continue 25 | gold[paper].append("{0}|{1}".format(cleaned_context, re.sub(r'[()]', '', mention))) 26 | for paper, bib_entries in gold.items(): 27 | if len(bib_entries) > 0: 28 | gold_writer.writerow([paper] + bib_entries) -------------------------------------------------------------------------------- /core/src/main/resources/opennlp/tools/tokenize/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/main/resources/opennlp/tools/tokenize/en-token.bin -------------------------------------------------------------------------------- /core/src/main/resources/org/allenai/scienceparse/pipeline/highfreq.tsv: -------------------------------------------------------------------------------- 1 | a_simple 12720690 2 | a_dynamic 3753200 3 | dynamics_of 1532890 4 | determination_of 1777002 5 | investigation_of 1961935 6 | performance_of 8679010 7 | design_of 49726446 8 | estimation_of 8083152 9 | a_multi 10658088 10 | a_statistical 1095776 11 | an_efficient 50271243 12 | use_of 8632806 13 | an_optimal 1618386 14 | visualization_of 1070969 15 | the_effects 5530896 16 | a_fast 10222655 17 | a_probabilistic 1931745 18 | 
synthesis_of 2068747 19 | a_scalable 1899108 20 | on_line 3244472 21 | application_of 41340376 22 | control_of 2037770 23 | stability_of 1756670 24 | recognition_of 1007944 25 | simulation_of 4727016 26 | a_data 1264830 27 | an_adaptive 8786936 28 | toward_a 2473704 29 | a_general 5332851 30 | a_bayesian 1330236 31 | applications_of 2929491 32 | effects_of 27700900 33 | an_integrated 5452943 34 | a_web 1050192 35 | measuring_the 1450275 36 | a_neural 1214904 37 | the_evolution 1135497 38 | multi_agent 1027299 39 | assessing_the 1886130 40 | the_impact 17252433 41 | a_practical 2301740 42 | low_complexity 1198054 43 | a_class 1027026 44 | evaluation_of 59043315 45 | the_complexity 2251237 46 | the_role 26007145 47 | what_is 2925672 48 | a_parallel 3641565 49 | an_experimental 2116674 50 | real_time 50445213 51 | a_robust 3952640 52 | extraction_of 1018251 53 | a_high 3576455 54 | exploring_the 3538080 55 | advances_in 2604354 56 | implementation_of 10742238 57 | a_unified 3918522 58 | state_of 3048066 59 | a_comparison 19333800 60 | low_power 1194318 61 | a_self 1060920 62 | evolution_of 2981682 63 | review_of 4298535 64 | lower_bounds 1005609 65 | the_design 5208700 66 | a_non 1036504 67 | development_and 1553940 68 | fast_and 1592184 69 | data_mining 1549602 70 | building_a 1888656 71 | evaluating_the 2875506 72 | modeling_of 1830630 73 | model_based 4964960 74 | a_two 2270310 75 | a_framework 33836573 76 | research_on 4838940 77 | a_computational 1634932 78 | the_use 6342888 79 | how_to 21718515 80 | the_development 1297113 81 | the_application 1091897 82 | analysis_and 4345255 83 | using_a 1959879 84 | a_hierarchical 1344020 85 | prediction_of 2722464 86 | characterization_of 6124240 87 | development_of 39929760 88 | generation_of 1629189 89 | modeling_and 10579272 90 | a_hybrid 13076028 91 | analysis_of 134179470 92 | neural_network 1384011 93 | towards_an 3914460 94 | high_performance 3483712 95 | an_architecture 1631500 96 | fault_tolerant 1160492 97 | in_the 1368941 98 | an_approach 9797140 99 | a_fuzzy 1988224 100 | an_algorithm 4011406 101 | study_of 4196997 102 | book_review 1114371 103 | a_note 31770520 104 | introduction_to 21261555 105 | study_on 2350493 106 | understanding_the 1899040 107 | a_low 4074049 108 | developing_a 1389696 109 | a_case 4785264 110 | a_new 383773671 111 | assessment_of 2821824 112 | large_scale 3419310 113 | the_influence 2529252 114 | algorithms_for 3336032 115 | classification_of 4431254 116 | construction_of 2663066 117 | estimating_the 1313070 118 | modeling_the 2438241 119 | learning_to 3248856 120 | an_investigation 1475595 121 | three_dimensional 2384190 122 | effect_of 18819000 123 | a_flexible 1281834 124 | comments_on 3485950 125 | energy_efficient 4985232 126 | performance_analysis 18339839 127 | an_introduction 2103116 128 | an_evaluation 2941470 129 | impact_of 14113162 130 | role_of 1418820 131 | influence_of 3787980 132 | integration_of 4965832 133 | a_generic 1558388 134 | optimization_of 6624849 135 | a_formal 2443825 136 | an_improved 11404888 137 | detection_of 9207810 138 | a_distributed 4994100 139 | a_methodology 1508225 140 | proceedings_of 15661230 141 | performance_evaluation 10518768 142 | a_study 31082424 143 | a_model 16993140 144 | a_system 1834632 145 | model_checking 1130500 146 | on_the 2050795697 147 | special_issue 1762722 148 | identification_of 10424040 149 | a_novel 110788635 150 | towards_the 1676733 151 | machine_learning 1310518 152 | agent_based 1434524 153 | comparison_of 28484820 154 | a_method 8919118 155 | 
on_a 9368724 156 | object_oriented 1596160 157 | a_review 3881448 158 | using_the 2515350 159 | measurement_of 1337719 160 | an_overview 2984265 161 | from_the 2218287 162 | an_analysis 6661556 163 | the_effect 16498944 164 | an_empirical 7243839 165 | a_survey 15373729 166 | verification_of 1178220 167 | an_application 1608600 168 | an_effective 1223620 169 | design_and 52970592 170 | a_comparative 6703872 171 | improving_the 7066752 172 | a_generalized 1330711 173 | a_real 1238160 174 | overview_of 2647164 175 | towards_a 49618212 176 | an_overview_of 2566320 177 | a_review_of 2284447 178 | a_case_study 1537596 179 | on_the_use 1848079 180 | the_evolution_of 1003312 181 | a_new_approach 5122026 182 | a_study_of 10074025 183 | analysis_of_the 2585856 184 | a_comparative_study 2961265 185 | introduction_to_the 1544796 186 | a_study_on 4248972 187 | a_novel_approach 1431635 188 | performance_analysis_of 13852134 189 | an_application_of 1011780 190 | a_method_for 3571193 191 | development_of_a 5741686 192 | a_new_method 1279488 193 | an_introduction_to 2002362 194 | a_note_on 30272320 195 | the_complexity_of 2138344 196 | the_use_of 6072724 197 | a_model_for 2191425 198 | a_framework_for 26200476 199 | a_survey_of 6222890 200 | a_survey_on 1445036 201 | the_effect_of 16211580 202 | an_evaluation_of 2148589 203 | the_impact_of 16671875 204 | design_of_a 4213936 205 | the_effects_of 5397246 206 | the_design_of 1383060 207 | a_comparison_of 15086610 208 | an_analysis_of 5500050 209 | the_role_of 25140654 210 | on_the_complexity 1898598 211 | an_empirical_study 1371527 212 | an_approach_to 4261494 213 | a_model_of 1413822 214 | the_influence_of 2448115 215 | an_algorithm_for 2530008 216 | proceedings_of_the 12292616 217 | design_and_implementation 7314000 218 | performance_evaluation_of 8114553 219 | a_new_approach_to 1996090 220 | on_the_use_of 1809780 221 | on_the_complexity_of 1797614 222 | design_and_implementation_of 6962494 223 | a_comparative_study_of 1840080 224 | a_note_on_the 2921148 225 | design_and_implementation_of_a 1076582 226 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/BibTraining.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.{ File, FileInputStream } 4 | 5 | import com.gs.collections.impl.set.mutable.UnifiedSet 6 | import org.allenai.common.{ Logging, Resource } 7 | import org.allenai.datastore.Datastores 8 | import org.allenai.scienceparse.Parser.ParseOpts 9 | import scopt.OptionParser 10 | 11 | import scala.collection.JavaConverters._ 12 | import scala.io.Source 13 | 14 | object BibTraining extends App with Datastores with Logging { 15 | // The Files are all Option[File] defaulting to None. Properly, they should be set to the 16 | // defaults from the datastore, but if we do that here, they will download several gigabytes 17 | // of files during startup, even if they are unused later. 
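  // Illustrative invocation (paths and values here are hypothetical; only --output is
  // required, the remaining options fall back to the datastore defaults mentioned above):
  //
  //   sbt "core/runMain org.allenai.scienceparse.BibTraining -o bib-model.dat -t /data/bib-gold --maxIterations 100"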
18 | case class Config( 19 | output: File = null, 20 | groundTruth: Option[File] = None, 21 | maxIterations: Int = 150, 22 | backgroundSampleDocs: Int = 4000, 23 | backgroundDirectory: Option[File] = None, 24 | gazetteerFile: Option[File] = None, 25 | trainFraction: Double = 0.9, 26 | minExpectedFeatureCount: Int = 1 27 | ) 28 | 29 | val parser = new OptionParser[Config](this.getClass.getSimpleName) { 30 | head("Options that are not specified default to the settings that were used to make the production model.") 31 | 32 | opt[File]('o', "output") required () action { (o, c) => 33 | c.copy(output = o) 34 | } text "The output file" 35 | 36 | opt[File]('t', "groundTruth") action { (t, c) => 37 | c.copy(groundTruth = Some(t)) 38 | } text "The ground truth directory" 39 | 40 | opt[Int]("maxIterations") action { (i, c) => 41 | c.copy(maxIterations = i) 42 | } text "Maximum number of iterations during training" 43 | 44 | opt[Int]("backgroundSampleDocs") action { (d, c) => 45 | c.copy(backgroundSampleDocs = d) 46 | } text "The number of documents to use to build the background language model" 47 | 48 | opt[File]("backgroundDirectory") action { (d, c) => 49 | c.copy(backgroundDirectory = Some(d)) 50 | } text "The directory in which the background documents are found" 51 | 52 | opt[File]('g', "gazetteerFile") action { (f, c) => 53 | c.copy(gazetteerFile = Some(f)) 54 | } text "The gazetteer file" 55 | 56 | opt[Double]("trainFraction") action { (f, c) => 57 | c.copy(trainFraction = f) 58 | } text "The fraction of the ground truth to use for training" 59 | 60 | opt[Int]("minExpectedFeatureCount") action { (n, c) => 61 | c.copy(minExpectedFeatureCount = n) 62 | } text "The minimum number of times we should see a feature before accepting it." 63 | 64 | help("help") text "Prints help text" 65 | } 66 | 67 | parser.parse(args, Config()).foreach { config => 68 | val groundTruthDirectory = 69 | config.groundTruth.getOrElse(publicDirectory("productionBibGroundTruth", 2).toFile) 70 | 71 | val opts = new ParseOpts 72 | opts.modelFile = config.output.toString 73 | opts.iterations = config.maxIterations 74 | opts.threads = Runtime.getRuntime.availableProcessors() * 2 75 | opts.backgroundSamples = config.backgroundSampleDocs 76 | 77 | val backgroundDirectory = 78 | config.backgroundDirectory.getOrElse(publicDirectory("productionBackgroundDocs", 1).toFile) 79 | opts.backgroundDirectory = backgroundDirectory.toString 80 | 81 | val gazetteerFile = config.gazetteerFile.getOrElse(Parser.getDefaultGazetteer.toFile) 82 | opts.gazetteerFile = gazetteerFile.toString 83 | 84 | opts.trainFraction = config.trainFraction 85 | opts.minExpectedFeatureCount = config.minExpectedFeatureCount 86 | 87 | Parser.trainBibliographyCRF(groundTruthDirectory, opts) 88 | 89 | logger.info(s"New model at ${opts.modelFile}") 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/CachedGrobidServer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.{InputStream, ByteArrayInputStream, IOException} 4 | import java.net.{SocketTimeoutException, URL} 5 | import java.nio.file.{NoSuchFileException, Paths, Files} 6 | import java.util.zip.{GZIPOutputStream, GZIPInputStream} 7 | 8 | import org.allenai.common.{Logging, Resource} 9 | import org.allenai.datastore.Datastores 10 | import org.apache.commons.io.{FileUtils, IOUtils} 11 | 12 | import scala.util.control.NonFatal 
13 | import scala.util.{Success, Failure, Try, Random} 14 | import scalaj.http.{Http, MultiPart, HttpResponse} 15 | 16 | 17 | class CachedGrobidServer(url: URL) extends Logging with Datastores { 18 | private val cacheDir = { 19 | val dirName = url.toString.replaceAll("[^\\w-.]+", "#") 20 | Files.createDirectories(CachedGrobidServer.cacheDir) 21 | val dir = CachedGrobidServer.cacheDir.resolve(dirName) 22 | if(!Files.exists(dir)) { 23 | // Warm the cache, so for most evaluations we don't need to have a running Grobid server at 24 | // all. 25 | val warmCacheDir = publicDirectory("GrobidServerCache", 2) 26 | FileUtils.copyDirectory(warmCacheDir.toFile, dir.toFile) 27 | } 28 | dir 29 | } 30 | 31 | private val random = new Random 32 | /** Gets a response from an HTTP server given a request. Retries if we think retrying might fix it. */ 33 | private def withRetries[T](f: () => HttpResponse[T], retries: Int = 10): HttpResponse[T] = if (retries <= 0) { 34 | f() 35 | } else { 36 | val sleepTime = random.nextInt(1000) + 2500 // sleep between 2.5 and 3.5 seconds 37 | // If something goes wrong, we sleep a random amount of time, to make sure that we don't slam 38 | // the server, get timeouts, wait for exactly the same amount of time on all threads, and then 39 | // slam the server again. 40 | 41 | Try(f()) match { 42 | case Failure(e: SocketTimeoutException) => 43 | logger.warn(s"$e while querying Grobid. $retries retries left.") 44 | Thread.sleep(sleepTime) 45 | withRetries(f, retries - 1) 46 | 47 | case Failure(e: IOException) => 48 | logger.warn(s"Got IOException '${e.getMessage}' while querying Grobid. $retries retries left.") 49 | Thread.sleep(sleepTime) 50 | withRetries(f, retries - 1) 51 | 52 | case Success(response) if response.isServerError => 53 | logger.warn(s"Got response code '${response.statusLine}' while querying Grobid. $retries retries left.") 54 | Thread.sleep(sleepTime) 55 | withRetries(f, retries - 1) 56 | 57 | case Failure(e) => throw e 58 | 59 | case Success(response) => response 60 | } 61 | } 62 | 63 | // Note: This is not thread safe if you have two threads or processes ask for the same file at 64 | // the same time. 
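  // A minimal usage sketch (the URL and value names are illustrative, not part of this file):
  //   val grobid = new CachedGrobidServer(new URL("http://localhost:8070"))
  //   val teiXml = Resource.using(grobid.getExtractions(pdfBytes)) { is => IOUtils.toString(is, "UTF-8") }
  // On a cache hit the Grobid server is never contacted; on a miss the response is gzipped into
  // the cache directory and the raw XML is returned.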
65 | def getExtractions(bytes: Array[Byte]): InputStream = { 66 | val paperId = Utilities.shaForBytes(bytes) 67 | 68 | val cacheFile = cacheDir.resolve(paperId + ".xml.gz") 69 | try { 70 | if (Files.size(cacheFile) == 0) 71 | throw new IOException(s"Paper $paperId is tombstoned") 72 | else 73 | new GZIPInputStream(Files.newInputStream(cacheFile)) 74 | } catch { 75 | case _: NoSuchFileException => 76 | logger.debug(s"Cache miss for $paperId") 77 | try { 78 | val response = withRetries { () => 79 | val multipart = MultiPart("input", s"$paperId.pdf", "application/octet-stream", bytes) 80 | Http(url + "/processFulltextDocument").timeout(60000, 60000).postMulti(multipart).asBytes 81 | } 82 | val bais = new ByteArrayInputStream(response.body) 83 | Resource.using(new GZIPOutputStream(Files.newOutputStream(cacheFile))) { os => 84 | IOUtils.copy(bais, os) 85 | } 86 | bais.reset() 87 | bais 88 | } catch { 89 | case NonFatal(e) => 90 | logger.warn(s"Tombstoning $paperId because of the following error:", e) 91 | Files.deleteIfExists(cacheFile) 92 | Files.createFile(cacheFile) 93 | throw e 94 | } 95 | } 96 | } 97 | } 98 | 99 | object CachedGrobidServer { 100 | val cacheDir = Files.createDirectories( 101 | Paths.get( 102 | System.getProperty("java.io.tmpdir"), 103 | this.getClass.getSimpleName.stripSuffix("$"))) 104 | } 105 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/GazetteerFromPMC.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | 5 | import org.allenai.common.ParIterator._ 6 | import scala.concurrent.ExecutionContext.Implicits.global 7 | import spray.json._ 8 | 9 | object GazetteerFromPMC extends App { 10 | case class GazetteerEntry(id: String, title: String, authors: Seq[String], year: Int) 11 | import DefaultJsonProtocol._ 12 | implicit val gazetteerEntryFormat = jsonFormat4(GazetteerEntry.apply) 13 | 14 | // We use the first 1k of this for testing, so let's drop 10k just to be sure. 15 | val labeledDataNotUsedForTesting = LabeledPapersFromPMC.get.drop(10000) 16 | 17 | val noneCount = new AtomicInteger() 18 | 19 | labeledDataNotUsedForTesting.parMap { lp => 20 | val ld = lp.labels 21 | (ld.title, ld.authors, ld.year) match { 22 | case (Some(title), Some(authors), Some(year)) => 23 | Some(GazetteerEntry(lp.paperId, title.replaceAll("\\s+", " "), authors.map(_.name), year)) 24 | case _ => 25 | noneCount.incrementAndGet() 26 | None 27 | } 28 | }.flatten.take(1000000).foreach { entry => 29 | println(entry.toJson) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/GrobidParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.InputStream 4 | import java.nio.file.Path 5 | import java.util.Calendar 6 | 7 | import org.allenai.common.StringUtils.StringExtras 8 | import org.allenai.scienceparse.{ Section => SPSection } 9 | import org.jsoup.Jsoup 10 | import org.jsoup.nodes.{Document, Element, TextNode} 11 | import org.jsoup.parser.{Parser => JsoupParser} 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | object GrobidParser { 16 | def addDot(x: String) = if (x.length == 1) s"$x." 
else x 17 | 18 | def author(e: Element): String = { 19 | val first = List(e.findText("persName>forename[type=first]")) 20 | val mids = e.select("persName>forename[type=middle]").asScala.map(_.text).toList 21 | val last = List(e.findText("persName>surname")) 22 | (first ++ mids ++ last).filter(!_.isEmpty).map(a => addDot(a.trimNonAlphabetic)).mkString(" ") 23 | } 24 | 25 | def extractTitle(doc: Element): String = { 26 | doc.findText("teiHeader>fileDesc>titleStmt>title").titleCase() 27 | } 28 | 29 | def toTitle(s: String) = { 30 | s.trimChars(",.").find(c => Character.isAlphabetic(c)) match { 31 | case None => "" 32 | case Some(_) => s 33 | } 34 | } 35 | 36 | def extractYear(str: String): Int = "\\d{4}".r.findFirstIn(str) match { 37 | case Some(y) => y.toInt 38 | case None => 0 39 | } 40 | 41 | def extractBibEntriesWithId(doc: Element) = 42 | for { 43 | bib <- doc.select("listBibl>biblStruct").asScala 44 | } yield { 45 | val title = toTitle(bib.findText("analytic>title[type=main]")) match { 46 | case "" => bib.findText("monogr>title") 47 | case s => s 48 | } 49 | val authors = bib.select("analytic>author").asScala.map(author).toList match { 50 | case List() => bib.select("monogr>author").asScala.map(author).toList 51 | case l => l 52 | } 53 | val venue = bib.findText("monogr>title") 54 | val yr = extractYear(bib.findAttributeValue("monogr>imprint>date[type=published]", "when")) 55 | new BibRecord(title, authors.asJava, venue, null, null, yr) 56 | } 57 | 58 | def ifNonEmpty(s: String) = if (s.nonEmpty) Some(s) else None 59 | 60 | case class Section(id: Option[String], header: Option[String], text: String) 61 | 62 | private def extractSectionInfo(div: Element) = { 63 | val bodyPlusHeaderText = div.text 64 | 65 | val head = div.select("head").asScala.headOption 66 | val (id, headerText, bodyTextOffset) = head match { 67 | case Some(h) => 68 | val hText = h.text 69 | ( 70 | ifNonEmpty(h.attr("n")), 71 | Some(hText), 72 | hText.size + bodyPlusHeaderText.drop(hText.size).takeWhile(_ <= ' ').size 73 | ) 74 | case None => 75 | (None, None, 0) 76 | } 77 | val section = Section(id = id, text = bodyPlusHeaderText.drop(bodyTextOffset), header = head.map(_.text)) 78 | (div, bodyPlusHeaderText, bodyTextOffset, section) 79 | } 80 | 81 | def extractReferenceMentions(doc: Element, sectionInfo: Iterable[(Element, String, Int, Section)]): List[CitationRecord] = { 82 | val bibMentions = 83 | for { 84 | ref <- doc.select("ref[type=bibr").asScala 85 | ((div, fullText, offset, _), sectionNumber) <- sectionInfo.zipWithIndex.find { 86 | case ((div, fullText, offset, _), i) => 87 | ref.parents.contains(div) 88 | } 89 | } yield { 90 | val id = ref.attr("target").dropWhile(_ == '#') 91 | val begin = ref.textOffset(div) - offset 92 | val end = begin + ref.text.length 93 | Parser.extractContext(0, fullText, begin, end) 94 | } 95 | bibMentions.toList 96 | } 97 | 98 | def parseGrobidXml(grobidExtraction: Path): ExtractedMetadata = { 99 | val doc = Jsoup.parse(grobidExtraction.toFile, "UTF-8") 100 | parseGrobidXml(doc) 101 | } 102 | 103 | def parseGrobidXml(is: InputStream, baseUrl: String): ExtractedMetadata = { 104 | val doc = Jsoup.parse(is, "UTF-8", baseUrl, JsoupParser.xmlParser()) 105 | parseGrobidXml(doc) 106 | } 107 | 108 | private def parseGrobidXml(doc: Document): ExtractedMetadata = { 109 | val year = extractYear(doc.findAttributeValue("teiHeader>fileDesc>sourceDesc>biblStruct>monogr>imprint>date[type=published]", "when")) 110 | val calendar = Calendar.getInstance() 111 | calendar.set(Calendar.YEAR, year) 112 | 
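    // (The Calendar above is only a year-to-java.util.Date adapter: ExtractedMetadata's
    // constructor takes a Date, but the publication year is the only date information pulled
    // from the TEI, and em.year below carries the same value as a plain Int.)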
113 | val sectionInfo = doc.select("text>body>div").asScala.map(extractSectionInfo) 114 | 115 | val em = new ExtractedMetadata(extractTitle(doc), doc.select("teiHeader>fileDesc>sourceDesc>biblStruct>analytic>author").asScala.map(author).asJava, calendar.getTime) 116 | em.year = year 117 | em.references = extractBibEntriesWithId(doc).asJava 118 | em.referenceMentions = extractReferenceMentions(doc, sectionInfo).asJava 119 | em.abstractText = doc.select("teiHeader>profileDesc>abstract").asScala.headOption.map(_.text).getOrElse("") 120 | 121 | em.sections = sectionInfo.map { case (_, _, _, grobidSection) => 122 | new SPSection( 123 | Seq(grobidSection.id, grobidSection.header).flatten.map(_.trim).mkString(" "), 124 | grobidSection.text) 125 | }.asJava 126 | 127 | em 128 | } 129 | 130 | implicit class JsoupElementsImplicits(e: Element) { 131 | 132 | def findText(path: String): String = 133 | e.select(path).asScala.headOption.map(_.text).getOrElse("") 134 | 135 | def findAttributeValue(path: String, attrName: String): String = 136 | e.select(path).asScala.headOption.map(_.attr(attrName)).getOrElse("") 137 | 138 | // The number of text characters in the ancestor that preceed the given element 139 | def textOffset(ancestor: Element): Int = { 140 | if (ancestor == e.parent) { 141 | val ancestorText = ancestor.text 142 | val elementText = e.text 143 | val index = ancestorText.indexOf(elementText) 144 | ancestorText.indexOf(elementText, index + 1) match { 145 | case -1 => // The common and easy case: Text only occurs once in the parent. 146 | index 147 | case _ => // Our text occurs multiple times in the parent. Bogus! 148 | // Count how many times it occurs previous to our element 149 | def countOccurencesIn(base: String) = { 150 | var count = 0 151 | var index = base.indexOf(elementText) 152 | while (index > 0) { 153 | count += 1 154 | index = base.indexOf(elementText, index + 1) 155 | } 156 | count 157 | } 158 | val precedingSiblingText = 159 | ancestor.childNodes.asScala.takeWhile(_ != e).map { 160 | case t: TextNode => t.getWholeText.trim() 161 | case e: Element => e.text 162 | case _ => "" 163 | } 164 | val precedingCount = precedingSiblingText.map(countOccurencesIn).sum 165 | // Now get the next occurrence of our text 166 | def nthIndexOf(base: String, n: Int) = { 167 | var i = 0 168 | var index = base.indexOf(elementText) 169 | while (i < n) { 170 | index = base.indexOf(elementText, index + 1) 171 | i += 1 172 | } 173 | index 174 | } 175 | nthIndexOf(ancestorText, precedingCount) 176 | } 177 | } else if (e.parent == null) { 178 | sys.error("Must specify an ancestor element to find text offset") 179 | } else { 180 | e.parent.textOffset(ancestor) + e.textOffset(e.parent) 181 | } 182 | } 183 | } 184 | 185 | implicit class StringImplicits2(val str: String) extends AnyVal with StringExtras { 186 | /** @return Given full name such as "Doe, John A.", returns the last name assuming 187 | * that it's the word before the comma. 
188 | */ 189 | def lastNameFromFull(): String = str.trim.takeWhile(_ != ',') 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/InterleavingIterator.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | class InterleavingIterator[T](inners: Iterator[T]*) extends Iterator[T] { 4 | override def hasNext = inners.exists(_.hasNext) 5 | 6 | private var index = 0 7 | private def bumpIndex(): Unit = { 8 | index += 1 9 | index %= inners.size 10 | } 11 | 12 | while(!inners(index).hasNext) 13 | bumpIndex() 14 | 15 | private def moveToNextIndex(): Unit = { 16 | require(hasNext) 17 | bumpIndex() 18 | while(!inners(index).hasNext) 19 | bumpIndex() 20 | } 21 | 22 | override def next() = { 23 | require(inners(index).hasNext) 24 | val result = inners(index).next() 25 | if(hasNext) 26 | moveToNextIndex() 27 | result 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/JsonProtocol.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.util.regex.Pattern 4 | import scala.collection.JavaConverters._ 5 | import spray.json._ 6 | 7 | object JsonProtocol extends DefaultJsonProtocol { 8 | import java.util.{ List => JavaList } 9 | 10 | private def expected(name: String) = throw DeserializationException(s"Expected $name") 11 | 12 | private def optional[T >: Null](obj: JsValue)(implicit format: JsonFormat[T]): T = 13 | obj.convertTo[Option[T]].orNull 14 | 15 | implicit def javaListJsonFormat[T: JsonFormat]: RootJsonFormat[JavaList[T]] = 16 | new RootJsonFormat[JavaList[T]] { 17 | override def write(list: JavaList[T]): JsValue = 18 | JsArray(list.asScala.map(_.toJson): _*) 19 | 20 | override def read(json: JsValue): JavaList[T] = json match { 21 | case JsArray(values) => values.map { value => value.convertTo[T] }.toList.asJava 22 | case _ => expected("List<>") 23 | } 24 | } 25 | 26 | implicit object PatternJsonFormat extends RootJsonFormat[Pattern] { 27 | override def write(pattern: Pattern): JsValue = JsString(pattern.pattern()) 28 | 29 | override def read(json: JsValue): Pattern = json match { 30 | case JsString(p) => Pattern.compile(p) 31 | case _ => expected("Pattern") 32 | } 33 | } 34 | 35 | implicit object ExtractedMetadataSourceJsonFormat extends RootJsonFormat[ExtractedMetadata.Source] { 36 | override def write(source: ExtractedMetadata.Source): JsValue = { 37 | JsString(source.name()) 38 | } 39 | 40 | override def read(json: JsValue): ExtractedMetadata.Source = { 41 | json match { 42 | case JsString(name) => ExtractedMetadata.Source.valueOf(name) 43 | case _ => expected("ExtractedMetadata.Source") 44 | } 45 | } 46 | } 47 | 48 | implicit object SectionJsonFormat extends RootJsonFormat[Section] { 49 | override def write(section: Section): JsValue = JsObject( 50 | "heading" -> Option(section.heading).toJson, 51 | "text" -> section.text.toJson 52 | ) 53 | 54 | override def read(json: JsValue): Section = json.asJsObject.getFields("heading", "text") match { 55 | case Seq(heading, JsString(text)) => 56 | new Section( 57 | optional[String](heading), 58 | text) 59 | case _ => expected("Section") 60 | } 61 | } 62 | 63 | implicit object BibRecordJsonFormat extends RootJsonFormat[BibRecord] { 64 | override def write(bibRecord: BibRecord) = JsObject( 65 | "title" -> 
Option(bibRecord.title).toJson, 66 | "author" -> bibRecord.author.toJson, 67 | "venue" -> Option(bibRecord.venue).toJson, 68 | "citeRegEx" -> Option(bibRecord.citeRegEx).toJson, 69 | "shortCiteRegEx" -> Option(bibRecord.shortCiteRegEx).toJson, 70 | "year" -> bibRecord.year.toJson 71 | ) 72 | 73 | override def read(json: JsValue): BibRecord = json.asJsObject.getFields( 74 | "title", 75 | "author", 76 | "venue", 77 | "citeRegEx", 78 | "shortCiteRegEx", 79 | "year" 80 | ) match { 81 | case Seq( 82 | title, 83 | author, 84 | venue, 85 | citeRegEx, 86 | shortCiteRegEx, 87 | JsNumber(year) 88 | ) => 89 | new BibRecord( 90 | optional[String](title), 91 | author.convertTo[JavaList[String]], 92 | optional[String](venue), 93 | optional[Pattern](citeRegEx), 94 | optional[Pattern](shortCiteRegEx), 95 | year.intValue() 96 | ) 97 | case _ => expected("BibRecord") 98 | } 99 | } 100 | 101 | implicit object CitationRecordJsonFormat extends RootJsonFormat[CitationRecord] { 102 | override def write(cr: CitationRecord): JsValue = JsObject( 103 | "referenceID" -> cr.referenceID.toJson, 104 | "context" -> cr.context.toJson, 105 | "startOffset" -> cr.startOffset.toJson, 106 | "endOffset" -> cr.endOffset.toJson 107 | ) 108 | 109 | override def read(json: JsValue): CitationRecord = json.asJsObject.getFields( 110 | "referenceID", 111 | "context", 112 | "startOffset", 113 | "endOffset" 114 | ) match { 115 | case Seq( 116 | JsNumber(referenceID), 117 | JsString(context), 118 | JsNumber(startOffset), 119 | JsNumber(endOffset) 120 | ) => new CitationRecord(referenceID.toInt, context, startOffset.toInt, endOffset.toInt) 121 | case _ => expected("CitationRecord") 122 | } 123 | } 124 | 125 | implicit object ExtractedMetadataJsonFormat extends RootJsonFormat[ExtractedMetadata] { 126 | override def write(em: ExtractedMetadata): JsValue = JsObject( 127 | "source" -> Option(em.source).toJson, 128 | "title" -> Option(em.title).toJson, 129 | "authors" -> em.authors.toJson, 130 | "emails" -> em.emails.toJson, 131 | "sections" -> Option(em.sections).toJson, 132 | "references" -> Option(em.references).toJson, 133 | "referenceMentions" -> Option(em.referenceMentions).toJson, 134 | "year" -> em.year.toJson, 135 | "abstractText" -> Option(em.abstractText).toJson, 136 | "creator" -> Option(em.creator).toJson 137 | ) 138 | 139 | override def read(json: JsValue): ExtractedMetadata = json.asJsObject.getFields( 140 | "source", 141 | "title", 142 | "authors", 143 | "emails", 144 | "sections", 145 | "references", 146 | "referenceMentions", 147 | "year", 148 | "abstractText", 149 | "creator" 150 | ) match { 151 | case Seq( 152 | source, 153 | title, 154 | authors, 155 | emails, 156 | sections, 157 | references, 158 | referenceMentions, 159 | JsNumber(year), 160 | abstractText, 161 | creator 162 | ) => 163 | val em = new ExtractedMetadata( 164 | optional[String](title), 165 | authors.convertTo[JavaList[String]], 166 | null) 167 | em.source = optional[ExtractedMetadata.Source](source) 168 | em.emails = emails.convertTo[JavaList[String]] 169 | em.sections = optional[JavaList[Section]](sections) 170 | em.references = optional[JavaList[BibRecord]](references) 171 | em.referenceMentions = optional[JavaList[CitationRecord]](referenceMentions) 172 | em.year = year.intValue() 173 | em.abstractText = optional[String](abstractText) 174 | em.creator = optional[String](creator) 175 | em 176 | case _ => expected("ExtractedMetadata") 177 | } 178 | } 179 | 180 | // Some formats for LabeledData 181 | implicit val authorFormat = 
jsonFormat3(LabeledData.Author) 182 | implicit val sectionFormat = jsonFormat2(LabeledData.Section) 183 | implicit val referenceFormat = jsonFormat7(LabeledData.Reference) 184 | implicit val rangeFormat = jsonFormat2(LabeledData.Range) 185 | implicit val mentionFormat = jsonFormat3(LabeledData.Mention) 186 | implicit val labeledDataFormat = jsonFormat9(LabeledData.apply) 187 | } 188 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/PrintCRFInput.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.awt.Desktop 4 | import java.io.{ PrintWriter, File } 5 | 6 | import org.allenai.common.Resource 7 | import org.allenai.scienceparse.pdfapi.{ PDFFontMetrics, PDFExtractor } 8 | import org.apache.commons.lang3.StringEscapeUtils 9 | import scopt.OptionParser 10 | import scala.collection.JavaConverters._ 11 | 12 | object PrintCRFInput extends App { 13 | case class Config( 14 | paperDir: Option[File] = None, 15 | paperId: String = null 16 | ) 17 | 18 | val parser = new OptionParser[Config](this.getClass.getSimpleName) { 19 | opt[File]('d', "paperDir") action { (d, c) => 20 | c.copy(paperDir = Some(d)) 21 | } text "The directory that contains the papers" 22 | 23 | arg[String]("") required () action { (p, c) => 24 | c.copy(paperId = p) 25 | } text "The ID of the paper whose CRF input you want to see" 26 | } 27 | 28 | parser.parse(args, Config()).foreach { config => 29 | val paperSource = config.paperDir.map(new DirectoryPaperSource(_)).getOrElse { 30 | PaperSource.getDefault 31 | } 32 | 33 | val seq = Resource.using(paperSource.getPdf(config.paperId)) { is => 34 | val ext = new PDFExtractor 35 | val doc = ext.extractFromInputStream(is) 36 | PDFToCRFInput.getSequence(doc).asScala 37 | } 38 | 39 | // make a font-to-color map 40 | def font2style(fm: PDFFontMetrics) = f"font${fm.hashCode()}%x" 41 | val fonts = seq.map(_.getPdfToken.fontMetrics).toSet.map(font2style) 42 | val colors = Stream.from(1). 43 | map { n => (n * 0.61803398875 * 360).round % 360 }. 44 | map { hue => s"hsl($hue, 90%%, 85%%)" } 45 | val font2color = (fonts zip colors).toMap 46 | 47 | val tempFile = File.createTempFile(s"CRFInput-${config.paperId}.", ".html") 48 | tempFile.deleteOnExit() 49 | try { 50 | Resource.using(new PrintWriter(tempFile, "UTF-8")) { out => 51 | out.println("") 52 | out.println("") 53 | out.println(s"CRF input for ${config.paperId}") 54 | out.println("") 60 | out.println("") 61 | out.println("") 62 | var line = 0 63 | var page = 0 64 | seq.foreach { token => 65 | if (token.getPage != page) { 66 | out.println("


") 67 | line = 0 68 | page = token.getPage 69 | } else if (token.getLine != line) { 70 | out.println("
") 71 | line = token.getLine 72 | } 73 | 74 | val style = font2style(token.getPdfToken.fontMetrics) 75 | val escaped = StringEscapeUtils.escapeHtml4(token.getPdfToken.token) 76 | out.println(s"$escaped") 77 | } 78 | out.println("") 79 | out.println("") 80 | } 81 | 82 | Desktop.getDesktop.browse(tempFile.toURI) 83 | Thread.sleep(5000) 84 | } finally { 85 | tempFile.delete() 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/PrintFeaturizedCRFInput.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.{ DataInputStream, File } 4 | import java.nio.file.Files 5 | import java.util 6 | 7 | import com.gs.collections.api.map.primitive.ObjectDoubleMap 8 | import org.allenai.common.Resource 9 | import org.allenai.scienceparse.pdfapi.PDFExtractor 10 | import scopt.OptionParser 11 | 12 | import scala.collection.JavaConverters._ 13 | 14 | object PrintFeaturizedCRFInput extends App { 15 | case class Config( 16 | paperDir: Option[File] = None, 17 | modelFile: Option[File] = None, 18 | paperId: String = null 19 | ) 20 | 21 | val parser = new OptionParser[Config](this.getClass.getSimpleName) { 22 | opt[File]('d', "paperDir") action { (d, c) => 23 | c.copy(paperDir = Some(d)) 24 | } text "The directory that contains the papers" 25 | 26 | opt[File]('m', "model") action { (m, c) => 27 | c.copy(modelFile = Some(m)) 28 | } text "A model to load LM feature values from" 29 | 30 | arg[String]("") required () action { (p, c) => 31 | c.copy(paperId = p) 32 | } text "The ID of the paper whose CRF input you want to see" 33 | } 34 | 35 | parser.parse(args, Config()).foreach { config => 36 | val paperSource = config.paperDir.map(new DirectoryPaperSource(_)).getOrElse { 37 | PaperSource.getDefault 38 | } 39 | 40 | val predExtractor = { 41 | val modelPath = config.modelFile.map(_.toPath).getOrElse(Parser.getDefaultProductionModel) 42 | Resource.using(new DataInputStream(Files.newInputStream(modelPath))) { dis => 43 | Parser.loadModelComponents(dis).predExtractor 44 | } 45 | } 46 | 47 | val seq = Resource.using(paperSource.getPdf(config.paperId)) { is => 48 | val ext = new PDFExtractor 49 | val doc = ext.extractFromInputStream(is) 50 | PDFToCRFInput.getSequence(doc) 51 | } 52 | 53 | val paddedSeq = PDFToCRFInput.padSequence(seq).asScala.toSeq 54 | 55 | val lines = stringsFromFeaturizedSeq(predExtractor.nodePredicates(paddedSeq.asJava)) 56 | 57 | lines.asScala.foreach(println) 58 | } 59 | 60 | def stringsFromFeaturizedSeq( 61 | featurizedJava: util.List[ObjectDoubleMap[String]], 62 | prefix: String = "" 63 | ) = { 64 | // do a complicated dance to map from GS collections to Scala collections 65 | val featurized = featurizedJava.asScala.map { gsMap => 66 | gsMap.keySet().asScala.map { key => key -> gsMap.get(key) }.toMap 67 | }.toSeq 68 | 69 | // token feature is special 70 | val tokenFeaturePrefix = "%t=" 71 | 72 | // figure out binary features 73 | val feature2values = featurized.flatten.foldLeft(Map.empty[String, Set[Double]]) { 74 | case (acc, (key, value)) => acc.updated(key, acc.getOrElse(key, Set[Double]()) + value) 75 | } 76 | val binaryFeatures = feature2values. 77 | filter(_._2 subsetOf Set(0.0, 1.0)). 78 | keys. 79 | filterNot(_.startsWith(tokenFeaturePrefix)). 80 | toSet 81 | 82 | // figure out an order for non-binary features 83 | val orderedNonBinaryFeatures = featurized. 84 | flatMap(_.keys). 
85 | filterNot(binaryFeatures). 86 | filterNot(_.startsWith(tokenFeaturePrefix)). 87 | groupBy(identity). 88 | mapValues(_.size). 89 | toSeq.sortBy { case (feature, count) => (-count, feature) }. 90 | map(_._1) 91 | 92 | // write header 93 | val header = (tokenFeaturePrefix +: orderedNonBinaryFeatures).mkString("\t") 94 | 95 | // write entries 96 | val body = featurized.zipWithIndex.map { 97 | case (features, index) => 98 | ( 99 | // token feature 100 | Seq( 101 | features.filter(_._1.startsWith(tokenFeaturePrefix)).map { case (key, value) => s"$key=$value" }.mkString("/") 102 | ) ++ 103 | 104 | // non-binary features 105 | orderedNonBinaryFeatures.map { f => features.get(f).map(d => f"$d%.3f").getOrElse("") } ++ 106 | 107 | // binary features 108 | (features.keySet & binaryFeatures).toSeq.sorted 109 | ).mkString("\t") 110 | } 111 | 112 | val result = header +: body 113 | 114 | if (prefix.isEmpty) { 115 | result.asJava 116 | } else { 117 | result.zipWithIndex.map { case (line, i) => f"$prefix\t$i%04d\t$line" }.asJava 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/S2PaperSource.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.{BufferedInputStream, IOException} 4 | import java.net.SocketTimeoutException 5 | import java.nio.file.{Files, Path, StandardCopyOption} 6 | 7 | import org.allenai.common.Logging 8 | 9 | import scala.util.control.NonFatal 10 | import scala.util.{Failure, Random, Success, Try} 11 | import scalaj.http.{Http, HttpResponse} 12 | 13 | object S2PaperSource extends PaperSource with Logging { 14 | 15 | private val random = new Random 16 | /** Gets a response from an HTTP server given a request. Retries if we think retrying might fix it. */ 17 | private def withRetries[T](f: () => HttpResponse[T], retries: Int = 10): T = if (retries <= 0) { 18 | val result = f() 19 | if(result.isSuccess) 20 | result.body 21 | else 22 | throw new IOException(s"Got error ${result.code} (${result.statusLine}) from S2 server") 23 | } else { 24 | val sleepTime = random.nextInt(1000) + 2500 // sleep between 2.5 and 3.5 seconds 25 | // If something goes wrong, we sleep a random amount of time, to make sure that we don't slam 26 | // the server, get timeouts, wait for exactly the same amount of time on all threads, and then 27 | // slam the server again. 28 | 29 | Try(f()) match { 30 | case Failure(e: SocketTimeoutException) => 31 | logger.warn(s"$e while querying S2. $retries retries left.") 32 | Thread.sleep(sleepTime) 33 | withRetries(f, retries - 1) 34 | 35 | case Failure(e: IOException) => 36 | logger.warn(s"Got IOException '${e.getMessage}' while querying S2. $retries retries left.") 37 | Thread.sleep(sleepTime) 38 | withRetries(f, retries - 1) 39 | 40 | case Success(response) if response.isServerError => 41 | logger.warn(s"Got response code '${response.statusLine}' while querying S2. $retries retries left.") 42 | Thread.sleep(sleepTime) 43 | withRetries(f, retries - 1) 44 | 45 | case Failure(e) => throw e 46 | 47 | case Success(response) => response.body 48 | } 49 | } 50 | 51 | override def getPdf(paperId: String) = { 52 | val key = paperId.take(4) + "/" + paperId.drop(4) + ".pdf" 53 | 54 | // We download to a temp file first. If we gave out an InputStream that comes directly from 55 | // S3, it would time out if the caller of this function reads the stream too slowly. 
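    // (Editor's illustrative note, not part of the original file: the key built above turns a
    // 40-character paper ID, say the hypothetical "ab12ee99...", into "ab12/ee99....pdf", so the
    // request below goes to https://pdfs.semanticscholar.org/ab12/ee99....pdf; the first four
    // characters of the ID become a directory prefix.)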
56 | val tempFile = withRetries { () => 57 | Http(s"https://pdfs.semanticscholar.org/$key").timeout(30000, 30000).execute { is => 58 | val result = Files.createTempFile(paperId + ".", ".paper.pdf") 59 | try { 60 | Files.copy(is, result, StandardCopyOption.REPLACE_EXISTING) 61 | result 62 | } catch { 63 | case NonFatal(e) => 64 | Files.deleteIfExists(result) 65 | throw e 66 | } 67 | } 68 | } 69 | tempFile.toFile.deleteOnExit() 70 | new BufferedInputStream(Files.newInputStream(tempFile)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/StringUtils.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import org.allenai.common.{ StringUtils => CommonStringUtils } 4 | 5 | object StringUtils { 6 | import CommonStringUtils.StringImplicits 7 | 8 | def normalize(s: String) = s.normalize.replaceFancyUnicodeChars.removeUnprintable.replace('ı', 'i') 9 | 10 | def makeSingleLine(s: String) = s.replaceAll("\\n", "\\\\n").replaceAll("\\r", "\\\\r") 11 | 12 | 13 | /** Splits a name into first and last names */ 14 | def splitName(name: String) = { 15 | val suffixes = Set("Jr.", "Sr.", "II", "III") 16 | val lastNamePrefixes = Set("van", "da", "von") 17 | 18 | val parts = name.split("\\s", -1) 19 | 20 | if(parts.length <= 1) { 21 | ("", name) 22 | } else { 23 | var lastNameIndex = parts.length - 1 24 | def skipToNonemptyPart() = 25 | while(lastNameIndex > 0 && parts(lastNameIndex).isEmpty) 26 | lastNameIndex -= 1 27 | def skipToRightAfterNonemptyPart() = 28 | while(lastNameIndex > 1 && parts(lastNameIndex - 1).isEmpty) 29 | lastNameIndex -= 1 30 | 31 | // move to the first non-empty part 32 | skipToNonemptyPart() 33 | 34 | // deal with suffixes 35 | if(lastNameIndex > 0 && suffixes.contains(parts(lastNameIndex))) 36 | lastNameIndex -= 1 37 | skipToNonemptyPart() 38 | 39 | // deal with last name prefixes 40 | skipToRightAfterNonemptyPart() 41 | if(lastNameIndex > 1 && lastNamePrefixes.contains(parts(lastNameIndex - 1))) 42 | lastNameIndex -= 1 43 | skipToRightAfterNonemptyPart() 44 | 45 | (parts.take(lastNameIndex).mkString(" "), parts.drop(lastNameIndex).mkString(" ")) 46 | } 47 | } 48 | 49 | def getFirstName(name: String) = splitName(name)._1 50 | def getLastName(name: String) = splitName(name)._2 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/Training.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.io.{ FileInputStream, File } 4 | 5 | import com.gs.collections.impl.set.mutable.UnifiedSet 6 | import org.allenai.common.{ Resource, Logging } 7 | import org.allenai.datastore.Datastores 8 | import org.allenai.scienceparse.Parser.ParseOpts 9 | import scopt.OptionParser 10 | 11 | import scala.io.Source 12 | import scala.collection.JavaConverters._ 13 | 14 | object Training extends App with Datastores with Logging { 15 | // The Files are all Option[File] defaulting to None. Properly, they should be set to the 16 | // defaults from the datastore, but if we do that here, they will download several gigabytes 17 | // of files during startup, even if they are unused later. 
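An aside on StringUtils.splitName from the file just above: the suffix and prefix handling is easiest to see on concrete names. A minimal usage sketch (editor-added; the object name SplitNameExamples is made up, and the commented results are what the heuristics above should produce as written, not documented output):

import org.allenai.scienceparse.StringUtils

object SplitNameExamples extends App {
  // (first name, last name) pairs returned by splitName
  println(StringUtils.splitName("Georgiana Dinu"))          // ("Georgiana", "Dinu")
  println(StringUtils.splitName("Ludwig van Beethoven"))    // ("Ludwig", "van Beethoven"): "van" stays with the last name
  println(StringUtils.splitName("Martin Luther King Jr."))  // ("Martin Luther", "King Jr."): the suffix stays with the last name
  println(StringUtils.splitName("Plato"))                   // ("", "Plato"): a single token is treated as a last name
}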
18 | case class Config( 19 | output: File = null, 20 | maxHeaderWords: Int = Parser.MAXHEADERWORDS, 21 | maxIterations: Int = 1000, 22 | backgroundSampleDocs: Int = 40000, 23 | backgroundDirectory: Option[File] = None, 24 | gazetteerFile: Option[File] = None, 25 | trainFraction: Double = 0.9, 26 | minYear: Int = 2008, 27 | maxPaperCount: Int = 34000, 28 | excludeIdsFile: Option[File] = None, 29 | minExpectedFeatureCount: Int = 13, 30 | trainingData: Iterator[LabeledPaper] = LabeledPapersFromDBLP.get 31 | ) 32 | 33 | val parser = new OptionParser[Config](this.getClass.getSimpleName) { 34 | head("Options that are not specified default to the settings that were used to make the production model.") 35 | 36 | opt[File]('o', "output") required () action { (o, c) => 37 | c.copy(output = o) 38 | } text "The output file" 39 | 40 | opt[Int]("maxHeaderWords") action { (m, c) => 41 | c.copy(maxHeaderWords = m) 42 | } text "Specifies the maximum number of words to use for the header if we don't have any other information about where the header ends" 43 | 44 | opt[Int]("maxIterations") action { (i, c) => 45 | c.copy(maxIterations = i) 46 | } text "Maximum number of iterations during training" 47 | 48 | opt[Int]("backgroundSampleDocs") action { (d, c) => 49 | c.copy(backgroundSampleDocs = d) 50 | } text "The number of documents to use to build the background language model" 51 | 52 | opt[File]("backgroundDirectory") action { (d, c) => 53 | c.copy(backgroundDirectory = Some(d)) 54 | } text "The directory in which the background documents are found" 55 | 56 | opt[File]('g', "gazetteerFile") action { (f, c) => 57 | c.copy(gazetteerFile = Some(f)) 58 | } text "The gazetteer file" 59 | 60 | opt[Double]("trainFraction") action { (f, c) => 61 | c.copy(trainFraction = f) 62 | } text "The fraction of the ground truth to use for training" 63 | 64 | opt[Int]("minYear") action { (y, c) => 65 | c.copy(minYear = y) 66 | } text "The earliest published year we're willing to consider" 67 | 68 | opt[Int]('c', "maxPaperCount") action { (p, c) => 69 | c.copy(maxPaperCount = p) 70 | } text "The maximum number of labeled documents to consider" 71 | 72 | opt[File]("excludeIdsFile") action { (e, c) => 73 | c.copy(excludeIdsFile = Some(e)) 74 | } text "A file with paper IDs to exclude, one per line. We always exclude the papers from the evaluation set." 75 | 76 | opt[Int]("minExpectedFeatureCount") action { (n, c) => 77 | c.copy(minExpectedFeatureCount = n) 78 | } text "The minimum number of times we should see a feature before accepting it." 79 | 80 | opt[Unit]("trainOnDBLP") action { (_, c) => 81 | c.copy(trainingData = LabeledPapersFromDBLP.get) 82 | } text "Train with data from DBLP" 83 | 84 | opt[Unit]("trainOnPMC") action { (_, c) => 85 | c.copy(trainingData = LabeledPapersFromPMC.getCleaned.drop(10000)) 86 | // Drop 10000 because we test on those. 
87 | } text "Train with data from PMC" 88 | 89 | opt[Unit]("trainOnBoth") action { (_, c) => 90 | c.copy(trainingData = new InterleavingIterator(LabeledPapersFromPMC.getCleaned, LabeledPapersFromDBLP.get)) 91 | } text "Train with data from DBLP and PMC" 92 | 93 | help("help") text "Prints help text" 94 | } 95 | 96 | parser.parse(args, Config()).foreach { config => 97 | val opts = new ParseOpts 98 | opts.modelFile = config.output.toString 99 | opts.headerMax = config.maxHeaderWords 100 | opts.iterations = config.maxIterations 101 | opts.threads = Runtime.getRuntime.availableProcessors() * 2 102 | opts.backgroundSamples = config.backgroundSampleDocs 103 | 104 | val backgroundDirectory = 105 | config.backgroundDirectory.getOrElse(publicDirectory("productionBackgroundDocs", 1).toFile) 106 | opts.backgroundDirectory = backgroundDirectory.toString 107 | 108 | val gazetteerFile = config.gazetteerFile.getOrElse(Parser.getDefaultGazetteer.toFile) 109 | opts.gazetteerFile = gazetteerFile.toString 110 | 111 | opts.trainFraction = config.trainFraction 112 | opts.checkAuthors = true 113 | opts.minYear = config.minYear 114 | opts.documentCount = config.maxPaperCount 115 | opts.minExpectedFeatureCount = config.minExpectedFeatureCount 116 | 117 | val excludedIds = Evaluation.goldDocIds ++ config.excludeIdsFile.map { excludedIdsFile => 118 | Resource.using(Source.fromFile(excludedIdsFile)) { source => 119 | source.getLines().map(_.trim) 120 | }.toSet 121 | }.getOrElse(Set.empty) 122 | 123 | val labeledData = config.trainingData.asJava 124 | 125 | Parser.trainParser( 126 | labeledData, 127 | opts, 128 | UnifiedSet.newSet(excludedIds.toIterable.asJava) 129 | ) 130 | 131 | logger.info(s"New model at ${opts.modelFile}") 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/Utilities.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.security.MessageDigest 4 | 5 | object Utilities { 6 | private val sha1HexLength = 40 7 | def toHex(bytes: Array[Byte]): String = { 8 | val sb = new scala.collection.mutable.StringBuilder(sha1HexLength) 9 | bytes.foreach { byte => sb.append(f"$byte%02x") } 10 | sb.toString 11 | } 12 | 13 | def shaForBytes(bytes: Array[Byte]): String = { 14 | val digest = MessageDigest.getInstance("SHA-1") 15 | digest.reset() 16 | digest.update(bytes) 17 | toHex(digest.digest()) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/pipeline/Bucketizers.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pipeline 2 | 3 | import scala.io.Source 4 | 5 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to 6 | * anticipate how well the pipeline will work with the output from science-parse. */ 7 | object Bucketizers { 8 | import Normalizers._ 9 | 10 | /** This file contains 225 high-frequency n-grams from title prefixes. 11 | * High means the S2 * Dblp bucket size is > 1M. (Early Sept. 2015) 12 | * n is 2, 3, 4, 5. 
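   * Editor's note (added for clarity, not in the original source): each line of that TSV is
   * expected to be "<bucket><TAB><frequency>", which loadHighFreqs below splits on the tab, so a
   * hypothetical row such as "analysis_of_the<TAB>1500000" would map the bucket
   * "analysis_of_the" to the count 1500000.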
13 | */ 14 | val highFreqTitleNgramStream = this.getClass.getResourceAsStream("/org/allenai/scienceparse/pipeline/highfreq.tsv") 15 | 16 | val highFreqNameNgramStream = this.getClass.getResourceAsStream("/org/allenai/scienceparse/pipeline/highfreqNames.tsv") 17 | 18 | def loadHighFreqs(is: java.io.InputStream): Map[String, Int] = 19 | Source.fromInputStream(is).getLines.map { l => 20 | val Array(t, f) = l.split("\t") 21 | t -> f.toInt 22 | }.toMap 23 | 24 | lazy val highFreqTitleNgrams = loadHighFreqs(highFreqTitleNgramStream) 25 | 26 | lazy val highFreqNameNgrams = loadHighFreqs(highFreqTitleNgramStream) // This looks like a typo, but I copied it this way from the pipeline. 27 | 28 | val defaultTitleCutoffThreshold = 1000000 29 | 30 | val defaultNameCutoffThreshold = 100000 31 | 32 | val concatChar = "_" 33 | 34 | def toBucket(words: Iterable[String]) = words.mkString(concatChar) 35 | 36 | def toBucket(s: String) = s.split(" ").mkString(concatChar) 37 | 38 | val defaultTitleNgramLength = 3 39 | 40 | val defaultNameNgramLength = 2 41 | 42 | val defaultAllowTruncated = true 43 | 44 | val defaultUpto = 1 45 | 46 | def cutoffFilter(b: String, cutoffOption: Option[Int], highFreqs: Map[String, Int]): Boolean = 47 | cutoffOption.isEmpty || !highFreqs.contains(b) || highFreqs(b) < cutoffOption.get 48 | 49 | /** Return the array of tokens for the given input. 50 | * Limit number of tokens to maxCount 51 | */ 52 | def words(text: String, maxCount: Int = 40): Array[String] = { 53 | val words = alphaNumericNormalize(text).split(' ').filter(_.nonEmpty) 54 | words.take(maxCount) 55 | } 56 | 57 | /** Returns a list of ngrams. 58 | * If cutoff is specified, continue to add more words until the result has frequency 59 | * lower than the cutoff value. 60 | * If allowTruncated is set to true, accept ngrams that have length less than n. 61 | * For example, if the text is "local backbones" and n = 3, we will generate 62 | * the ngram "local_backbones". 
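   * Editor's example (added, not from the pipeline code): with the defaults below,
   *   ngrams("local backbones", n = 3, cutoffOption = None)
   * slides a three-word window over the two available words, keeps the single truncated
   * window because allowTruncated is true, and yields an Iterator containing just
   * "local_backbones".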
63 | */ 64 | def ngrams( 65 | text: String, 66 | n: Int, 67 | cutoffOption: Option[Int], 68 | allowTruncated: Boolean = defaultAllowTruncated, 69 | highFreqs: Map[String, Int] = highFreqTitleNgrams, 70 | upto: Int = defaultUpto 71 | ): Iterator[String] = ngramAux(words(text), n, cutoffOption, allowTruncated, highFreqs, upto) 72 | 73 | def tailNgrams( 74 | text: String, 75 | n: Int, 76 | cutoffOption: Option[Int], 77 | allowTruncated: Boolean = defaultAllowTruncated, 78 | highFreqs: Map[String, Int] = highFreqTitleNgrams, 79 | upto: Int = defaultUpto 80 | ) = ngramAux(words(text).reverse, n, cutoffOption, allowTruncated, highFreqs, upto) 81 | 82 | def ngramAux( 83 | chunks: Array[String], 84 | n: Int, 85 | cutoffOption: Option[Int], 86 | allowTruncated: Boolean, 87 | highFreqs: Map[String, Int], 88 | upto: Int 89 | ): Iterator[String] = { 90 | chunks.sliding(n) 91 | .filter(x => (allowTruncated && x.nonEmpty) || x.length == n) 92 | .map(x => toBucket(x.toIterable)) 93 | .filter(cutoffFilter(_, cutoffOption, highFreqs)) 94 | .take(upto) 95 | } 96 | 97 | def titleNgrams(title: String, upto: Int, allowTruncated: Boolean = defaultAllowTruncated) = { 98 | ngrams( 99 | title, 100 | n = defaultTitleNgramLength, 101 | cutoffOption = Some(defaultTitleCutoffThreshold), 102 | upto = upto, 103 | allowTruncated = allowTruncated 104 | ) 105 | } 106 | 107 | def titleTailNgrams(title: String, upto: Int = 1, allowTruncated: Boolean = defaultAllowTruncated) = { 108 | tailNgrams( 109 | title, 110 | n = defaultTitleNgramLength, 111 | cutoffOption = Some(defaultTitleCutoffThreshold), 112 | upto = upto, 113 | allowTruncated = allowTruncated 114 | ) 115 | } 116 | 117 | def nameNgrams(name: String) = ngrams( 118 | name, 119 | n = defaultNameNgramLength, 120 | allowTruncated = false, 121 | cutoffOption = Some(defaultNameCutoffThreshold), 122 | highFreqs = highFreqNameNgrams, 123 | upto = 3 124 | ) 125 | 126 | /** This is used in V1. */ 127 | def simple3TitlePrefix(text: String): List[String] = 128 | ngrams(text, n = 3, cutoffOption = None, allowTruncated = true, highFreqTitleNgrams, upto = 1).toList 129 | } 130 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/pipeline/Normalizers.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pipeline 2 | 3 | import java.text.Normalizer 4 | 5 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to 6 | * anticipate how well the pipeline will work with the output from science-parse. 
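  * Editor's examples (added for clarity, not in the original source), as the helpers below
  * should behave:
  *   alphaNumericNormalize("Déjà-Vu  Networks!")   // "deja vu networks"
  *   soundex("Smith")                              // "s530"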
*/ 7 | object Normalizers { 8 | def removeDiacritics(s: String): String = 9 | "\\p{InCombiningDiacriticalMarks}+".r 10 | .replaceAllIn(Normalizer.normalize(s, Normalizer.Form.NFD), "") 11 | 12 | def removePunctuation(s: String): String = 13 | s.replaceAll("\\p{P}", " ") 14 | 15 | def removeNonAphanumeric(s: String): String = 16 | s.replaceAll("[^A-Za-z0-9]", " ") 17 | 18 | def implodeSpaces(s: String) = " +".r.replaceAllIn(s.trim, " ") 19 | 20 | def removeSpaces(s: String) = " +".r.replaceAllIn(s, "") 21 | 22 | def normalize(s: String): String = 23 | implodeSpaces(removePunctuation(removeDiacritics(s.toLowerCase))) 24 | 25 | def alphaNumericNormalize(s: String): String = 26 | implodeSpaces(removeNonAphanumeric(removeDiacritics(s.toLowerCase))) 27 | 28 | def alphaNumericNormalizeNoSpaces(s: String): String = 29 | removeSpaces(removeNonAphanumeric(removeDiacritics(s.toLowerCase))) 30 | 31 | def strictNormalize(s: String): String = s.toLowerCase.replaceAll("[^a-z]", "") 32 | 33 | def soundexWord(word: String): String = { 34 | val s = strictNormalize(word) 35 | if (s.isEmpty) return "" 36 | s.head + (s.substring(1) 37 | .replaceAll("[hw]", "") 38 | .replaceAll("[bfpv]", "1") 39 | .replaceAll("[cgjkqsxz]", "2") 40 | .replaceAll("[dt]", "3") 41 | .replaceAll("l", "4") 42 | .replaceAll("[mn]", "5") 43 | .replaceAll("r", "6") 44 | .replaceAll("(\\d)+", "$1") 45 | .replaceAll("[aeiouy]", "") 46 | + "000").take(3) 47 | } 48 | 49 | def soundex(s: String): String = s.split(" ").map(soundexWord).mkString(" ") 50 | 51 | def truncateWords(s: String): String = s.split(" ").map(strictNormalize(_).take(3)).mkString(" ") 52 | } 53 | -------------------------------------------------------------------------------- /core/src/main/scala/org/allenai/scienceparse/pipeline/SimilarityMeasures.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse.pipeline 2 | 3 | import org.allenai.scienceparse.LabeledData.Reference 4 | import org.allenai.scienceparse.StringUtils 5 | 6 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to 7 | * anticipate how well the pipeline will work with the output from science-parse. */ 8 | object SimilarityMeasures { 9 | def jaccardSim[T](s1: Set[T], s2: Set[T]): Double = { 10 | s1.intersect(s2).size.toDouble / s1.union(s2).size 11 | } 12 | 13 | def containmentJaccardSim[T](s1: Set[T], s2: Set[T]): Double = { 14 | s1.intersect(s2).size.toDouble / math.min(s1.size, s2.size) 15 | } 16 | 17 | def identical(left: String, right: String) = 18 | if (left == right) Some(1.0) else None 19 | 20 | def prePostfix(left: String, right: String, transform: Int => Double = x => x / (x + 0.5)) = { 21 | if (left.length > right.length && (left.startsWith(right) || left.endsWith(right))) { 22 | Some(transform(right.split(" ").length)) 23 | } else { 24 | None 25 | } 26 | } 27 | 28 | def pickFromOptions[T](members: Option[T]*): Option[T] = 29 | members.toSeq.find(_.isDefined).getOrElse(None) 30 | 31 | def twoWayPrePostfix(left: String, right: String, transform: Int => Double = x => x / (x + 0.5)) = 32 | pickFromOptions(prePostfix(left, right, transform), prePostfix(right, left, transform)) 33 | 34 | /** Smooth interpolation between containment Jaccard and plain Jaccard, 35 | * based on character n-grams. 36 | * Short strings must match exactly, but longer strings are considered a match 37 | * if one is a substring of the other. 
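   * Editor's note (added): as implemented below, the containment weight is
   * min(s^(m - l), 100000) where m is the smaller of the two n-gram set sizes; with the
   * defaults s = 1.2 and l = 10, m = 4 gives a weight near 0.33 (plain Jaccard dominates)
   * while m = 30 gives roughly 38 (containment Jaccard dominates).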
38 | * 39 | * The final score is (J + F * JC) / (1 + F) in which 40 | * J is the plain Jaccard 41 | * JC is the containment Jaccard 42 | * F = s ** (m - 1) 43 | * m is the minimum length of the two strings 44 | * s, l are parameters 45 | * 46 | * @param left String to compare 47 | * @param right Other string to compare 48 | * @param ngramLength Longer values will give a larger penalty to single-character typos 49 | * @param s Determines how rapidly F rises with string length 50 | * @param l The string length (in characters) for which which the two Jaccard scores have equal weights 51 | * @return 52 | */ 53 | def characterNgramSimilarity( 54 | left: String, 55 | right: String, 56 | ngramLength: Int = 3, 57 | s: Double = 1.2, 58 | l: Int = 10 59 | ): Option[Double] = { 60 | if (left == right) { 61 | Some(1.0) 62 | } else { 63 | val ngramsLeft = left.sliding(ngramLength).toSet 64 | val ngramsRight = right.sliding(ngramLength).toSet 65 | val minSize = math.min(ngramsLeft.size, ngramsRight.size) 66 | val directSim = jaccardSim(ngramsLeft, ngramsRight) 67 | val containmentSim = containmentJaccardSim(ngramsLeft, ngramsRight) 68 | val containmentWeight = math.min(math.pow(s, minSize - l), 100000.0) 69 | Some((directSim + containmentWeight * containmentSim) / (1.0 + containmentWeight)) 70 | } 71 | } 72 | 73 | def titleNgramSimilarity( 74 | left: TitleAuthors, 75 | right: TitleAuthors, 76 | s: Double = 1.2, 77 | l: Int = 10 78 | ): Option[Double] = { 79 | if (left == right) { 80 | Some(1.0) 81 | } else { 82 | val ngramsLeft = left.normalizedTitleNgrams 83 | val ngramsRight = right.normalizedTitleNgrams 84 | val minSize = math.min(ngramsLeft.size, ngramsRight.size) 85 | val directSim = jaccardSim(ngramsLeft, ngramsRight) 86 | val containmentSim = containmentJaccardSim(ngramsLeft, ngramsRight) 87 | val containmentWeight = math.min(math.pow(s, minSize - l), 100000.0) 88 | Some((directSim + containmentWeight * containmentSim) / (1.0 + containmentWeight)) 89 | } 90 | } 91 | } 92 | 93 | case class AuthorNameMatch(first: String, last: String, full: String) 94 | 95 | case class TitleAuthors(title: String, names: Seq[AuthorNameMatch], year: Option[Int] = None) { 96 | def lastNames: Seq[String] = names.map(_.last) 97 | 98 | def fullNames: Seq[String] = names.map(_.full) 99 | 100 | // Note: There is a slight inversion of control here. This logic would be more properly contained within 101 | // BibEntryToPaperMatcher and TitleAuthorsMatchScheme, but is here for performance reasons. 102 | lazy val normalizedTitleNgrams: Set[String] = Normalizers.alphaNumericNormalize(title).sliding(3).toSet 103 | lazy val normalizedAuthors: Set[String] = names.map(x => Normalizers.alphaNumericNormalize(x.last)).toSet 104 | // Does not include empty strings. 
105 | lazy val normalizedAuthorsAllNames: Set[String] = { 106 | val allNames = names.flatMap(name => Seq(name.first, name.last, name.full)) 107 | val normalized = allNames.map(Normalizers.alphaNumericNormalize) 108 | normalized.filter(_.nonEmpty).toSet 109 | } 110 | } 111 | 112 | object TitleAuthors { 113 | def fromReference(ref: Reference) = TitleAuthors( 114 | ref.title.getOrElse(""), 115 | ref.authors.map { a => 116 | val (first, last) = StringUtils.splitName(a) 117 | AuthorNameMatch(first, last, a) 118 | }, 119 | ref.year 120 | ) 121 | } 122 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/CRFBibRecordParserTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import org.testng.annotations.Test; 8 | 9 | import com.gs.collections.api.tuple.Pair; 10 | 11 | import junit.framework.Assert; 12 | import lombok.val; 13 | import lombok.extern.slf4j.Slf4j; 14 | 15 | @Test 16 | @Slf4j 17 | public class CRFBibRecordParserTest { 18 | 19 | public void testReadData() throws IOException { 20 | File coraFile = new File(this.getClass().getResource("/coratest.txt").getFile()); 21 | val labels = CRFBibRecordParser.labelFromCoraFile(coraFile); 22 | log.info(labels.toString()); 23 | Assert.assertEquals(1, labels.size()); 24 | boolean foundOne = false; 25 | boolean foundTwo = false; 26 | boolean foundThree = false; 27 | for(Pair p : labels.get(0)) { 28 | if(p.getOne().equals("Formalising") && p.getTwo().equals("B_T")) 29 | foundOne = true; 30 | if(p.getOne().equals("formalism.") && p.getTwo().equals("E_T")) 31 | foundTwo = true; 32 | if(p.getOne().equals("1992.") && p.getTwo().equals("W_Y")) 33 | foundThree = true; 34 | } 35 | Assert.assertTrue(foundOne); 36 | Assert.assertTrue(foundTwo); 37 | Assert.assertTrue(foundThree); 38 | 39 | File umassFile = new File(this.getClass().getResource("/umasstest.txt").getFile()); 40 | val labels2 = CRFBibRecordParser.labelFromUMassFile(umassFile); 41 | log.info(labels2.toString()); 42 | Assert.assertEquals(1, labels2.size()); 43 | foundOne = false; 44 | foundTwo = false; 45 | for(Pair p : labels2.get(0)) { 46 | if(p.getOne().equals("E.") && p.getTwo().equals("B_A")) 47 | foundOne = true; 48 | if(p.getOne().equals("1979") && p.getTwo().equals("B_Y")) 49 | foundTwo = true; 50 | } 51 | Assert.assertTrue(foundOne); 52 | Assert.assertTrue(foundTwo); 53 | 54 | File kermitFile = new File(this.getClass().getResource("/kermittest.txt").getFile()); 55 | val labels3 = CRFBibRecordParser.labelFromKermitFile(kermitFile); 56 | log.info(labels3.toString()); 57 | Assert.assertEquals(2, labels3.size()); 58 | foundOne = false; 59 | foundTwo = false; 60 | for(Pair p : labels3.get(1)) { 61 | if(p.getOne().equals("Hinshaw,") && p.getTwo().equals("B_A")) 62 | foundOne = true; 63 | if(p.getOne().equals("Shock") && p.getTwo().equals("E_V")) 64 | foundTwo = true; 65 | } 66 | Assert.assertTrue(foundOne); 67 | Assert.assertTrue(foundTwo); 68 | 69 | } 70 | 71 | public void testCoraLabeling() throws Exception { 72 | String s = " A. Cau Formalising Dijkstra's development strategy within Stark's formalism. BCS-FACS Refinement Workshop, 1992. "; 73 | int tokens = 2 + 21 - 8; //start/stop plus tokens in source minus eight tags. 
74 | List> labeledData = CRFBibRecordParser.getLabeledLineCora(s); 75 | Assert.assertEquals(tokens, labeledData.size()); 76 | Assert.assertEquals("Cau", labeledData.get(2).getOne()); 77 | Assert.assertEquals("E_A", labeledData.get(2).getTwo()); 78 | Assert.assertEquals("Formalising", labeledData.get(3).getOne()); 79 | Assert.assertEquals("B_T", labeledData.get(3).getTwo()); 80 | Assert.assertEquals("development", labeledData.get(5).getOne()); 81 | Assert.assertEquals("I_T", labeledData.get(5).getTwo()); 82 | Assert.assertEquals("1992.", labeledData.get(13).getOne()); 83 | Assert.assertEquals("W_Y", labeledData.get(13).getTwo()); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/CheckReferencesTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import junit.framework.Assert; 4 | import lombok.extern.slf4j.Slf4j; 5 | import org.allenai.datastore.Datastore; 6 | import org.testng.annotations.Test; 7 | 8 | import java.io.IOException; 9 | import java.util.Arrays; 10 | 11 | @Test 12 | @Slf4j 13 | public class CheckReferencesTest { 14 | public void smallTest() throws IOException { 15 | final String jsonFile = 16 | Datastore.apply().filePath("org.allenai.scienceparse", "gazetteer.json", 5).toString(); 17 | CheckReferences cr = new CheckReferences(jsonFile); 18 | log.info("num hashes: " + cr.getHashSize()); 19 | Assert.assertEquals(1557178, cr.getHashSize()); 20 | Assert.assertTrue(cr.hasPaper( 21 | "Ecological Sampling of Gaze Shifts", 22 | Arrays.asList("Giuseppe Boccignone", 23 | "Mario Ferraro"), 2014, "KDD")); 24 | Assert.assertTrue(cr.hasPaper( 25 | "HIST: A Methodology for the Automatic Insertion of a Hierarchical Self Test", 26 | Arrays.asList("Oliver F. 
Haberl", 27 | "Thomas Kropf"), 1992, "KDD")); 28 | Assert.assertFalse(cr.hasPaper( 29 | "Fake paper titles: A case study in negative examples", 30 | Arrays.asList("Kevin Bache", 31 | "David Newman", 32 | "Padhraic Smyth"), 2013, "KDD")); 33 | Assert.assertFalse(cr.hasPaper( 34 | "Text-based measures of document diversity", 35 | Arrays.asList("Captain Bananas", 36 | "David Newman", 37 | "Padhraic Smyth"), 2013, "KDD")); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/GazetteerFeaturesTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.testng.annotations.Test; 7 | 8 | import junit.framework.Assert; 9 | import lombok.val; 10 | import lombok.extern.slf4j.Slf4j; 11 | 12 | @Test 13 | @Slf4j 14 | public class GazetteerFeaturesTest { 15 | 16 | public String filePathOfResource(String path) { 17 | return this.getClass().getResource(path).getFile(); 18 | } 19 | 20 | public void testLength() { 21 | Assert.assertTrue(GazetteerFeatures.withinLength("this string is only six words.")); 22 | Assert.assertFalse(GazetteerFeatures.withinLength("this string by contrast is eight words long.")); 23 | 24 | } 25 | 26 | public void testGazetteers() throws Exception { 27 | GazetteerFeatures gf = new GazetteerFeatures(filePathOfResource("/gazetteer-test/")); 28 | 29 | int namesId = gf.gazetteerNumber("names.male.txt"); 30 | int univId = gf.gazetteerNumber("education.university.small.txt"); 31 | Assert.assertEquals(gf.size(), 2); 32 | Assert.assertEquals(3, gf.sizeOfSet(univId)); 33 | Assert.assertEquals(5, gf.sizeOfSet(namesId)); 34 | boolean [] abbeyInSet = gf.inSet("Abbey"); 35 | Assert.assertEquals(2, abbeyInSet.length); 36 | Assert.assertFalse(abbeyInSet[univId]); 37 | Assert.assertTrue(abbeyInSet[namesId]); 38 | boolean [] beautyInSet = gf.inSet("marinello school of beauty"); 39 | Assert.assertEquals(2, beautyInSet.length); 40 | Assert.assertTrue(beautyInSet[univId]); 41 | Assert.assertFalse(beautyInSet[namesId]); 42 | boolean [] wilkinsInSet = gf.inSet("d. 
wilkins school of windmill dunks"); 43 | Assert.assertEquals(2, wilkinsInSet.length); 44 | Assert.assertFalse(wilkinsInSet[univId]); 45 | Assert.assertFalse(wilkinsInSet[namesId]); 46 | boolean [] apolloInSet = gf.inSet("Apollo College Phoenix Inc."); 47 | Assert.assertTrue(apolloInSet[univId]); 48 | } 49 | 50 | public void testGazetteerFeatures() throws Exception { 51 | List elems = Arrays.asList("Abbey", "is", "at", "Apollo", "College", "Phoenix", "Inc."); 52 | ReferencesPredicateExtractor rpe = new ReferencesPredicateExtractor(); 53 | GazetteerFeatures gf = new GazetteerFeatures(filePathOfResource("/gazetteer-test/")); 54 | val spns = gf.getSpans(elems); 55 | log.info(spns.toString()); 56 | Assert.assertEquals(2, spns.size()); 57 | 58 | rpe.setGf(gf); 59 | val preds = rpe.nodePredicates(elems); 60 | log.info(preds.toString()); 61 | Assert.assertEquals(1.0, preds.get(0).get("%gaz_W_names.male.txt")); 62 | Assert.assertFalse(preds.get(2).containsKey("%gaz_B_education.university.small.txt")); 63 | Assert.assertEquals(1.0, preds.get(3).get("%gaz_B_education.university.small.txt")); 64 | Assert.assertEquals(1.0, preds.get(4).get("%gaz_I_education.university.small.txt")); 65 | Assert.assertEquals(1.0, preds.get(6).get("%gaz_E_education.university.small.txt")); 66 | 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/HeaderIntegrationTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import junit.framework.Assert; 4 | import lombok.extern.slf4j.Slf4j; 5 | import lombok.val; 6 | import org.apache.pdfbox.io.IOUtils; 7 | import org.testng.annotations.Test; 8 | 9 | import java.io.*; 10 | import java.net.URL; 11 | import java.nio.file.Files; 12 | import java.util.*; 13 | import java.util.stream.Collectors; 14 | 15 | @Slf4j 16 | public class HeaderIntegrationTest { 17 | private final static PaperSource paperSource = PaperSource.getDefault(); 18 | 19 | static final int kSampledPapers = 100; 20 | 21 | public static class Result { 22 | int authorHits; 23 | int authorInvalid; 24 | boolean titleMatch; 25 | String title; 26 | int totalAuthors; 27 | boolean titleMissing; 28 | } 29 | 30 | public static HashSet authorSet(Iterable authors) { 31 | HashSet result = new HashSet(); 32 | for (String author : authors) { 33 | result.add(Parser.lastName(author)); 34 | } 35 | return result; 36 | } 37 | 38 | public static Result testPaper( 39 | final Parser parser, 40 | final ParserGroundTruth pgt, 41 | final String paperId 42 | ) { 43 | ExtractedMetadata metadata; 44 | 45 | ParserGroundTruth.Paper paper = pgt.forKey(paperId.substring(4)); 46 | 47 | try { 48 | metadata = parser.doParse( 49 | paperSource.getPdf(paperId), 50 | Parser.MAXHEADERWORDS); 51 | } catch (Exception e) { 52 | log.info("Failed to parse or extract from {}. 
Skipping.", paper.url); 53 | return null; 54 | } 55 | 56 | HashSet golden = authorSet(Arrays.asList(paper.authors)); 57 | HashSet extracted = authorSet(metadata.authors); 58 | 59 | int hits = 0; 60 | int invalid = 0; 61 | for (String name : golden) { 62 | if (extracted.contains(name)) { 63 | hits += 1; 64 | } 65 | } 66 | for (String name : extracted) { 67 | if (!golden.contains(name)) { 68 | log.info("Bad author {}: {} ", name, 69 | String.join(",", golden.toArray(new String[]{})) 70 | ); 71 | invalid += 1; 72 | } 73 | } 74 | Result res = new Result(); 75 | res.totalAuthors = golden.size(); 76 | res.authorHits = hits; 77 | res.authorInvalid = invalid; 78 | res.title = paper.title; 79 | 80 | if (metadata.title == null) { 81 | res.titleMatch = false; 82 | res.titleMissing = true; 83 | } else { 84 | res.titleMatch = Parser.processTitle(paper.title) 85 | .equals(Parser.processTitle(metadata.title)); 86 | } 87 | 88 | 89 | if (res.authorInvalid > 0 || !res.titleMatch) { 90 | metadata.authors.sort((String a, String b) -> a.compareTo(b)); 91 | Arrays.sort(paper.authors); 92 | log.info("Failed match for paper {}.", paperId); 93 | log.info("Titles: GOLD:\n{} OURS:\n{}", paper.title, metadata.title); 94 | for (int i = 0; i < Math.max(paper.authors.length, metadata.authors.size()); ++i) { 95 | String goldAuthor = null; 96 | String metaAuthor = null; 97 | if (i < paper.authors.length) { goldAuthor = paper.authors[i]; } 98 | if (i < metadata.authors.size()) { metaAuthor = metadata.authors.get(i); } 99 | log.info("Author: ({}) ({})", goldAuthor, metaAuthor); 100 | } 101 | } 102 | 103 | return res; 104 | } 105 | 106 | public void testAuthorAndTitleExtraction() throws Exception { 107 | ParserGroundTruth pgt = new ParserGroundTruth( 108 | Files.newInputStream(Parser.getDefaultGazetteer())); 109 | 110 | // TODO (build and train a classifier at test time). 111 | // Parser parser = trainParser(pgt); 112 | Parser parser = new Parser(); 113 | 114 | ArrayList sampledPapers = new ArrayList<>(); 115 | 116 | for (int i = 0; i < pgt.papers.size(); i += pgt.papers.size() / kSampledPapers) { 117 | sampledPapers.add(pgt.papers.get(i)); 118 | } 119 | 120 | long startTime = System.currentTimeMillis(); 121 | ArrayList results = sampledPapers 122 | .stream() 123 | .parallel() 124 | .map(p -> testPaper(parser, pgt, p.id)) 125 | .filter(f -> f != null) 126 | .collect(Collectors.toCollection(ArrayList::new)); 127 | 128 | // Gahh I wish I had a dataframe library... 129 | int totalHits = 0, totalInvalid = 0, totalAuthors = 0, titleMatches = 0, titleMissing = 0; 130 | for (Result res : results) { 131 | totalHits += res.authorHits; 132 | totalInvalid += res.authorInvalid; 133 | totalAuthors += res.totalAuthors; 134 | if (res.titleMatch) { 135 | titleMatches += 1; 136 | } 137 | if (res.titleMissing) { 138 | titleMissing += 1; 139 | } 140 | } 141 | 142 | long finishTime = System.currentTimeMillis(); 143 | double elapsed = (finishTime - startTime) / 1000.0; 144 | log.info("Testing complete. 
{} papers processed in {} seconds; {} papers/sec ", 145 | results.size(), elapsed, results.size() / elapsed); 146 | 147 | Assert.assertTrue(results.size() > 5); 148 | 149 | log.info("Authors: {} (Match: {} Invalid: {} Total {})", 150 | totalHits / (double)totalAuthors, totalHits, totalInvalid, totalAuthors); 151 | log.info("Titles: {} (Match: {} Missing: {} Total {})", 152 | titleMatches / (double)results.size(), titleMatches, titleMissing, results.size()); 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/PDFPredicateExtractorTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.gs.collections.api.map.primitive.ObjectDoubleMap; 4 | import com.gs.collections.api.tuple.Pair; 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.allenai.scienceparse.pdfapi.PDFDoc; 7 | import org.allenai.scienceparse.pdfapi.PDFExtractor; 8 | import org.testng.Assert; 9 | import org.testng.annotations.Test; 10 | 11 | import java.io.File; 12 | import java.io.FileInputStream; 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.util.Arrays; 16 | import java.util.Iterator; 17 | import java.util.List; 18 | 19 | @Slf4j 20 | public class PDFPredicateExtractorTest { 21 | 22 | private void titleFontFeatureCheckForStream(InputStream pdfInputStream) throws IOException { 23 | String target = "How to make words with vectors: Phrase generation in distributional semantics"; 24 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream); 25 | List pts = PDFToCRFInput.getSequence(doc); 26 | // Iterator it = pts.iterator(); 27 | // while(it.hasNext()) { 28 | // PaperToken pt = it.next(); 29 | // log.info((pt.getPdfToken()==null)?"null":pt.getPdfToken().token + " f:" + pt.getPdfToken().fontMetrics.ptSize); 30 | // } 31 | Pair pos = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), target); 32 | PDFPredicateExtractor ppe = new PDFPredicateExtractor(); 33 | List> preds = ppe.nodePredicates(pts); 34 | int[] idxes = new int[]{pos.getOne() - 1, pos.getOne(), 35 | pos.getTwo(), pos.getTwo() + 1, pos.getTwo() + 2}; 36 | log.info("fonts for " + Arrays.toString(idxes)); 37 | log.info(Arrays.toString(Arrays.stream(idxes).mapToDouble((int a) -> preds.get(a).get("%font")).toArray())); 38 | log.info("tokens for " + Arrays.toString(idxes)); 39 | log.info(Arrays.toString(Arrays.stream(idxes).mapToObj((int a) -> pts.get(a).getPdfToken().token).toArray())); 40 | 41 | 42 | Assert.assertEquals(preds.get(pos.getOne()).get("%fcb"), 1.0); 43 | Assert.assertTrue(!preds.get(pos.getTwo() - 1).containsKey("%fcb")); 44 | log.info("Title font change features correct."); 45 | } 46 | 47 | @Test 48 | public void titleFontFeatureCheck() throws IOException { 49 | InputStream is = PDFPredicateExtractorTest.class.getResource("/P14-1059.pdf").openStream(); 50 | titleFontFeatureCheckForStream(is); 51 | is.close(); 52 | } 53 | 54 | public void titleFontForExplicitFilePath(String f) throws IOException { 55 | InputStream is = new FileInputStream(new File(f)); 56 | titleFontFeatureCheckForStream(is); 57 | is.close(); 58 | } 59 | 60 | @Test 61 | public void testCaseMasks() { 62 | String cap = "Exploring"; 63 | List ls = PDFPredicateExtractor.getCaseMasks(cap); 64 | Assert.assertEquals(ls.size(), 2); 65 | Assert.assertTrue(ls.contains("%Xxx")); 66 | Assert.assertTrue(ls.contains("%letters")); 67 | 68 | String nonSimple = 
"Dharmaratnå"; 69 | ls = PDFPredicateExtractor.getCaseMasks(nonSimple); 70 | Assert.assertTrue(ls.contains("%hasNonAscii")); 71 | Assert.assertTrue(!ls.contains("%hasAt")); 72 | 73 | String email = "bob@joe.com"; 74 | ls = PDFPredicateExtractor.getCaseMasks(email); 75 | Assert.assertTrue(ls.contains("%hasAt")); 76 | } 77 | 78 | public static void main(String [] args) throws Exception { 79 | (new PDFPredicateExtractorTest()).titleFontForExplicitFilePath("src\\test\\resources\\P14-1059.pdf"); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/PDFToCRFInputTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.gs.collections.api.tuple.Pair; 4 | import com.gs.collections.impl.tuple.Tuples; 5 | import lombok.extern.slf4j.Slf4j; 6 | import lombok.val; 7 | import org.allenai.scienceparse.pdfapi.PDFDoc; 8 | import org.allenai.scienceparse.pdfapi.PDFExtractor; 9 | import org.testng.Assert; 10 | import org.testng.annotations.Test; 11 | import scala.Option; 12 | import scala.Some; 13 | 14 | import java.io.IOException; 15 | import java.io.InputStream; 16 | import java.sql.Date; 17 | import java.util.Arrays; 18 | import java.util.List; 19 | import java.util.regex.Pattern; 20 | import java.util.stream.Collectors; 21 | 22 | @Test 23 | @Slf4j 24 | public class PDFToCRFInputTest { 25 | public String filePathOfResource(String path) { 26 | return this.getClass().getResource(path).getFile(); 27 | } 28 | 29 | public void testGetPaperTokens() throws IOException { 30 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf"); 31 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream); 32 | List pts = PDFToCRFInput.getSequence(doc); 33 | log.info("got " + pts.size() + " things."); 34 | assert (pts.size() > 50); 35 | } 36 | 37 | public void testFindString() throws IOException { 38 | String target = "How to make words with vectors: Phrase generation in distributional semantics"; 39 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf"); 40 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream); 41 | List pts = PDFToCRFInput.getSequence(doc); 42 | Pair pos = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), target); 43 | Pair posNot = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), "this string won't be found"); 44 | 45 | Assert.assertTrue(pos != null); 46 | Assert.assertTrue(pos.getOne() > 0 && (pos.getTwo() - pos.getOne() == 11)); 47 | log.info("found title at " + pos.getOne() + ", " + pos.getTwo()); 48 | log.info("title is " + PDFToCRFInput.stringAt(pts, pos)); 49 | Assert.assertTrue(posNot == null); 50 | } 51 | 52 | public void testLabelMetadata() throws IOException { 53 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf"); 54 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream); 55 | List pts = PDFToCRFInput.getSequence(doc); 56 | final ExtractedMetadata em = new ExtractedMetadata( 57 | "How to make words with vectors: Phrase generation in distributional semantics", 58 | Arrays.asList("Georgiana Dinu", "Marco Baroni"), 59 | new Date(1388556000000L)); 60 | val labeledData = LabeledData$.MODULE$.fromExtractedMetadata("dummyid", em); 61 | val result = PDFToCRFInput.labelMetadata("P14-1059", pts, labeledData); 62 | 
log.info(PDFToCRFInput.getLabelString(result)); 63 | log.info(pts.stream().map((PaperToken p) -> p.getPdfToken().token).collect(Collectors.toList()).toString()); 64 | Assert.assertEquals(result.get(26).getTwo(), "O"); 65 | Assert.assertEquals(result.get(27).getTwo(), "B_T"); 66 | Assert.assertEquals(result.get(34).getTwo(), "I_T"); 67 | Assert.assertEquals(result.get(37).getTwo(), "E_T"); 68 | Assert.assertEquals(result.get(38).getTwo(), "B_A"); 69 | Assert.assertEquals(result.get(47).getTwo(), "O"); 70 | Assert.assertEquals(result.get(47).getOne(), pts.get(46)); //off by one due to start/stop 71 | Assert.assertEquals(result.get(0).getTwo(), ""); 72 | Assert.assertEquals(result.get(result.size() - 1).getTwo(), ""); 73 | } 74 | 75 | public void testGetSpans() { 76 | List ls = Arrays.asList("O", "O", "B_A", "I_A", "E_A"); 77 | val spans = ExtractedMetadata.getSpans(ls); 78 | Assert.assertEquals(spans.size(), 1); 79 | Assert.assertEquals(spans.get(0).tag, "A"); 80 | Assert.assertEquals(spans.get(0).loc, Tuples.pair(2, 5)); 81 | } 82 | 83 | public void testAuthorPatterns() { 84 | List> authOpt = PDFToCRFInput.authorToPatternOptPair("Marco C. Baroni"); 85 | Assert.assertTrue(authOpt.get(0).getOne().matcher("Marco").matches()); 86 | Assert.assertTrue(authOpt.get(1).getOne().matcher("C").matches()); 87 | Assert.assertTrue(authOpt.get(2).getOne().matcher("Baroni").matches()); 88 | Pair span = PDFToCRFInput.findPatternSequence(Arrays.asList("Marco", "C", "Baroni"), authOpt); 89 | Assert.assertEquals(span, Tuples.pair(0, 3)); 90 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("Marco", "Baroni"), authOpt); 91 | Assert.assertEquals(span, Tuples.pair(0, 2)); 92 | authOpt = PDFToCRFInput.authorToPatternOptPair("Marco Baroni"); 93 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("M.", "G.", "Baroni"), authOpt); 94 | Assert.assertEquals(span, Tuples.pair(0, 3)); 95 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("M.", "G.", "B."), authOpt); 96 | Assert.assertEquals(span, null); 97 | } 98 | 99 | public void testAuthor() throws IOException { 100 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf"); 101 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream); 102 | List pts = PDFToCRFInput.getSequence(doc); 103 | final ExtractedMetadata em = new ExtractedMetadata( 104 | "How to make words with vectors: Phrase generation in distributional semantics", 105 | Arrays.asList("Georgiana Dinu", "Marco C. 
Baroni"), 106 | new Date(1388556000000L)); 107 | val labeledData = LabeledData$.MODULE$.fromExtractedMetadata("dummyid", em); 108 | 109 | val result = PDFToCRFInput.labelMetadata("P14-1059", pts, labeledData); 110 | Assert.assertEquals(result.get(38).getTwo(), "B_A"); 111 | Assert.assertEquals(result.get(39).getTwo(), "E_A"); 112 | Assert.assertEquals(result.get(41).getTwo(), "B_A"); 113 | Assert.assertEquals(result.get(42).getTwo(), "E_A"); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/ParserLMFeaturesTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.gs.collections.impl.set.mutable.UnifiedSet; 4 | import junit.framework.Assert; 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.testng.annotations.Test; 7 | 8 | import java.io.File; 9 | 10 | @Test 11 | @Slf4j 12 | public class ParserLMFeaturesTest { 13 | 14 | public String filePathOfResource(String path) { 15 | return this.getClass().getResource(path).getFile(); 16 | } 17 | 18 | public void testParserLMFeatures() throws Exception { 19 | File f = new File(filePathOfResource("/groundTruth.json")); 20 | ParserGroundTruth pgt = new ParserGroundTruth(f.getPath()); 21 | log.info("pgt 0: " + pgt.papers.get(0)); 22 | ParserLMFeatures plf = new ParserLMFeatures(pgt.papers, new UnifiedSet(), f.getParentFile(), 3); 23 | log.info("of count in background: " + plf.backgroundBow.get("of")); 24 | Assert.assertEquals(1.0, plf.authorBow.get("Seebode")); 25 | Assert.assertEquals(1.0, plf.titleBow.get("Disk-based")); 26 | Assert.assertTrue(plf.backgroundBow.get("of") > 2.0); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /core/src/test/java/org/allenai/scienceparse/ParserTest.java: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.gs.collections.api.tuple.Pair; 5 | import com.gs.collections.impl.tuple.Tuples; 6 | import lombok.extern.slf4j.Slf4j; 7 | import lombok.val; 8 | import org.testng.Assert; 9 | import org.testng.annotations.Test; 10 | import scala.Function0; 11 | import scala.Option; 12 | import scala.collection.JavaConverters; 13 | import scala.runtime.AbstractFunction0; 14 | 15 | import java.io.File; 16 | import java.io.FileInputStream; 17 | import java.io.InputStream; 18 | import java.nio.file.Paths; 19 | import java.util.Arrays; 20 | import java.util.Iterator; 21 | import java.util.List; 22 | import java.util.function.Function; 23 | import java.util.stream.Collectors; 24 | 25 | @Test 26 | @Slf4j 27 | public class ParserTest { 28 | 29 | private final static List pdfKeys = Arrays.asList("/bagnell11", "/seung08", "/ding11", "/mooney05", 30 | "/roark13", "/dyer12", "/bohnet09", "/P14-1059", "/map-reduce", "/fader11", "/proto06", 31 | "/agarwal11", "/smola10", "/senellart10", "/zolotov04", "/pedersen04", "/smith07", 32 | "/aimag10"); 33 | 34 | public static String filePathOfResource(String path) { 35 | return ParserTest.class.getResource(path).getFile(); 36 | } 37 | 38 | public static String resourceDirectory(String path) { 39 | return (new File(ParserTest.class.getResource(path).getFile())).getParent(); 40 | } 41 | 42 | public static InputStream inputStreamOfResource(String path) throws Exception { 43 | return new FileInputStream(new 
File(filePathOfResource(path))); 44 | } 45 | 46 | private List resolveKeys(List keys) { 47 | return keys.stream().map((String s) -> new File(filePathOfResource(s + ".pdf"))).collect(Collectors.toList()); 48 | } 49 | 50 | private Pair testModel(String id, Parser p) throws Exception { 51 | String jsonPath = id + ".extraction.json"; 52 | String pdfPath = id + ".pdf"; 53 | InputStream jsonInputStream = getClass().getResourceAsStream(jsonPath); 54 | InputStream pdfInputStream = getClass().getResourceAsStream(pdfPath); 55 | List> arr = new ObjectMapper().readValue(jsonInputStream, List.class); 56 | jsonInputStream.close(); 57 | ExtractedMetadata em = p.doParse(pdfInputStream, Parser.MAXHEADERWORDS); 58 | pdfInputStream.close(); 59 | 60 | double titleTP = 0.0; 61 | double titleFP = 0.0; 62 | double authorTP = 0.0; 63 | double authorFN = 0.0; 64 | for (List elems : arr) { 65 | String type = (String) elems.get(0); 66 | Object expectedValue = elems.get(1); 67 | if (type.equalsIgnoreCase("title")) { 68 | String guessValue = em.title; 69 | if (guessValue != null && guessValue.equals(expectedValue)) 70 | titleTP++; 71 | else 72 | titleFP++; 73 | //Assert.assertEquals(guessValue, expectedValue, String.format("Title error on %s", id)); 74 | } 75 | if (type.equalsIgnoreCase("author")) { 76 | if (em.authors.contains(expectedValue)) 77 | authorTP++; 78 | else 79 | authorFN++; 80 | //Assert.assertTrue(em.authors.contains(expectedValue), 81 | //"could not find author " + expectedValue + " in extracted authors " + em.authors.toString()); 82 | } 83 | // if (type.equalsIgnoreCase("year")) { 84 | // Assert.assertEquals(em.year, expectedValue, String.format("Year error on %s", id)); 85 | // } 86 | } 87 | return Tuples.pair((titleTP / (titleTP + titleFP + 0.000001)), authorTP / (authorTP + authorFN + 0.000001)); 88 | } 89 | 90 | public void testParserWithGroundTruth() throws Exception { 91 | final File testModelFile = File.createTempFile("science-parse-test-model.", ".dat"); 92 | testModelFile.deleteOnExit(); 93 | 94 | /* 95 | * We'll use this to override the default paper source which pulls from S2. The problem with 96 | * pulling from S2 is that the set of publicly available PDFs changes over time making this 97 | * test rather flappy. 
98 | */ 99 | PaperSource previousSource = PaperSource.defaultPaperSource; 100 | PaperSource.defaultPaperSource = new DirectoryPaperSource( 101 | new File(resourceDirectory("/groundTruth.json"))); 102 | 103 | try { 104 | Parser.ParseOpts opts = new Parser.ParseOpts(); 105 | opts.iterations = 10; 106 | opts.threads = 4; 107 | opts.modelFile = testModelFile.getPath(); 108 | opts.headerMax = Parser.MAXHEADERWORDS; 109 | opts.backgroundSamples = 3; 110 | opts.gazetteerFile = null; 111 | opts.trainFraction = 0.9; 112 | opts.backgroundDirectory = resourceDirectory("/groundTruth.json"); 113 | opts.minYear = -1; 114 | opts.checkAuthors = false; 115 | 116 | File f = new File(opts.modelFile); 117 | f.deleteOnExit(); 118 | 119 | final Iterator labeledTrainingData = 120 | JavaConverters.asJavaIteratorConverter( 121 | LabeledPapersFromDBLP.getFromGroundTruth( 122 | Paths.get(filePathOfResource("/groundTruth.json")))).asJava(); 123 | 124 | Parser.trainParser(labeledTrainingData, opts); 125 | final Parser p = new Parser( 126 | testModelFile, 127 | Parser.getDefaultGazetteer().toFile(), 128 | Parser.getDefaultBibModel().toFile()); 129 | double avgTitlePrec = 0.0; 130 | double avgAuthorRec = 0.0; 131 | double cases = 0.0; 132 | for (String s : pdfKeys) { 133 | val res = testModel(s, p); 134 | cases++; 135 | avgTitlePrec += res.getOne(); 136 | avgAuthorRec += res.getTwo(); 137 | } 138 | avgTitlePrec /= cases; 139 | avgAuthorRec /= cases; 140 | log.info("Title precision = recall = " + avgTitlePrec); 141 | log.info("Author recall = " + avgAuthorRec); 142 | 143 | testModelFile.delete(); 144 | } finally { 145 | PaperSource.defaultPaperSource = previousSource; 146 | } 147 | } 148 | 149 | public void testParserGroundTruth() throws Exception { 150 | ParserGroundTruth pgt = new ParserGroundTruth(filePathOfResource("/groundTruth.json")); 151 | Assert.assertEquals(pgt.papers.size(), 4); 152 | } 153 | 154 | public void testParserRobustness() throws Exception { 155 | // ParserGroundTruth pgt = new ParserGroundTruth(filePathOfResource("/papers-parseBugs.json")); 156 | // Assert.assertEquals(false, true); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /core/src/test/resources/2a774230b5328df3f8125da9b84a82d92b46a240.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/2a774230b5328df3f8125da9b84a82d92b46a240.pdf -------------------------------------------------------------------------------- /core/src/test/resources/403b61d52192d6cf23c92a3da68ba08f03a954e4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/403b61d52192d6cf23c92a3da68ba08f03a954e4.pdf -------------------------------------------------------------------------------- /core/src/test/resources/6c46de8a4399840548a056d13d38e1f54da2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/6c46de8a4399840548a056d13d38e1f54da2.pdf -------------------------------------------------------------------------------- /core/src/test/resources/P07-1088-labels.txt: -------------------------------------------------------------------------------- 1 | TITLE Sparse 
Information Extraction: Unsupervised Language Models to the Rescue 2 | AUTHOR Doug Downey 3 | AUTHOR StefanSchoenmackers 4 | AUTHOR Oren Etzioni 5 | -------------------------------------------------------------------------------- /core/src/test/resources/P07-1088.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/P07-1088.pdf -------------------------------------------------------------------------------- /core/src/test/resources/P14-1059-labels.txt: -------------------------------------------------------------------------------- 1 | TITLE How to make words with vectors: Phrase generation in distributional semantics 2 | AUTHOR Georgiana Dinu 3 | AUTHOR Marco Baroni 4 | ABSTRACT We introduce the problem of generation in distributional semantics: Given a distributional vector representing some meaning, how can we generate the phrase that best expresses that meaning? We motivate this novel challenge on theoretical and practical grounds and propose a simple data-driven approach to the estimation of generation functions. We test this in a monolingual scenario (paraphrase generation) as well as in a cross-lingual setting(translation by synthesizing adjective-noun phrase vectors in English and generating the equivalent expressions in Italian). 5 | -------------------------------------------------------------------------------- /core/src/test/resources/P14-1059.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "How to make words with vectors: Phrase generation in distributional semantics"], 3 | ["line", "Abstract"], 4 | ["line", "We introduce the problem of generation"], 5 | ["line", "space functioning as interlingua."], 6 | ["year", 2014] 7 | ] -------------------------------------------------------------------------------- /core/src/test/resources/P14-1059.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/P14-1059.pdf -------------------------------------------------------------------------------- /core/src/test/resources/a7c25298c607d5bf32e3301b6b209431e2a7f830.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/a7c25298c607d5bf32e3301b6b209431e2a7f830.pdf -------------------------------------------------------------------------------- /core/src/test/resources/agarwal11.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Noisy Matrix Decomposition via Convex Relaxation: Optimal Rates in High Dimensions"], 3 | ["line", "Abstract"], 4 | ["line", "In this paper, we consider a family of regularizers"] 5 | ] -------------------------------------------------------------------------------- /core/src/test/resources/agarwal11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/agarwal11.pdf -------------------------------------------------------------------------------- /core/src/test/resources/aimag10.extraction.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | ["line", "Adapting Open"], 3 | ["year", 2010] 4 | ] -------------------------------------------------------------------------------- /core/src/test/resources/aimag10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/aimag10.pdf -------------------------------------------------------------------------------- /core/src/test/resources/bagnell11.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Computational Rationalization: The Inverse Equilibrium Problem"], 3 | ["line", "Abstract"], 4 | ["line", "techniques that both explains demonstrated behavior"], 5 | ["line", "3.1. Rationality Assumptions"] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/bagnell11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bagnell11.pdf -------------------------------------------------------------------------------- /core/src/test/resources/bohnet09.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Efficient Parsing of Syntactic and Semantic Dependency Structures"], 3 | ["line", "Abstract"], 4 | ["line", "proach can compute a projective dependency tree"], 5 | ["year", 2009] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/bohnet09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bohnet09.pdf -------------------------------------------------------------------------------- /core/src/test/resources/bunescu-acl07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bunescu-acl07.pdf -------------------------------------------------------------------------------- /core/src/test/resources/bunescu-acl07.txt: -------------------------------------------------------------------------------- 1 | TITLE = Learning to Extract Relations from the Web using Minimal Supervision 2 | AUTHOR = Razvan C. Bunescu 3 | AUTHOR = Raymond J. 
Mooney 4 | -------------------------------------------------------------------------------- /core/src/test/resources/c0690a1d74ab781bd54f9fa7e67267cce656.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/c0690a1d74ab781bd54f9fa7e67267cce656.pdf -------------------------------------------------------------------------------- /core/src/test/resources/c921a74c209e720534939dfa191d639e647dd242.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/c921a74c209e720534939dfa191d639e647dd242.pdf -------------------------------------------------------------------------------- /core/src/test/resources/coordinate_calibrator.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/coordinate_calibrator.pdf -------------------------------------------------------------------------------- /core/src/test/resources/coratest.txt: -------------------------------------------------------------------------------- 1 | A. Cau, R. Kuiper, and W.-P. de Roever. Formalising Dijkstra's development strategy within Stark's formalism. In C. B. Jones, R. C. Shaw, and T. Denvir, editors, Proc. 5th. BCS-FACS Refinement Workshop, 1992. 2 | -------------------------------------------------------------------------------- /core/src/test/resources/ding11.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title","Maximum Margin Multi-Instance Learning"], 3 | ["line", "Abstract"], 4 | ["line", "Multi-instance data are different from traditional single-instance data, which bring new opportunities"], 5 | ["year", 2011] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/ding11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/ding11.pdf -------------------------------------------------------------------------------- /core/src/test/resources/dyer12.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Joint Feature Selection in Distributed Stochastic Learning for Large-Scale Discriminative Training in SMT"], 3 | ["line", "Abstract"], 4 | ["line", "Since inference for SMT (unlike many other learn-"], 5 | ["line", "data. 
Feature groups are 12 dense features (default), rule identifiers (id), rule n-gram (ng), and rule shape (shape)."], 6 | ["year", 2012] 7 | ] -------------------------------------------------------------------------------- /core/src/test/resources/dyer12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/dyer12.pdf -------------------------------------------------------------------------------- /core/src/test/resources/e4faf2c1d76b9bf8f8b4524dfb8c5c6b93be5f35.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/e4faf2c1d76b9bf8f8b4524dfb8c5c6b93be5f35.pdf -------------------------------------------------------------------------------- /core/src/test/resources/fader11.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Identifying Relations for Open Information Extraction"], 3 | ["line", "Abstract"], 4 | ["line", "Incoherent extractions are cases where the ex-"], 5 | ["year", 2011] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/fader11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/fader11.pdf -------------------------------------------------------------------------------- /core/src/test/resources/gazetteer-test/education.university.small.txt: -------------------------------------------------------------------------------- 1 | universidad pontificia bolivariana 2 | apollo college-phoenix inc 3 | marinello school of beauty 4 | -------------------------------------------------------------------------------- /core/src/test/resources/gazetteer-test/names.male.txt: -------------------------------------------------------------------------------- 1 | # You may use the lists of names for any purpose, so long as credit is given 2 | # in any published work. You may also redistribute the list if you 3 | # provide the recipients with a copy of this mail message. The lists are 4 | # not in the public domain (I retain the copyright on the lists) but are 5 | # freely redistributable. 6 | 7 | # If you have any additions to the lists of names, I would appreciate 8 | # receiving them. 9 | 10 | # My email address is mkant+@cs.cmu.edu. 11 | 12 | # --mark 13 | 14 | # **************************************************************************** 15 | 16 | # List of common male names. 17 | # Copyright (c) January 1991 by Mark Kantrowitz. 18 | # 2924 names 19 | # Thanks to Bill Ross for about 1000 additional names. 
20 | 21 | Aaron 22 | Abbey 23 | Abbie 24 | Abbot 25 | Abbott -------------------------------------------------------------------------------- /core/src/test/resources/groundTruth.json: -------------------------------------------------------------------------------- 1 | { "id": "c921a74c209e720534939dfa191d639e647dd242", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/c921/a74c209e720534939dfa191d639e647dd242.pdf", "title": "Did you notice?: neuronal processing of multimodal mobile phone feedback", "authors": [ "Antons, Jan-Niklas", "Arndt, Sebastian", "Seebode, Julia", "Schleicher, Robert", "M�ller, Sebastian" ], "year": 2013, "venue": "" } 2 | { "id": "2a774230b5328df3f8125da9b84a82d92b46a240", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/2a77/4230b5328df3f8125da9b84a82d92b46a240.pdf", "title": "Disk-based storage for scalable video", "authors": [ "Chang, Edward Y.", "Zakhor, Avideh" ], "year": 1997, "venue": "" } 3 | { "id": "403b61d52192d6cf23c92a3da68ba08f03a954e4", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/403b/61d52192d6cf23c92a3da68ba08f03a954e4.pdf", "title": "Smoothness-Increasing Accuracy-Conserving (SIAC) Postprocessing for Discontinuous Galerkin Solutions over Structured Triangular Meshes", "authors": [ "Mirzaee, Hanieh", "Ji, Liangyue", "Ryan, Jennifer K.", "Kirby, Robert M." ], "year": 2011, "venue": "" } 4 | { "id": "a7c25298c607d5bf32e3301b6b209431e2a7f830", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/a7c2/5298c607d5bf32e3301b6b209431e2a7f830.pdf", "title": "Mining Generalized Association Rules on Biomedical Literature", "authors": [ "Berardi, Margherita", "Lapi, Michele", "Leo, Pietro", "Loglisci, Corrado" ], "year": 2005, "venue": "" } 5 | -------------------------------------------------------------------------------- /core/src/test/resources/kermittest.txt: -------------------------------------------------------------------------------- 1 | Tracey, et al. , Nature 330 , 662-664 ( 1987 ) 2 | Hinshaw, et al. , Circ. Shock 30 , 279-292 ( 1990 ) 3 | -------------------------------------------------------------------------------- /core/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${logback_stdoutLevel:-DEBUG} 6 | 7 | 8 | %-5level %logger{36}: %msg%n 9 | 10 | 11 | 12 | 13 | false 14 | EvalErrors.log 15 | 16 | %msg%n 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /core/src/test/resources/map-reduce.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["line", "As a reaction to this complexity, we designed a new"], 3 | ["line", "most important words that occur in a document or a set"], 4 | ["line", "ments. 
The user would write code similar to the follow-"] 5 | ] -------------------------------------------------------------------------------- /core/src/test/resources/map-reduce.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/map-reduce.pdf -------------------------------------------------------------------------------- /core/src/test/resources/model-bib-crf-test.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/model-bib-crf-test.dat -------------------------------------------------------------------------------- /core/src/test/resources/mono04.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Monolingual Machine Translation for Paraphrase Generation"], 3 | ["line", "Abstract"], 4 | ["line", "AER of 20.88% may appear problematic in a sys-"], 5 | ["line", "tion, we were hesitant to incur the exponential increase"], 6 | ["year", 2004] 7 | ] -------------------------------------------------------------------------------- /core/src/test/resources/mono04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/mono04.pdf -------------------------------------------------------------------------------- /core/src/test/resources/mooney05.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "A Statistical Semantic Parser that Integrates Syntax and Semantics"], 3 | ["line", "Abstract"], 4 | ["year", 2005] 5 | ] -------------------------------------------------------------------------------- /core/src/test/resources/mooney05.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/mooney05.pdf -------------------------------------------------------------------------------- /core/src/test/resources/papers-parsebugs.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "id": "089f6c46de8a4399840548a056d13d38e1f54da2", 3 | "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/089f/6c46de8a4399840548a056d13d38e1f54da2.pdf", 4 | "title": "Scheduling problems in transportation networks of line topology", 5 | "authors": [ 6 | "Kowalski, Dariusz R.", 7 | "Nussbaum, Eyal", 8 | "Segal, Michael", 9 | "Milyeykovski, Vitaly" 10 | ], 11 | "year": 2014, 12 | "venue": "" 13 | }] -------------------------------------------------------------------------------- /core/src/test/resources/pedersen04.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Combating Web Spam with TrustRank"], 3 | ["line", "Abstract"], 4 | ["year", 2004] 5 | ] -------------------------------------------------------------------------------- /core/src/test/resources/pedersen04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/pedersen04.pdf 
-------------------------------------------------------------------------------- /core/src/test/resources/proto06.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Prototype-Driven Learning for Sequence Models"], 3 | ["line", "Abstract"], 4 | ["line", "For our part-of-speech tagging experiments, we used"], 5 | ["year", 2006] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/proto06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/proto06.pdf -------------------------------------------------------------------------------- /core/src/test/resources/roark13.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Discriminative Joint Modeling of Lexical Variation and Acoustic Confusion for Automated Narrative Retelling Assessment"], 3 | ["line", "Abstract"], 4 | ["line", "5K25AG033723-02 and P30 AG024978-05 and"], 5 | ["line", "Table 5: Story element F-score achieved by log-linear models (MaxEnt and CRF) when adding context dependent features (CD)"], 6 | ["year", 2013] 7 | ] -------------------------------------------------------------------------------- /core/src/test/resources/roark13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/roark13.pdf -------------------------------------------------------------------------------- /core/src/test/resources/senellart10.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Probabilistic XML via Markov Chains"], 3 | ["line", "We want to reinterpret probabilistic models on words to"], 4 | ["line", "bility of going into the original component distributed among"], 5 | ["line", "Figure 3: Translations between probabilistic XML representation systems."], 6 | ["line", "ABSTRACT"], 7 | ["year", 2010] 8 | ] -------------------------------------------------------------------------------- /core/src/test/resources/senellart10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/senellart10.pdf -------------------------------------------------------------------------------- /core/src/test/resources/seung08.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Natural Image Denoising with Convolutional Networks"], 3 | ["line", "Abstract"], 4 | ["line", "One approach to image denoising is to transform an image from pixel intensities into another rep-"], 5 | ["year", 2008] 6 | ] -------------------------------------------------------------------------------- /core/src/test/resources/seung08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/seung08.pdf -------------------------------------------------------------------------------- /core/src/test/resources/smith07.extraction.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | ["title", "Smooth Sensitivity and Sampling in Private Data Analysis"], 3 | ["line", "ABSTRACT"], 4 | ["year", 2007] 5 | ] -------------------------------------------------------------------------------- /core/src/test/resources/smith07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/smith07.pdf -------------------------------------------------------------------------------- /core/src/test/resources/smola10.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["title", "An Architecture for Parallel Topic Models"], 3 | ["line", "ABSTRACT"], 4 | ["line", "from PubMed abstracts, which is equivalent to a processing"], 5 | ["line", "Instead, we use it here to design a sampler for inference of"], 6 | ["line", "ods. Likewise, the same architecture could be used to per-"], 7 | ["year", 2010] 8 | ] -------------------------------------------------------------------------------- /core/src/test/resources/smola10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/smola10.pdf -------------------------------------------------------------------------------- /core/src/test/resources/superscripttest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/superscripttest.pdf -------------------------------------------------------------------------------- /core/src/test/resources/testng.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /core/src/test/resources/umasstest.txt: -------------------------------------------------------------------------------- 1 | [30] E. W. Montroll , B. J. West , Fluctuation Phenomena , Elsevier Science Publishers B. V. ,
Amsterdam ,
1979 , Ch . On an enriched collection of stochastic processes , pp . 61--205 .
2 | -------------------------------------------------------------------------------- /core/src/test/resources/zolotov04.extraction.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["line", "Indexing XML Data Stored in a Relational Database"], 3 | ["line", "As XML usage grows for both data-centric and"], 4 | ["line", "In the ORDPATH values shown in Figure 2 (such as"], 5 | ["line", "Abstract"], 6 | ["year", 2004] 7 | ] -------------------------------------------------------------------------------- /core/src/test/resources/zolotov04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/zolotov04.pdf -------------------------------------------------------------------------------- /core/src/test/scala/org/allenai/scienceparse/CoraExtractionSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.common.testkit.UnitSpec 5 | 6 | import scala.collection.JavaConverters._ 7 | import scala.collection.mutable.ArrayBuffer 8 | import scala.io.Source 9 | 10 | import org.scalatest._ 11 | import Matchers._ 12 | 13 | class CoraExtractionSpec extends UnitSpec { 14 | 15 | case class Reference( 16 | source: String, 17 | authors: Seq[String], 18 | title: String, 19 | date: String 20 | ) 21 | 22 | case class TestResult( 23 | reference: Reference, 24 | extracted: Seq[BibRecord], 25 | precision: Float, 26 | recall: Float, 27 | msg: Seq[String] = Seq() 28 | ) 29 | 30 | case class TestResults( 31 | precision: Float, 32 | recall: Float, 33 | results: Seq[TestResult] 34 | ) 35 | 36 | val refs = new ArrayBuffer[Reference]() 37 | val extractor = new ExtractReferences(Parser.getDefaultGazetteer.toString) 38 | 39 | Resource.using( 40 | Source.fromInputStream(getClass.getResourceAsStream("/tagged_references.txt")) 41 | ) { 42 | source => 43 | for ( 44 | ref <- source.getLines 45 | ) { 46 | val authorMatch = "<author>(.*)</author>".r.findFirstMatchIn(ref) 47 | val authors = authorMatch 48 | .toSeq 49 | .flatMap(_.group(1).split(",|and|&")) 50 | .map(_.trim) 51 | 52 | val title = "<title>(.*)</title>".r.findFirstMatchIn(ref).map(_.group(1).trim) 53 | val date = "<date>(.*)</date>".r.findFirstMatchIn(ref).map(_.group(1).trim) 54 | val raw = ref.replaceAll("<[^>]+>", "").replaceAll("</[^>]+>", "").trim 55 | refs.append(Reference(raw, authors, title.getOrElse(""), date.getOrElse(""))) 56 | } 57 | } 58 | 59 | // Successful as long as we got exactly one record.
60 | def segmentationTest(ref: Reference, extracted: Seq[BibRecord]): TestResult = { 61 | TestResult(ref, extracted, 1, 1) 62 | } 63 | 64 | def runTest(name: String, test: (Reference, Seq[BibRecord]) => TestResult): TestResults = { 65 | def testRecord(ref: Reference): TestResult = { 66 | val text = Seq("Bibliography", ref.source).asJava 67 | val records = extractor.findReferences(text).getOne.asScala 68 | if (records.size == 0) { 69 | println(s"Missed extraction: ${ref.source}") 70 | TestResult(ref, records, 0, 0, Seq("Missing")) 71 | } else if (records.size > 1) { 72 | TestResult(ref, records, 0, 0, Seq("Too many extractions")) 73 | } else { 74 | test(ref, records) 75 | } 76 | } 77 | 78 | val results: Seq[TestResult] = refs.map(testRecord _) 79 | 80 | val precision = results.map(_.precision).sum / results.size 81 | val recall = results.map(_.recall).sum / results.size 82 | 83 | println(s"$name precision: $precision recall: $recall") 84 | 85 | TestResults(precision, recall, results) 86 | } 87 | 88 | "cora-ie references" should "be extracted" in { 89 | assert(runTest("segmentation", segmentationTest _).recall >= 0.1) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /core/src/test/scala/org/allenai/scienceparse/JavaTestSuite.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import org.scalatest.testng.TestNGWrapperSuite 4 | 5 | class JavaTestSuite extends TestNGWrapperSuite( 6 | List("src/test/resources/testng.xml") 7 | ) 8 | 9 | -------------------------------------------------------------------------------- /core/src/test/scala/org/allenai/scienceparse/JsonProtocolSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import java.util 4 | import java.util.regex.Pattern 5 | 6 | import org.allenai.common.testkit.UnitSpec 7 | 8 | class JsonProtocolSpec extends UnitSpec { 9 | import spray.json._ 10 | import JsonProtocol._ 11 | 12 | "JsonProtocol" should "round trip basic content" in { 13 | val em = new ExtractedMetadata( 14 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale", 15 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"), 16 | null) 17 | 18 | em.equals(em.toJson.convertTo[ExtractedMetadata]) 19 | } 20 | 21 | it should "round trip empty authors" in { 22 | val em = new ExtractedMetadata( 23 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale", 24 | util.Arrays.asList("", "Sarah Forbes", "Shelagh Ferguson"), 25 | null) 26 | 27 | em.equals(em.toJson.convertTo[ExtractedMetadata]) 28 | } 29 | 30 | it should "round trip complex content" in { 31 | val em = new ExtractedMetadata( 32 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale", 33 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"), 34 | null) 35 | em.year = 2014 36 | em.sections = util.Arrays.asList( 37 | new Section("Introduction", "In this paper, ..."), 38 | new Section(null, "Furthermore, ...") 39 | ) 40 | em.abstractText = "Aaker’s (1997) brand personality (BP) scale is widely used in research and is an important foundation for the theory of BP." 
41 | em.creator = "MS Paint" 42 | em.source = ExtractedMetadata.Source.META 43 | 44 | em.equals(em.toJson.convertTo[ExtractedMetadata]) 45 | } 46 | 47 | it should "round trip empty content" in { 48 | // Empty content 49 | val em = new ExtractedMetadata( 50 | null, 51 | util.Arrays.asList(), 52 | null) 53 | em.sections = util.Arrays.asList( 54 | new Section("", ""), 55 | new Section(null, "") 56 | ) 57 | em.abstractText = "" 58 | em.creator = "" 59 | 60 | em.equals(em.toJson.convertTo[ExtractedMetadata]) 61 | } 62 | 63 | it should "round trip references" in { 64 | val em = new ExtractedMetadata( 65 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale", 66 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"), 67 | null) 68 | 69 | em.references = util.Arrays.asList( 70 | new BibRecord( 71 | "Managing Brand Equity: Capitalizing on the Value of a Brand Name", 72 | util.Arrays.asList("Aaker, D"), 73 | "The Free Press", 74 | null, 75 | null, 76 | 1991 77 | ), 78 | new BibRecord( 79 | "Dimensions of Brand Personality", 80 | util.Arrays.asList("Aaker, D"), 81 | "Journal of Marketing Research", 82 | Pattern.compile("Aaker et al\\."), 83 | Pattern.compile("\\[2\\]"), 84 | 1997 85 | ), 86 | new BibRecord( 87 | null, 88 | util.Arrays.asList(), 89 | null, 90 | null, 91 | null, 92 | 2001 93 | ) 94 | ) 95 | 96 | em.referenceMentions = util.Arrays.asList( 97 | new CitationRecord( 98 | 1, 99 | "As [1] held these truths to be self-evident, ...", 100 | 3, 101 | 6 102 | ) 103 | ) 104 | 105 | em.equals(em.toJson.convertTo[ExtractedMetadata]) 106 | } 107 | 108 | "LabeledData" should "round-trip through the JSON format" in { 109 | val sha = "a7c25298c607d5bf32e3301b6b209431e2a7f830" 110 | def getInputStream = this.getClass.getResourceAsStream(s"/$sha.pdf") 111 | val em = Parser.getInstance().doParse(getInputStream) 112 | val labeledData = LabeledData.fromExtractedMetadata(sha, em) 113 | val jsonString = labeledData.toJson.prettyPrint 114 | 115 | val labeledDataFromJson = jsonString.parseJson.convertTo[LabeledData] 116 | 117 | assertResult(labeledData.title)(labeledDataFromJson.title) 118 | assertResult(labeledData.authors)(labeledDataFromJson.authors) 119 | assertResult(labeledData.abstractText)(labeledDataFromJson.abstractText) 120 | assertResult(labeledData.year)(labeledDataFromJson.year) 121 | assertResult(labeledData.venue)(labeledDataFromJson.venue) 122 | assertResult(labeledData.sections)(labeledDataFromJson.sections) 123 | assertResult(labeledData.references)(labeledDataFromJson.references) 124 | //assertResult(labeledData.mentions)(labeledDataFromJson.mentions) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /core/src/test/scala/org/allenai/scienceparse/MetaEvalSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class MetaEvalSpec extends UnitSpec { 6 | "MetaEval" should "produce good P/R numbers" in { 7 | val parser = new Parser() 8 | val evaluationResult = Evaluation.evaluate(parser) 9 | Evaluation.printResults(evaluationResult) 10 | 11 | val minimumPR = Map( 12 | "abstract ".trim -> ((0.856, 0.856)), 13 | "abstractNormalized ".trim -> ((0.856, 0.856)), 14 | "authorFullName ".trim -> ((0.821, 0.805)), 15 | "authorFullNameNormalized ".trim -> ((0.851, 0.831)), 16 | "authorLastName ".trim -> ((0.871, 0.847)), 17 | "authorLastNameNormalized ".trim -> ((0.889, 0.862)), 
18 | "bibAll ".trim -> ((0.033, 0.031)), 19 | "bibAllButVenuesNormalized".trim -> ((0.619, 0.560)), 20 | "bibAllNormalized ".trim -> ((0.044, 0.041)), 21 | "bibAuthors ".trim -> ((0.726, 0.637)), 22 | "bibAuthorsNormalized ".trim -> ((0.840, 0.743)), 23 | "bibCounts ".trim -> ((1.000, 0.826)), 24 | "bibMentions ".trim -> ((0.232, 0.218)), 25 | "bibMentionsNormalized ".trim -> ((0.273, 0.245)), 26 | "bibTitles ".trim -> ((0.795, 0.709)), 27 | "bibTitlesNormalized ".trim -> ((0.796, 0.710)), 28 | "bibVenues ".trim -> ((0.062, 0.051)), 29 | "bibVenuesNormalized ".trim -> ((0.063, 0.052)), 30 | "bibYears ".trim -> ((0.933, 0.835)), 31 | "title ".trim -> ((0.427, 0.427)), 32 | "titleNormalized ".trim -> ((0.842, 0.842)) 33 | ) 34 | 35 | val tolerance = 0.002 36 | evaluationResult.scienceParse.foreach { case (metric, eval) => 37 | val (minimumP, minimumR) = minimumPR(metric.name) 38 | assert(eval.p > minimumP - tolerance, s"Evaluating precision for ${metric.name}") 39 | assert(eval.r > minimumR - tolerance, s"Evaluating recall for ${metric.name}") 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /core/src/test/scala/org/allenai/scienceparse/StringUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class StringUtilsSpec extends UnitSpec { 6 | "author names" should "get split correctly" in { 7 | val tests = Map( 8 | "Aryabhata" -> Tuple2("", "Aryabhata"), 9 | "Peter Clark" -> Tuple2("Peter", "Clark"), 10 | "Peter Clark" -> Tuple2("Peter", " Clark"), 11 | "Arthur C. Clarke" -> Tuple2("Arthur C.", "Clarke"), 12 | "Ludwig van Beethoven" -> Tuple2("Ludwig", "van Beethoven"), 13 | "Ludwig van Beethoven" -> Tuple2("Ludwig", " van Beethoven"), 14 | " Ludwig van Beethoven" -> Tuple2(" Ludwig", " van Beethoven"), 15 | "Ludwig van Beethoven Jr." -> Tuple2("Ludwig", " van Beethoven Jr."), 16 | "Ludwig van Beethoven Jr. " -> Tuple2("Ludwig", " van Beethoven Jr. "), 17 | "Ayrton Senna da Silva" -> Tuple2("Ayrton Senna", "da Silva"), 18 | "" -> Tuple2("", ""), 19 | " " -> Tuple2("", " ") 20 | ) 21 | 22 | tests.foreach { case (original, expected) => 23 | assertResult(expected)(StringUtils.splitName(original)) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.7 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 2 | 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1") 4 | 5 | addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4") 6 | -------------------------------------------------------------------------------- /server/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /server/README.md: -------------------------------------------------------------------------------- 1 | # Science Parse Server 2 | 3 | This is a wrapper that makes the [SP library](../core/README.md) available as a web service. 
We have a version running at http://scienceparse.allenai.org, so you can try it yourself: http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f 4 | 5 | This will show a large amount of JSON. Most of it is body text. You can get a slightly more compact output by skipping the body text: http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f?skipFields=sections 6 | 7 | Both of these examples parse the paper with the S2 paper id `498bb0efad6ec15dd09d941fb309aa18d6df9f5f`. You can see that paper here: https://pdfs.semanticscholar.org/498b/b0efad6ec15dd09d941fb309aa18d6df9f5f.pdf 8 | 9 | ## Parsing your own PDF 10 | 11 | If you want to upload your own PDF, you can do that with an HTTP POST: 12 | ``` 13 | curl -v -H "Content-type: application/pdf" --data-binary @paper.pdf "http://scienceparse.allenai.org/v1" 14 | ``` 15 | 16 | Note that the content type needs to be `application/pdf`, and the URL must not have a trailing slash. 17 | 18 | ## Running the server yourself 19 | 20 | You can compile the server into a super-jar with `sbt server/assembly`. That will download all dependencies, compile them, and build an executable jar with all dependencies bundled. Then, you can start up the server with `java -Xmx6g -jar jarfile.jar`. On first startup, it will download several gigabytes of model files, and then bind to port 8080 on the machine you run it on. 21 | 22 | The server takes a few command-line arguments. Run it with `java -jar jarfile.jar --help` to see what they are. 23 | 24 | Science Parse takes quite a bit of memory, so we recommend running it with `-Xmx6g`. Some documents might require more than that. Science Parse also uses off-heap memory (i.e., memory that's not specified by `-Xmx`), so we recommend that you have at least 2GB free in addition to the heap memory specified with `-Xmx`. 25 | 26 | ## Feedback mechanism 27 | 28 | The server supports something called the "Feedback mechanism". This is a fairly basic way to gather corrections to the extractions SP makes, so we can improve the models. The mechanism is disabled by default, so you shouldn't have to worry about it most of the time. 29 | 30 | We don't support this mechanism publicly, but if you want to play with it, it should be easy to point it at a Postgres database of your choice and start gathering feedback.
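For example — a minimal sketch, not an officially supported setup — you can point the server at your own database by overriding the settings from `server/src/main/resources/application.conf` (shown further down in this listing) with JVM system properties. The host, port, and database name below are assumptions about your local Postgres instance, `jarfile.jar` is the assembly jar built above, and whichever switch actually enables the feedback mechanism is listed in the server's `--help` output:

```
# assumes a local Postgres database named "scienceparse" on the default port
java -Xmx6g \
  -Dorg.allenai.scienceparse.Server.db.url="jdbc:postgresql://localhost:5432/scienceparse" \
  -Dorg.allenai.scienceparse.Server.db.password="<scienceparse-user-password>" \
  -Dorg.allenai.scienceparse.Server.db-as-root.password="<root-password>" \
  -jar jarfile.jar
```

The `db-as-root` credentials are only used once at startup, to create or migrate the `feedback` table (see `FeedbackStore.scala` at the end of this listing).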
31 | -------------------------------------------------------------------------------- /server/build.sbt: -------------------------------------------------------------------------------- 1 | javaOptions in run += s"-Xmx10G" 2 | 3 | fork := true 4 | 5 | mainClass in assembly := Some("org.allenai.scienceparse.SPServer") 6 | 7 | assemblyMergeStrategy in assembly := { 8 | case "logback.xml" => MergeStrategy.first 9 | case "application.conf" => MergeStrategy.concat 10 | case x => (assemblyMergeStrategy in assembly).value.apply(x) 11 | } 12 | 13 | libraryDependencies ++= Seq( 14 | "org.slf4j" % "jcl-over-slf4j" % "1.7.7", 15 | "org.eclipse.jetty" % "jetty-server" % "9.4.1.v20170120", 16 | "com.typesafe" % "config" % "1.3.1", 17 | "org.scalikejdbc" %% "scalikejdbc" % "2.5.0" exclude ("commons-logging", "commons-logging"), 18 | "org.postgresql" % "postgresql" % "42.0.0" 19 | ) 20 | -------------------------------------------------------------------------------- /server/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | org.allenai.scienceparse.Server { 2 | db = { 3 | url = null 4 | user = "scienceparse" 5 | password = null 6 | 7 | connectionPool = enabled 8 | keepAliveConnection = true 9 | } 10 | 11 | db-as-root = { 12 | url = ${org.allenai.scienceparse.Server.db.url} 13 | user = "root" 14 | password = null 15 | 16 | connectionPool = disabled 17 | keepAliveConnection = false 18 | numThreads = 1 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /server/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${logback_stdoutLevel:-DEBUG} 6 | 7 | 8 | %-5level %logger{36}: %msg%n 9 | 10 | 11 | 12 | 13 | false 14 | EvalErrors.log 15 | 16 | %msg%n 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /server/src/main/scala/org/allenai/scienceparse/FeedbackStore.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.scienceparse 2 | 3 | import com.typesafe.config.{ConfigFactory, Config} 4 | import org.allenai.common.{Resource, Logging} 5 | import org.allenai.common.Config._ 6 | 7 | import scalikejdbc._ 8 | 9 | import java.time.Instant 10 | 11 | object FeedbackStore extends Logging { 12 | { // Set up the DB 13 | Class.forName("org.postgresql.Driver") 14 | 15 | val config = ConfigFactory.load() 16 | val dbConfig: Config = config[Config]("org.allenai.scienceparse.Server.db") 17 | 18 | scalikejdbc.GlobalSettings.loggingSQLAndTime = scalikejdbc.LoggingSQLAndTimeSettings( 19 | enabled = dbConfig.get[Boolean]("logging").getOrElse(false), 20 | logLevel = 'DEBUG, 21 | warningEnabled = true, 22 | warningThresholdMillis = 1000L, 23 | warningLogLevel = 'WARN 24 | ) 25 | 26 | val dbUrl = dbConfig.getString("url") 27 | val dbUser = dbConfig.getString("user") 28 | val dbPassword = dbConfig.get[String]("password").getOrElse( 29 | throw new IllegalArgumentException("Password for DB not set. 
Please set org.allenai.scienceparse.Server.db.password.")) 30 | ConnectionPool.singleton(dbUrl, dbUser, dbPassword) 31 | 32 | // upgrade the schema if necessary 33 | { 34 | val dbConfig: Config = config[Config]("org.allenai.scienceparse.Server.db-as-root") 35 | val dbUrl = dbConfig.getString("url") 36 | logger.info(s"Connecting to $dbUrl") 37 | val dbUser = dbConfig.getString("user") 38 | val dbPassword = dbConfig.get[String]("password").getOrElse( 39 | throw new IllegalArgumentException("Root password for DB not set. Please set org.allenai.scienceparse.Server.db-as-root.password.")) 40 | 41 | val rootConnectionPoolName = "rootConnectionPool" 42 | val cpSettings = new ConnectionPoolSettings(initialSize = 1, maxSize = 2) 43 | ConnectionPool.add(rootConnectionPoolName, dbUrl, dbUser, dbPassword, cpSettings) 44 | Resource.using(ConnectionPool(rootConnectionPoolName)) { implicit cp => 45 | DB.localTx { implicit session => 46 | sql""" 47 | CREATE TABLE IF NOT EXISTS settings ( 48 | key VARCHAR NOT NULL PRIMARY KEY, 49 | value VARCHAR NOT NULL) 50 | """.execute().apply() 51 | 52 | def dbSchemaVersion = 53 | sql"SELECT value::integer FROM settings WHERE key = 'version'".map(_.int("value")).single().apply().getOrElse(0) 54 | val desiredSchemaVersion = 1 55 | val schemaUpdateFunctions = Map( 56 | 0 -> (() => { 57 | sql""" 58 | CREATE TABLE feedback ( 59 | paperId CHAR(40) NOT NULL, 60 | timeAdded TIMESTAMP NOT NULL, 61 | value JSONB NOT NULL, 62 | PRIMARY KEY(paperId, timeAdded)) 63 | """.execute().apply() 64 | 65 | sql""" 66 | INSERT INTO settings (key, value) VALUES ('version', 1) 67 | """.execute().apply() 68 | }) 69 | ) 70 | 71 | var currentSchemaVersion = dbSchemaVersion 72 | while(currentSchemaVersion != desiredSchemaVersion) { 73 | val updateFunction = schemaUpdateFunctions.getOrElse( 74 | currentSchemaVersion, 75 | throw new RuntimeException(s"Could not find upgrade function for version $currentSchemaVersion.")) 76 | updateFunction() 77 | 78 | val newSchemaVersion = dbSchemaVersion 79 | if(newSchemaVersion == currentSchemaVersion) 80 | throw new RuntimeException(s"Upgrade function for version $currentSchemaVersion did not change the version.") 81 | currentSchemaVersion = newSchemaVersion 82 | } 83 | } 84 | } 85 | } 86 | } 87 | 88 | def addFeedback(paperId: String, data: LabeledData): Unit = { 89 | import spray.json._ 90 | import JsonProtocol._ 91 | 92 | val jsonString = data.toJson.compactPrint 93 | DB.localTx { implicit t => 94 | sql""" 95 | INSERT INTO feedback (paperId, timeAdded, value) VALUES 96 | ($paperId, current_timestamp, $jsonString::jsonb) 97 | """.update().apply() 98 | } 99 | } 100 | 101 | private val paperSource = PaperSource.getDefault 102 | 103 | def getFeedback(paperId: String): Option[LabeledData] = { 104 | import spray.json._ 105 | import JsonProtocol._ 106 | 107 | DB.readOnly { implicit t => 108 | sql""" 109 | SELECT value FROM feedback WHERE paperId=$paperId ORDER BY timeAdded DESC LIMIT 1 110 | """.map { result => 111 | val jsonString = result.string("value") 112 | jsonString.parseJson.convertTo[LabeledData] 113 | }.first().apply() 114 | } 115 | } 116 | 117 | /** 118 | * @param onOrAfter If given, constrains returned feedback to those added on or after this timestamp. 119 | * @param before If given, constrains returned feedback to those added before this timestamp. 
120 | * @return (paper id, time added, labeled data) 121 | */ 122 | def getAllFeedback( 123 | onOrAfter: Option[Instant] = None, 124 | before: Option[Instant] = None 125 | ): Traversable[(String, String, LabeledData)] = { 126 | import spray.json._ 127 | import JsonProtocol._ 128 | 129 | val onOrAfterClause = onOrAfter.map(ts => sqls" AND a.timeadded >= $ts").getOrElse(sqls"") 130 | val beforeClause = before.map(ts => sqls" AND a.timeadded < $ts").getOrElse(sqls"") 131 | 132 | DB.readOnly { implicit t => 133 | sql""" 134 | SELECT a.paperId, a.timeAdded, a.value FROM feedback AS a JOIN ( 135 | SELECT paperId, MAX(timeAdded) AS timeAdded FROM feedback GROUP BY paperId 136 | ) AS b ON a.paperId = b.paperId AND a.timeAdded = b.timeAdded 137 | $onOrAfterClause $beforeClause 138 | """.map { result => 139 | val paperId = result.string("paperId") 140 | val timeAdded = result.timestamp("timeAdded").toInstant 141 | val jsonString = result.string("value") 142 | (paperId, timeAdded.toString, jsonString.parseJson.convertTo[LabeledData]) 143 | }.traversable.apply() 144 | } 145 | } 146 | } 147 | --------------------------------------------------------------------------------
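As a final note on the feedback store: the migration above creates a single `feedback` table keyed by `(paperId, timeAdded)`, with the corrected `LabeledData` serialized as JSONB in `value`. A quick way to eyeball what has been collected — a sketch only, assuming the database is named `scienceparse` as in the earlier example and that the serialized `LabeledData` JSON carries a top-level `title` field — is to query it directly:

```
psql scienceparse -c "
  SELECT paperId, timeAdded, value->>'title' AS title
  FROM feedback
  ORDER BY timeAdded DESC
  LIMIT 10;"
```

`getAllFeedback` above applies the same idea in Scala, joining against `MAX(timeAdded)` per paper so that only the latest correction for each paper is returned.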