├── .gitignore ├── LICENSE ├── README.md ├── boilerpipe ├── boilerpipe-common │ ├── LICENSE │ ├── README.md │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── kohlschutter │ │ └── boilerpipe │ │ ├── BoilerpipeDocumentSource.java │ │ ├── BoilerpipeExtractor.java │ │ ├── BoilerpipeFilter.java │ │ ├── BoilerpipeInput.java │ │ ├── BoilerpipeProcessingException.java │ │ ├── conditions │ │ └── TextBlockCondition.java │ │ ├── document │ │ ├── BPAnnotation.java │ │ ├── HeaderAnnotation.java │ │ ├── Image.java │ │ ├── Link.java │ │ ├── ParagraphAnnotation.java │ │ ├── TextBlock.java │ │ ├── TextDocument.java │ │ ├── TextDocumentStatistics.java │ │ ├── TextFormatAnnotation.java │ │ └── package-info.java │ │ ├── estimators │ │ └── SimpleEstimator.java │ │ ├── extractors │ │ ├── ArticleExtractor.java │ │ ├── ArticleSentencesExtractor.java │ │ ├── CanolaExtractor.java │ │ ├── CommonExtractors.java │ │ ├── DefaultExtractor.java │ │ ├── ExtractorBase.java │ │ ├── KeepEverythingExtractor.java │ │ ├── KeepEverythingWithMinKWordsExtractor.java │ │ ├── LargestContentExtractor.java │ │ ├── NumWordsRulesExtractor.java │ │ └── package-info.java │ │ ├── filters │ │ ├── debug │ │ │ └── PrintDebugFilter.java │ │ ├── english │ │ │ ├── DensityRulesClassifier.java │ │ │ ├── HeuristicFilterBase.java │ │ │ ├── IgnoreBlocksAfterContentFilter.java │ │ │ ├── IgnoreBlocksAfterContentFromEndFilter.java │ │ │ ├── KeepLargestFulltextBlockFilter.java │ │ │ ├── MinFulltextWordsFilter.java │ │ │ ├── NumWordsRulesClassifier.java │ │ │ ├── TerminatingBlocksFinder.java │ │ │ └── package-info.java │ │ ├── heuristics │ │ │ ├── AddPrecedingLabelsFilter.java │ │ │ ├── ArticleMetadataFilter.java │ │ │ ├── BlockProximityFusion.java │ │ │ ├── ContentFusion.java │ │ │ ├── DocumentTitleMatchClassifier.java │ │ │ ├── ExpandTitleToContentFilter.java │ │ │ ├── KeepLargestBlockFilter.java │ │ │ ├── LabelFusion.java │ │ │ ├── LargeBlockSameTagLevelToContentFilter.java │ │ │ ├── ListAtEndFilter.java │ │ 
│ ├── SimpleBlockFusionProcessor.java │ │ │ ├── TrailingHeadlineToBoilerplateFilter.java │ │ │ └── package-info.java │ │ └── simple │ │ │ ├── BoilerplateBlockFilter.java │ │ │ ├── InvertedFilter.java │ │ │ ├── LabelToBoilerplateFilter.java │ │ │ ├── LabelToContentFilter.java │ │ │ ├── MarkEverythingBoilerplateFilter.java │ │ │ ├── MarkEverythingContentFilter.java │ │ │ ├── MinClauseWordsFilter.java │ │ │ ├── MinWordsFilter.java │ │ │ ├── SplitParagraphBlocksFilter.java │ │ │ ├── SurroundingToContentFilter.java │ │ │ └── package-info.java │ │ ├── labels │ │ ├── ConditionalLabelAction.java │ │ ├── DefaultLabels.java │ │ └── LabelAction.java │ │ ├── package-info.java │ │ ├── sax │ │ ├── BoilerpipeHTMLContentHandler.java │ │ ├── BoilerpipeHTMLParser.java │ │ ├── BoilerpipeSAXInput.java │ │ ├── CommonTagActions.java │ │ ├── DefaultTagActionMap.java │ │ ├── HTMLDocument.java │ │ ├── HTMLFetcher.java │ │ ├── HTMLHighlighter.java │ │ ├── ImageExtractor.java │ │ ├── InputSourceable.java │ │ ├── MarkupTagAction.java │ │ ├── TagAction.java │ │ ├── TagActionMap.java │ │ └── package-info.java │ │ └── util │ │ ├── UnicodeTokenizer.java │ │ └── package-info.java └── nekohtml │ ├── dependency-reduced-pom.xml │ ├── pom.xml │ └── src │ └── main │ └── java │ └── org │ └── cyberneko │ └── html │ ├── HTMLElements.java │ └── HTMLTagBalancer.java ├── com.ibm.research.ai.ki.corpus ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── corpora │ │ └── crawl │ │ ├── CharsetDetect.java │ │ ├── CommonCrawlConfig.java │ │ ├── HtmlToDocument.java │ │ ├── LanguageScorer.java │ │ ├── SaveCommonCrawl.java │ │ ├── SaveCommonCrawlBase.java │ │ └── SaveCommonCrawlHdfs.java │ └── resources │ ├── cc-dbp │ └── cc-dbp.properties │ ├── log4j.properties │ └── simplelogger.properties ├── com.ibm.research.ai.ki.kb ├── README.md ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── kb │ │ ├── 
BuildGazetteer.java │ │ ├── BuildGroundTruth.java │ │ ├── ConfigureMinMaxEntityFreq.java │ │ ├── FindUnary.java │ │ ├── GroundTruthConfig.java │ │ ├── KBConfig.java │ │ ├── KBFiles.java │ │ ├── NodePopularity.java │ │ ├── RelationTaxonomy.java │ │ ├── SelectTypes.java │ │ ├── TypePairFilter.java │ │ ├── conversion │ │ ├── ConvertDBpedia.java │ │ ├── DBpediaKBConfig.java │ │ ├── MergeNodesDBpedia.java │ │ ├── SelectRelations.java │ │ └── SummaryCharts.java │ │ └── explore │ │ ├── CheckLabelCollisions.java │ │ └── FilterByCorpusCount.java │ └── resources │ ├── dbpediaConfig.properties │ └── relationSample.txt ├── com.ibm.research.ai.ki.kbp ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── kbp │ │ ├── CoveredTextEntityId.java │ │ ├── CreateTsvDataset.java │ │ ├── CreateTsvDatasetTokenWindow.java │ │ ├── DocumentFeatureString.java │ │ ├── DocumentPreprocessing.java │ │ ├── FilterEntsByGroundTruth.java │ │ ├── GazetteerEDL.java │ │ ├── GroundTruth.java │ │ ├── GroupRelexMentionTsvDataset.java │ │ ├── IEntityPairFilter.java │ │ ├── IGroundTruth.java │ │ ├── IPostprocessEntityRecognition.java │ │ ├── IRelexDatasetManager.java │ │ ├── IRelexMention.java │ │ ├── IRelexTensors.java │ │ ├── IRelexTsv.java │ │ ├── KBPBuildDataset.java │ │ ├── NounPhraseEntityWithId.java │ │ ├── RelexConfig.java │ │ ├── RelexDatasetFiles.java │ │ ├── RelexDatasetManagerBinary.java │ │ ├── RelexMention.java │ │ ├── RelexMentionReader.java │ │ ├── RelexStats.java │ │ ├── RelexTensors.java │ │ ├── RelexVocab.java │ │ ├── ShowExamples.java │ │ ├── Tokenizer.java │ │ ├── TypePairEntityPairFilter.java │ │ ├── baselines │ │ └── NREConvert.java │ │ ├── embeddings │ │ ├── EmbeddingFormat.java │ │ └── Word2VecConverter.java │ │ └── unary │ │ ├── DownsampleEntityFilter.java │ │ ├── IEntityFilter.java │ │ ├── RelexDatasetManagerUnary.java │ │ ├── UnaryGroundTruth.java │ │ ├── UnaryRelexMention.java │ │ ├── UnaryRelexTensors.java │ │ └── 
UnaryRelexTsvDataset.java │ └── resources │ └── relexConfigNonSpark.properties ├── com.ibm.research.ai.ki.nlp ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── ibm │ │ │ ├── reseach │ │ │ └── ai │ │ │ │ └── ki │ │ │ │ └── nlp │ │ │ │ ├── AnnoRef.java │ │ │ │ ├── Annotation.java │ │ │ │ ├── Annotator.java │ │ │ │ ├── Document.java │ │ │ │ ├── DocumentJSONDeserializer.java │ │ │ │ ├── DocumentJSONSerializer.java │ │ │ │ ├── DocumentReader.java │ │ │ │ ├── DocumentSerialize.java │ │ │ │ ├── DocumentStructure.java │ │ │ │ ├── DocumentWriter.java │ │ │ │ ├── DocumentWriter2.java │ │ │ │ ├── OffsetCorrection.java │ │ │ │ ├── Pipeline.java │ │ │ │ ├── PipelinedDocuments.java │ │ │ │ ├── ResettingAnnotator.java │ │ │ │ ├── TransformBase.java │ │ │ │ ├── TransformRegex.java │ │ │ │ ├── TransformString.java │ │ │ │ ├── conversion │ │ │ │ └── NIFSerialization.java │ │ │ │ └── types │ │ │ │ ├── Author.java │ │ │ │ ├── Categories.java │ │ │ │ ├── Chunk.java │ │ │ │ ├── CorefIndex.java │ │ │ │ ├── DocDate.java │ │ │ │ ├── DocRelations.java │ │ │ │ ├── DocumentContentType.java │ │ │ │ ├── DocumentSource.java │ │ │ │ ├── Entity.java │ │ │ │ ├── EntityWithId.java │ │ │ │ ├── Event.java │ │ │ │ ├── LinkAnnotation.java │ │ │ │ ├── LinkedEntity.java │ │ │ │ ├── ListAnnotation.java │ │ │ │ ├── ListItem.java │ │ │ │ ├── Paragraph.java │ │ │ │ ├── Relation.java │ │ │ │ ├── Section.java │ │ │ │ ├── SectionHeader.java │ │ │ │ ├── Sentence.java │ │ │ │ ├── TextFormatting.java │ │ │ │ ├── Title.java │ │ │ │ ├── Token.java │ │ │ │ └── XmlTag.java │ │ │ └── research │ │ │ └── ai │ │ │ └── ki │ │ │ └── nlp │ │ │ └── parse │ │ │ ├── ClearNLPNER.java │ │ │ ├── ClearNLPPOS.java │ │ │ ├── ClearNLPParse.java │ │ │ ├── ClearNLPSentence.java │ │ │ ├── ClearNLPTokenize.java │ │ │ ├── ClearNLPTransform.java │ │ │ ├── DigitSequenceTokenize.java │ │ │ ├── EntityToOccurrences.java │ │ │ ├── GazetteerMatcher.java │ │ │ ├── NormalizeTextTransform.java │ │ │ ├── OpenNLPChunk.java │ │ │ ├── 
OpenNLPNER.java │ │ │ ├── OpenNLPPOS.java │ │ │ ├── OpenNLPSentence.java │ │ │ ├── OpenNLPTokenize.java │ │ │ ├── RegexParagraph.java │ │ │ ├── RegexTokenize.java │ │ │ └── TokensSnapToEntities.java │ └── resources │ │ ├── com │ │ └── ibm │ │ │ └── research │ │ │ └── ai │ │ │ └── ki │ │ │ └── nlp │ │ │ └── parse │ │ │ ├── clearNLP-replace.tsv │ │ │ └── normalizeText-replace.tsv │ │ ├── downloadOpenNLPModels.sh │ │ ├── en-sent.bin │ │ └── log4j.properties │ └── test │ └── java │ └── com │ └── ibm │ └── research │ └── ai │ └── ki │ └── nlp │ ├── OverlappingSpansTest.java │ ├── TestJSON.java │ ├── TransformStringTest.java │ └── parse │ ├── TestClearNLP.java │ ├── TestGazetteerMatcher.java │ └── TestNER.java ├── com.ibm.research.ai.ki.spark ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── spark │ │ ├── Base64ToBinary.java │ │ ├── CorpusStatistics.java │ │ ├── CreateW2VFile.java │ │ ├── DocEntityStats.java │ │ ├── GatherRelexStats.java │ │ ├── GatherRelexVocab.java │ │ ├── GazetteerPreprocess.java │ │ ├── NonSparkGatherVocab.java │ │ ├── RelexBuildDataset.java │ │ ├── RelexTensorDataset.java │ │ ├── RelexTsvDataset.java │ │ ├── RunPipelineSpark.java │ │ └── SimpleSparkJob.java │ ├── resources │ └── relexConfig.properties │ └── scripts │ ├── java-viacloud │ └── java-viaspark ├── com.ibm.research.ai.ki.util ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── ibm │ │ │ └── research │ │ │ └── ai │ │ │ └── ki │ │ │ ├── formats │ │ │ ├── ArchiveEntryIterable.java │ │ │ ├── NTriples.java │ │ │ └── SimpleTsvIterable.java │ │ │ └── util │ │ │ ├── BlockShuffler.java │ │ │ ├── CollectionUtil.java │ │ │ ├── CombinedSpans.java │ │ │ ├── DenseVectors.java │ │ │ ├── Distribution.java │ │ │ ├── FileUtil.java │ │ │ ├── FirstPairComparator.java │ │ │ ├── HashMapUtil.java │ │ │ ├── Lang.java │ │ │ ├── LogLinear.java │ │ │ ├── MutableDouble.java │ │ │ ├── MutableInteger.java │ │ │ ├── NBest.java │ │ │ ├── 
NestedIterable.java │ │ │ ├── NextOnlyIterator.java │ │ │ ├── NonOverlappingSpans.java │ │ │ ├── OverlappingSpans.java │ │ │ ├── Pair.java │ │ │ ├── PeriodicChecker.java │ │ │ ├── PropertyLoader.java │ │ │ ├── PropertyStruct.java │ │ │ ├── RandomUtil.java │ │ │ ├── SecondPairComparator.java │ │ │ ├── Span.java │ │ │ ├── SparseVectors.java │ │ │ ├── ThreadedLoopIterator.java │ │ │ ├── Warnings.java │ │ │ ├── eval │ │ │ ├── BootstrappingConfidenceInterval.java │ │ │ ├── MultiPrecisionRecall.java │ │ │ ├── PrecisionRecall.java │ │ │ └── SamplingPermutationTest.java │ │ │ ├── graphs │ │ │ ├── GraphAlgorithms.java │ │ │ ├── SnowballSampler.java │ │ │ └── TreeAlgorithms.java │ │ │ ├── io │ │ │ ├── DataIO.java │ │ │ ├── MultiFileWriter.java │ │ │ ├── OldVersionOf.java │ │ │ ├── RefactoringObjectInputStream.java │ │ │ ├── TensorFileReader.java │ │ │ └── TensorFileWriter.java │ │ │ └── parallel │ │ │ ├── BlockingThreadedExecutor.java │ │ │ ├── ISimpleExecutor.java │ │ │ ├── PollingThreadedExecutor.java │ │ │ ├── SingleThreadedExecutor.java │ │ │ └── StreamEater.java │ └── resources │ │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── util │ │ └── serializedMappings.properties │ └── test │ ├── java │ └── com │ │ └── ibm │ │ └── research │ │ └── ai │ │ └── ki │ │ └── util │ │ ├── BjUtilTestCounter.java │ │ ├── ExecuteJavaProc.java │ │ ├── FileIteratorTest.java │ │ ├── FileUtilTest.java │ │ ├── HashMapUtilTest.java │ │ ├── LangTest.java │ │ ├── NBestTest.java │ │ ├── NonOverlappingTest.java │ │ ├── OverlappingSpansTest.java │ │ ├── PrecisionRecallTest.java │ │ ├── PropertyLoaderTest.java │ │ ├── RandomUtilTest.java │ │ ├── SpanTest.java │ │ ├── SparseVectorsTest.java │ │ └── TestTreeAlgorithms.java │ └── resources │ └── com │ └── ibm │ └── research │ └── ai │ └── ki │ └── util │ └── 1.properties ├── config.properties ├── configSmall-de.properties ├── configSmall.properties ├── create.sh ├── createSmall-de.sh ├── createSmall.sh ├── pom.xml ├── 
unaryConfig.properties └── unaryCreate.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | **/.classpath 5 | **/.project 6 | **/.settings 7 | target/** 8 | */target/** 9 | */target/* 10 | .metadata 11 | clientdb.xml 12 | release.properties 13 | pom.xml.releaseBackup 14 | *~ 15 | 16 | 17 | # User specified git ignore directories (works recursively). 18 | *.DS_Store 19 | .metadata 20 | .recommenders 21 | 22 | .idea/ 23 | **/.idea/ 24 | 25 | 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | 31 | # C extensions 32 | *.so 33 | 34 | # Distribution / packaging 35 | .Python 36 | env/ 37 | build/ 38 | develop-eggs/ 39 | dist/ 40 | downloads/ 41 | eggs/ 42 | .eggs/ 43 | lib/ 44 | lib64/ 45 | parts/ 46 | sdist/ 47 | var/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *,cover 71 | .hypothesis/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # IPython Notebook 95 | .ipynb_checkpoints 96 | 97 | # pyenv 98 | .python-version 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # dotenv 104 | .env 105 | 106 | # virtualenv 107 | venv/ 108 | ENV/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/LICENSE: -------------------------------------------------------------------------------- 1 | boilerpipe 2 | 3 | Copyright (c) 2009, 2014 Christian Kohlschütter 4 | 5 | The author licenses this file to You under the Apache License, Version 2.0 6 | (the "License"); you may not use this file except in compliance with 7 | the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/README.md: -------------------------------------------------------------------------------- 1 | Fork of boilerpipe from https://github.com/kohlschutter/boilerpipe. 2 | 3 | This version produces offset annotations for links in the extracted TextBlocks. 4 | It also places a double newline between disconnected text blocks to help in paragraph and sentence segmentation. 5 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | 7 | com.ibm.research.ai.ki 8 | cc-dbp-parent-pom 9 | 1.0.0-SNAPSHOT 10 | ../.. 11 | 12 | 13 | boilerpipe-common 14 | 1.0.0-SNAPSHOT 15 | 16 | 17 | 18 | com.ibm.research.ai.ki 19 | nekohtml 20 | 1.9.13-SNAPSHOT 21 | 22 | 23 | 24 | xerces 25 | xercesImpl 26 | 2.12.0 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeDocumentSource.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package com.kohlschutter.boilerpipe; 19 | 20 | import com.kohlschutter.boilerpipe.document.TextDocument; 21 | 22 | /** 23 | * Something that can be represented as a {@link TextDocument}; {@link #toTextDocument()} performs the conversion. 24 | */ 25 | public interface BoilerpipeDocumentSource { 26 | TextDocument toTextDocument() throws BoilerpipeProcessingException; 27 | } 28 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe; 19 | 20 | import com.kohlschutter.boilerpipe.document.TextDocument; 21 | 22 | /** 23 | * A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and processes it somehow. 24 | */ 25 | public interface BoilerpipeFilter { 26 | /** 27 | * Processes the given document {@code doc}. 28 | * 29 | * @param doc The {@link TextDocument} that is to be processed. 30 | * @return true if changes have been made to the {@link TextDocument}.
31 | * @throws BoilerpipeProcessingException to signal a failure in the processing pipeline. 32 | */ 33 | boolean process(final TextDocument doc) throws BoilerpipeProcessingException; 34 | } 35 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeInput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe; 19 | 20 | import com.kohlschutter.boilerpipe.document.TextDocument; 21 | 22 | /** 23 | * A source that returns {@link TextDocument}s. 24 | */ 25 | public interface BoilerpipeInput { 26 | /** 27 | * Returns a {@link TextDocument}; how it is obtained is left to the implementation. 28 | * 29 | * @return A {@link TextDocument}.
30 | * @throws BoilerpipeProcessingException to signal a failure in the processing pipeline. 31 | */ 32 | TextDocument getTextDocument() throws BoilerpipeProcessingException; 33 | } 34 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeProcessingException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe; 19 | 20 | /** 21 | * Exception for signaling failure in the processing pipeline.
22 | */ 23 | public class BoilerpipeProcessingException extends Exception { 24 | private static final long serialVersionUID = 1L; 25 | /** Creates an exception without a detail message or cause. */ 26 | public BoilerpipeProcessingException() { 27 | super(); 28 | } 29 | /** Creates an exception with the given detail message and cause. */ 30 | public BoilerpipeProcessingException(String message, Throwable cause) { 31 | super(message, cause); 32 | } 33 | /** Creates an exception with the given detail message. */ 34 | public BoilerpipeProcessingException(String message) { 35 | super(message); 36 | } 37 | /** Creates an exception that wraps the given cause. */ 38 | public BoilerpipeProcessingException(Throwable cause) { 39 | super(cause); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/conditions/TextBlockCondition.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.conditions; 19 | 20 | import com.kohlschutter.boilerpipe.document.TextBlock; 21 | import com.kohlschutter.boilerpipe.labels.ConditionalLabelAction; 22 | 23 | /** 24 | * Evaluates whether a given {@link TextBlock} meets a certain condition. 25 | * 26 | * Useful in combination with {@link ConditionalLabelAction}.
27 | */ 28 | public interface TextBlockCondition { 29 | /** 30 | * Returns true iff the given {@link TextBlock} tb meets the defined condition. 31 | * 32 | * @param tb 33 | * @return iff the condition is met. 34 | */ 35 | boolean meetsCondition(final TextBlock tb); 36 | } 37 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/BPAnnotation.java: -------------------------------------------------------------------------------- 1 | package com.kohlschutter.boilerpipe.document; 2 | 3 | /** 4 | * Used to represent structured elements of the html page that will be retained as offset annotations on the document. 5 | * @author mrglass 6 | * 7 | */ 8 | public abstract class BPAnnotation implements Cloneable { 9 | public static final boolean debug = false; 10 | 11 | //CONSIDER: tag type? like 'a' or 'h1' or 'b' 12 | public int start; 13 | public int end; 14 | 15 | public final String localName; 16 | 17 | protected BPAnnotation(String localName) { 18 | this.start = 10000000; 19 | this.end = -10000000; 20 | this.localName = localName.toLowerCase(); 21 | } 22 | 23 | public boolean isValid() { 24 | return end > start; 25 | } 26 | 27 | public void addOffset(int offset) { 28 | this.start += offset; 29 | this.end += offset; 30 | } 31 | 32 | public BPAnnotation clone() { 33 | try { 34 | return (BPAnnotation)super.clone(); 35 | } catch (CloneNotSupportedException e) { 36 | throw new Error(e); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/HeaderAnnotation.java: -------------------------------------------------------------------------------- 1 | package com.kohlschutter.boilerpipe.document; 2 | 3 | /** 4 | * HTML h* header annotation. 
5 | * @author mrglass 6 | * 7 | */ 8 | public class HeaderAnnotation extends BPAnnotation { 9 | public HeaderAnnotation(String localName) { 10 | super(localName); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/Link.java: -------------------------------------------------------------------------------- 1 | package com.kohlschutter.boilerpipe.document; 2 | 3 | /** 4 | * HTML anchor tag as offset annotation; the tag name is fixed to "a" and {@link #href} holds the value passed to the constructor. 5 | * @author mrglass 6 | * 7 | */ 8 | public class Link extends BPAnnotation { 9 | public String href; 10 | 11 | public Link(String href) { 12 | super("a"); 13 | this.href = href; 14 | } 15 | /** @return true only when the span is non-empty (start before end) and href is not null. */ 16 | public boolean isValid() { 17 | return start < end && href != null; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/ParagraphAnnotation.java: -------------------------------------------------------------------------------- 1 | package com.kohlschutter.boilerpipe.document; 2 | 3 | /** 4 | * HTML paragraph tag as offset annotation; the tag name is fixed to "p". 5 | * @author mrglass 6 | * 7 | */ 8 | public class ParagraphAnnotation extends BPAnnotation { 9 | public ParagraphAnnotation() { 10 | super("p"); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextDocumentStatistics.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.document; 19 | 20 | /** 21 | * Provides shallow statistics on a given {@link TextDocument} 22 | */ 23 | public final class TextDocumentStatistics { 24 | private int numWords = 0; 25 | private int numBlocks = 0; 26 | 27 | /** 28 | * Computes statistics on a given {@link TextDocument}. 29 | * 30 | * @param doc The {@link TextDocument}. 31 | * @param contentOnly if true then only blocks marked as content are counted; otherwise all blocks are counted. 32 | */ 33 | public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { 34 | for (TextBlock tb : doc.getTextBlocks()) { 35 | if (contentOnly && !tb.isContent()) { 36 | continue; 37 | } 38 | 39 | numWords += tb.getNumWords(); 40 | numBlocks++; 41 | } 42 | } 43 | 44 | /** 45 | * Returns the average number of words at block-level (= overall number of words divided by the 46 | * number of blocks). 47 | * 48 | * @return Average, or 0 when no block was counted (avoids the NaN that 0/0 would produce). 49 | */ 50 | public float avgNumWords() { 51 | return numBlocks == 0 ? 0f : numWords / (float) numBlocks; 52 | } 53 | 54 | /** 55 | * Returns the overall number of words in all blocks.
56 | * 57 | * @return Sum of the word counts of all counted blocks. 58 | */ 59 | public int getNumWords() { 60 | return numWords; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextFormatAnnotation.java: -------------------------------------------------------------------------------- 1 | package com.kohlschutter.boilerpipe.document; 2 | /** Offset annotation whose tag name is supplied by the caller; presumably used for inline formatting tags - confirm at the call sites. */ 3 | public class TextFormatAnnotation extends BPAnnotation { 4 | public TextFormatAnnotation(String localName) { 5 | super(localName); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * The Boilerpipe document model. 3 | */ 4 | package com.kohlschutter.boilerpipe.document; 5 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/estimators/SimpleEstimator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package com.kohlschutter.boilerpipe.estimators; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor; 21 | import com.kohlschutter.boilerpipe.document.TextDocumentStatistics; 22 | import com.kohlschutter.boilerpipe.extractors.ArticleExtractor; 23 | import com.kohlschutter.boilerpipe.extractors.DefaultExtractor; 24 | 25 | /** 26 | * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document. 27 | */ 28 | public final class SimpleEstimator { 29 | 30 | /** 31 | * The singleton instance of {@link SimpleEstimator}. 32 | */ 33 | public static final SimpleEstimator INSTANCE = new SimpleEstimator(); 34 | 35 | private SimpleEstimator() { 36 | } 37 | 38 | /** 39 | * Given the statistics of the document before and after applying the {@link BoilerpipeExtractor}, 40 | * can we regard the extraction quality (too) low? 41 | * 42 | * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others. 43 | * 44 | * @param dsBefore statistics of the document before applying the extractor 45 | * @param dsAfter statistics of the document after applying the extractor 46 | * @return true if low quality is to be expected: fewer than 90 words before, fewer than 70 words after, or fewer than 25 words per block on average after extraction. 47 | */ 48 | public boolean isLowQuality(final TextDocumentStatistics dsBefore, 49 | final TextDocumentStatistics dsAfter) { 50 | if (dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70) { 51 | return true; 52 | } 53 | 54 | if (dsAfter.avgNumWords() < 25) { 55 | return true; 56 | } 57 | 58 | return false; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/ArticleSentencesExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.simple.MinClauseWordsFilter; 23 | import com.kohlschutter.boilerpipe.filters.simple.SplitParagraphBlocksFilter; 24 | 25 | /** 26 | * A full-text extractor which is tuned towards extracting sentences from news articles. 27 | */ 28 | public final class ArticleSentencesExtractor extends ExtractorBase { 29 | public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor(); 30 | 31 | /** 32 | * Returns the singleton instance for {@link ArticleSentencesExtractor}. 
33 | */ 34 | public static ArticleSentencesExtractor getInstance() { 35 | return INSTANCE; 36 | } 37 | 38 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 39 | return 40 | 41 | ArticleExtractor.INSTANCE.process(doc) | SplitParagraphBlocksFilter.INSTANCE.process(doc) 42 | | MinClauseWordsFilter.INSTANCE.process(doc); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/CommonExtractors.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor; 21 | 22 | /** 23 | * Provides quick access to common {@link BoilerpipeExtractor}s. 24 | */ 25 | public final class CommonExtractors { 26 | private CommonExtractors() { 27 | } 28 | 29 | /** 30 | * Works very well for most types of Article-like HTML. 31 | */ 32 | public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE; 33 | 34 | /** 35 | * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. 
36 | */ 37 | public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE; 38 | 39 | /** 40 | * Like {@link DefaultExtractor}, but keeps the largest text block only. 41 | */ 42 | public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR = 43 | LargestContentExtractor.INSTANCE; 44 | 45 | /** 46 | * Trained on krdwrd Canola (different definition of "boilerplate"). You may give it a try. 47 | */ 48 | public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE; 49 | 50 | /** 51 | * Dummy Extractor; should return the input text. Use this to double-check that your problem is 52 | * within a particular {@link BoilerpipeExtractor}, or somewhere else. 53 | */ 54 | public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR = 55 | KeepEverythingExtractor.INSTANCE; 56 | } 57 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/DefaultExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.english.DensityRulesClassifier; 23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion; 24 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; 25 | 26 | /** 27 | * A quite generic full-text extractor. 28 | */ 29 | public class DefaultExtractor extends ExtractorBase { 30 | public static final DefaultExtractor INSTANCE = new DefaultExtractor(); 31 | 32 | /** 33 | * Returns the singleton instance for {@link DefaultExtractor}. 34 | */ 35 | public static DefaultExtractor getInstance() { 36 | return INSTANCE; 37 | } 38 | 39 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 40 | 41 | return 42 | 43 | SimpleBlockFusionProcessor.INSTANCE.process(doc) 44 | | BlockProximityFusion.MAX_DISTANCE_1.process(doc) 45 | | DensityRulesClassifier.INSTANCE.process(doc); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter; 23 | 24 | /** 25 | * Marks everything as content. 26 | */ 27 | public final class KeepEverythingExtractor extends ExtractorBase { 28 | 29 | public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor(); 30 | 31 | private KeepEverythingExtractor() { 32 | 33 | } 34 | 35 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 36 | return MarkEverythingContentFilter.INSTANCE.process(doc); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; 23 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter; 24 | import com.kohlschutter.boilerpipe.filters.simple.MinWordsFilter; 25 | 26 | /** 27 | * A full-text extractor which extracts the largest text component of a page. For news articles, it 28 | * may perform better than the {@link DefaultExtractor}, but usually worse than 29 | * {@link ArticleExtractor}. 30 | */ 31 | public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase { 32 | 33 | private final MinWordsFilter filter; 34 | 35 | public KeepEverythingWithMinKWordsExtractor(final int kMin) { 36 | this.filter = new MinWordsFilter(kMin); 37 | } 38 | 39 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 40 | return SimpleBlockFusionProcessor.INSTANCE.process(doc) 41 | | MarkEverythingContentFilter.INSTANCE.process(doc) | filter.process(doc); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/LargestContentExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier; 23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion; 24 | import com.kohlschutter.boilerpipe.filters.heuristics.KeepLargestBlockFilter; 25 | 26 | /** 27 | * A full-text extractor which extracts the largest text component of a page. For news articles, it 28 | * may perform better than the {@link DefaultExtractor}, but usually worse than 29 | * {@link ArticleExtractor}. 30 | */ 31 | public final class LargestContentExtractor extends ExtractorBase { 32 | public static final LargestContentExtractor INSTANCE = new LargestContentExtractor(); 33 | 34 | private LargestContentExtractor() { 35 | } 36 | 37 | /** 38 | * Returns the singleton instance for {@link LargestContentExtractor}. 
39 | */ 40 | public static LargestContentExtractor getInstance() { 41 | return INSTANCE; 42 | } 43 | 44 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 45 | return NumWordsRulesClassifier.INSTANCE.process(doc) 46 | | BlockProximityFusion.MAX_DISTANCE_1.process(doc) 47 | | KeepLargestBlockFilter.INSTANCE.process(doc); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/NumWordsRulesExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.extractors; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 21 | import com.kohlschutter.boilerpipe.document.TextDocument; 22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier; 23 | 24 | /** 25 | * A quite generic full-text extractor solely based upon the number of words per block (the current, 26 | * the previous and the next block). 
27 | */ 28 | public class NumWordsRulesExtractor extends ExtractorBase { 29 | public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor(); 30 | 31 | /** 32 | * Returns the singleton instance for {@link NumWordsRulesExtractor}. 33 | */ 34 | public static NumWordsRulesExtractor getInstance() { 35 | return INSTANCE; 36 | } 37 | 38 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 39 | 40 | return NumWordsRulesClassifier.INSTANCE.process(doc); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Some standard extractors (i.e., completely piped BoilerpipeFilters) 3 | */ 4 | package com.kohlschutter.boilerpipe.extractors; 5 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/HeuristicFilterBase.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.filters.english; 19 | 20 | import com.kohlschutter.boilerpipe.document.TextBlock; 21 | 22 | /** 23 | * Base class for some heuristics that are used by boilerpipe filters. 24 | */ 25 | abstract class HeuristicFilterBase { 26 | 27 | protected static int getNumFullTextWords(final TextBlock tb) { 28 | return getNumFullTextWords(tb, 9); 29 | } 30 | 31 | protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) { 32 | if (tb.getTextDensity() >= minTextDensity) { 33 | return tb.getNumWords(); 34 | } else { 35 | return 0; 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/MinFulltextWordsFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.filters.english; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | 25 | /** 26 | * Keeps only those content blocks which contain at least k full-text words (measured by 27 | * {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default. 28 | */ 29 | public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter { 30 | public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(30); 31 | private final int minWords; 32 | 33 | public static MinFulltextWordsFilter getDefaultInstance() { 34 | return DEFAULT_INSTANCE; 35 | } 36 | 37 | public MinFulltextWordsFilter(final int minWords) { 38 | this.minWords = minWords; 39 | } 40 | 41 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 42 | 43 | boolean changes = false; 44 | 45 | for (TextBlock tb : doc.getTextBlocks()) { 46 | if (!tb.isContent()) { 47 | continue; 48 | } 49 | if (getNumFullTextWords(tb) < minWords) { 50 | tb.setIsContent(false); 51 | changes = true; 52 | } 53 | 54 | } 55 | 56 | return changes; 57 | 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * These BoilerpipeFilters have only been tested on English text. 3 | * 4 | * That is, they will probably work with other Western languages, but maybe need some parameter tuning to perform well. 
5 | */ 6 | package com.kohlschutter.boilerpipe.filters.english; 7 | 8 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/ListAtEndFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.filters.heuristics; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels; 25 | 26 | /** 27 | * Marks nested list-item blocks after the end of the main content. 
28 | */ 29 | public final class ListAtEndFilter implements BoilerpipeFilter { 30 | public static final ListAtEndFilter INSTANCE = new ListAtEndFilter(); 31 | 32 | private ListAtEndFilter() { 33 | } 34 | 35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 36 | 37 | boolean changes = false; 38 | 39 | int tagLevel = Integer.MAX_VALUE; 40 | for (TextBlock tb : doc.getTextBlocks()) { 41 | if (tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) { 42 | tagLevel = tb.getTagLevel(); 43 | } else { 44 | if (tb.getTagLevel() > tagLevel && tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT) 45 | && tb.hasLabel(DefaultLabels.LI) && tb.getLinkDensity() == 0) { 46 | tb.setIsContent(true); 47 | changes = true; 48 | } else { 49 | tagLevel = Integer.MAX_VALUE; 50 | } 51 | } 52 | } 53 | 54 | return changes; 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.filters.heuristics; 19 | 20 | import java.util.Iterator; 21 | import java.util.List; 22 | 23 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 24 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 25 | import com.kohlschutter.boilerpipe.document.TextBlock; 26 | import com.kohlschutter.boilerpipe.document.TextDocument; 27 | 28 | /** 29 | * Merges two subsequent blocks if their text densities are equal. 30 | */ 31 | public class SimpleBlockFusionProcessor implements BoilerpipeFilter { 32 | public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor(); 33 | 34 | /** 35 | * Returns the singleton instance for BlockFusionProcessor. 36 | */ 37 | public static SimpleBlockFusionProcessor getInstance() { 38 | return INSTANCE; 39 | } 40 | 41 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 42 | List textBlocks = doc.getTextBlocks(); 43 | boolean changes = false; 44 | 45 | if (textBlocks.size() < 2) { 46 | return false; 47 | } 48 | 49 | TextBlock b1 = textBlocks.get(0); 50 | for (Iterator it = textBlocks.listIterator(1); it.hasNext();) { 51 | TextBlock b2 = it.next(); 52 | 53 | final boolean similar = (b1.getTextDensity() == b2.getTextDensity()); 54 | 55 | if (similar) { 56 | b1.mergeNext(b2); 57 | it.remove(); 58 | changes = true; 59 | } else { 60 | b1 = b2; 61 | } 62 | } 63 | 64 | return changes; 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * These BoilerpipeFilters are pure heuristics. 
3 | */ 4 | package com.kohlschutter.boilerpipe.filters.heuristics; 5 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/InvertedFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import java.util.List; 21 | 22 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 23 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 24 | import com.kohlschutter.boilerpipe.document.TextBlock; 25 | import com.kohlschutter.boilerpipe.document.TextDocument; 26 | 27 | /** 28 | * Reverts the "isContent" flag for all {@link TextBlock}s 29 | */ 30 | public final class InvertedFilter implements BoilerpipeFilter { 31 | public static final InvertedFilter INSTANCE = new InvertedFilter(); 32 | 33 | private InvertedFilter() { 34 | } 35 | 36 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException { 37 | 38 | List tbs = doc.getTextBlocks(); 39 | if (tbs.isEmpty()) { 40 | return false; 41 | } 42 | for (TextBlock tb : tbs) { 43 | tb.setIsContent(!tb.isContent()); 44 | } 45 | 46 | return true; 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToBoilerplateFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels; 25 | 26 | /** 27 | * Marks all blocks that contain a given label as "boilerplate". 28 | */ 29 | public final class LabelToBoilerplateFilter implements BoilerpipeFilter { 30 | public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = 31 | new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT); 32 | 33 | private String[] labels; 34 | 35 | public LabelToBoilerplateFilter(final String... label) { 36 | this.labels = label; 37 | } 38 | 39 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 40 | 41 | boolean changes = false; 42 | 43 | BLOCK_LOOP : for (TextBlock tb : doc.getTextBlocks()) { 44 | if (tb.isContent()) { 45 | for (String label : labels) { 46 | if (tb.hasLabel(label)) { 47 | tb.setIsContent(false); 48 | changes = true; 49 | continue BLOCK_LOOP; 50 | } 51 | } 52 | } 53 | } 54 | 55 | return changes; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToContentFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | 25 | /** 26 | * Marks all blocks that contain a given label as "content". 27 | */ 28 | public final class LabelToContentFilter implements BoilerpipeFilter { 29 | private String[] labels; 30 | 31 | public LabelToContentFilter(final String... 
label) { 32 | this.labels = label; 33 | } 34 | 35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 36 | 37 | boolean changes = false; 38 | 39 | BLOCK_LOOP : for (TextBlock tb : doc.getTextBlocks()) { 40 | if (!tb.isContent()) { 41 | for (String label : labels) { 42 | if (tb.hasLabel(label)) { 43 | tb.setIsContent(true); 44 | changes = true; 45 | continue BLOCK_LOOP; 46 | } 47 | } 48 | } 49 | } 50 | 51 | return changes; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | 25 | /** 26 | * Marks all blocks as boilerplate. 
27 | */ 28 | public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter { 29 | public static final MarkEverythingBoilerplateFilter INSTANCE = 30 | new MarkEverythingBoilerplateFilter(); 31 | 32 | private MarkEverythingBoilerplateFilter() { 33 | } 34 | 35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 36 | 37 | boolean changes = false; 38 | 39 | for (TextBlock tb : doc.getTextBlocks()) { 40 | if (tb.isContent()) { 41 | tb.setIsContent(false); 42 | changes = true; 43 | } 44 | } 45 | 46 | return changes; 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingContentFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | 25 | /** 26 | * Marks all blocks as content. 
27 | */
public final class MarkEverythingContentFilter implements BoilerpipeFilter {
  /** Shared instance; the constructor is private, so no other instance can be created. */
  public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();

  private MarkEverythingContentFilter() {
  }

  /**
   * Sets the content flag on every block that does not have it yet.
   *
   * @param doc the document whose text blocks are modified in place
   * @return {@code true} if at least one block was switched from boilerplate to content
   */
  public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
    boolean anyPromoted = false;

    for (final TextBlock block : doc.getTextBlocks()) {
      if (block.isContent()) {
        continue;
      }
      block.setIsContent(true);
      anyPromoted = true;
    }

    return anyPromoted;
  }
}
49 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MinWordsFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.filters.simple; 19 | 20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter; 21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException; 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | import com.kohlschutter.boilerpipe.document.TextDocument; 24 | 25 | /** 26 | * Keeps only those content blocks which contain at least k words.
27 | */ 28 | public final class MinWordsFilter implements BoilerpipeFilter { 29 | private final int minWords; 30 | 31 | public MinWordsFilter(final int minWords) { 32 | this.minWords = minWords; 33 | } 34 | 35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { 36 | 37 | boolean changes = false; 38 | 39 | for (TextBlock tb : doc.getTextBlocks()) { 40 | if (!tb.isContent()) { 41 | continue; 42 | } 43 | if (tb.getNumWords() < minWords) { 44 | tb.setIsContent(false); 45 | changes = true; 46 | } 47 | 48 | } 49 | 50 | return changes; 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * These BoilerpipeFilters are straight-forward and probably not really specific to English. 3 | */ 4 | package com.kohlschutter.boilerpipe.filters.simple; 5 | 6 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/ConditionalLabelAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.labels; 19 | 20 | import com.kohlschutter.boilerpipe.conditions.TextBlockCondition; 21 | import com.kohlschutter.boilerpipe.document.TextBlock; 22 | 23 | /** 24 | * Adds labels to a {@link TextBlock} if the given criteria are met. 25 | */ 26 | public final class ConditionalLabelAction extends LabelAction { 27 | 28 | private final TextBlockCondition condition; 29 | 30 | public ConditionalLabelAction(TextBlockCondition condition, String... labels) { 31 | super(labels); 32 | this.condition = condition; 33 | } 34 | 35 | public void addTo(final TextBlock tb) { 36 | if (condition.meetsCondition(tb)) { 37 | addLabelsTo(tb); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/DefaultLabels.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */
package com.kohlschutter.boilerpipe.labels;

import com.kohlschutter.boilerpipe.document.TextBlock;

/**
 * Some pre-defined labels which can be used in conjunction with {@link TextBlock#addLabel(String)}
 * and {@link TextBlock#hasLabel(String)}.
 *
 * <p>
 * Every label value except {@link #MARKUP_PREFIX} starts with {@code de.l3s.boilerpipe/}.
 * NOTE(review): that prefix differs from this class's package name; presumably it is kept so that
 * label strings stay stable across the package rename -- confirm before changing any value.
 */
public final class DefaultLabels {
  // General-purpose block labels.
  public static final String TITLE = "de.l3s.boilerpipe/TITLE";
  public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
  public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
  public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
  public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
  public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
  public static final String HR = "de.l3s.boilerpipe/HR";
  public static final String LI = "de.l3s.boilerpipe/LI";

  // Heading labels: a generic one plus one per level H1..H3.
  public static final String HEADING = "de.l3s.boilerpipe/HEADING";
  public static final String H1 = "de.l3s.boilerpipe/H1";
  public static final String H2 = "de.l3s.boilerpipe/H2";
  public static final String H3 = "de.l3s.boilerpipe/H3";

  // NOTE(review): by its name a prefix for markup-derived labels; usage is not visible here.
  public static final String MARKUP_PREFIX = "<";

  private DefaultLabels() {
    // not to be instantiated
  }
}
47 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/LabelAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.labels; 19 | 20 | import java.util.Arrays; 21 | 22 | import com.kohlschutter.boilerpipe.document.TextBlock; 23 | 24 | /** 25 | * Helps adding labels to {@link TextBlock}s. 26 | * 27 | * @see ConditionalLabelAction 28 | */ 29 | public class LabelAction { 30 | protected final String[] labels; 31 | 32 | public LabelAction(String... labels) { 33 | this.labels = labels; 34 | } 35 | 36 | public void addTo(final TextBlock tb) { 37 | addLabelsTo(tb); 38 | } 39 | 40 | protected final void addLabelsTo(final TextBlock tb) { 41 | tb.addLabels(labels); 42 | } 43 | 44 | public String toString() { 45 | return super.toString() + "{" + Arrays.asList(labels) + "}"; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * The Boilerpipe top-level package. 
3 | */ 4 | package com.kohlschutter.boilerpipe; 5 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/HTMLDocument.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.sax; 19 | 20 | import java.io.ByteArrayInputStream; 21 | import java.nio.charset.Charset; 22 | 23 | import org.xml.sax.InputSource; 24 | 25 | /** 26 | * An {@link InputSourceable} for {@link HTMLFetcher}. 
27 | */ 28 | public class HTMLDocument implements InputSourceable { 29 | private final Charset charset; 30 | private final byte[] data; 31 | 32 | public HTMLDocument(final byte[] data, final Charset charset) { 33 | this.data = data; 34 | this.charset = charset; 35 | } 36 | 37 | public HTMLDocument(final String data) { 38 | Charset cs = Charset.forName("utf-8"); 39 | this.data = data.getBytes(cs); 40 | this.charset = cs; 41 | } 42 | 43 | public Charset getCharset() { 44 | return charset; 45 | } 46 | 47 | public byte[] getData() { 48 | return data; 49 | } 50 | 51 | public InputSource toInputSource() { 52 | final InputSource is = new InputSource(new ByteArrayInputStream(data)); 53 | is.setEncoding(charset.name()); 54 | return is; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/InputSourceable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.sax; 19 | 20 | import org.xml.sax.InputSource; 21 | 22 | /** 23 | * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given 24 | * document. 
25 | */
public interface InputSourceable {
  /**
   * @return an {@link InputSource} for the underlying document
   */
  InputSource toInputSource();
}
29 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.sax; 19 | 20 | import org.xml.sax.Attributes; 21 | import org.xml.sax.SAXException; 22 | 23 | /** 24 | * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing.
25 | */ 26 | public interface TagAction { 27 | 28 | boolean start(final BoilerpipeHTMLContentHandler instance, final String localName, 29 | final String qName, final Attributes atts) throws SAXException; 30 | 31 | boolean end(final BoilerpipeHTMLContentHandler instance, final String localName, 32 | final String qName) throws SAXException; 33 | 34 | boolean changesTagLevel(); 35 | } -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagActionMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.sax; 19 | 20 | import java.util.HashMap; 21 | 22 | /** 23 | * Base class for definition a set of {@link TagAction}s that are to be used for the HTML parsing 24 | * process. 25 | * 26 | * @see DefaultTagActionMap 27 | */ 28 | public abstract class TagActionMap extends HashMap { 29 | private static final long serialVersionUID = 1L; 30 | 31 | /** 32 | * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag will 33 | * be removed and overwritten. 34 | * 35 | * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. 
upper-case) 36 | * @param action The {@link TagAction} 37 | */ 38 | protected void setTagAction(final String tag, final TagAction action) { 39 | put(tag.toUpperCase(), action); 40 | put(tag.toLowerCase(), action); 41 | put(tag, action); 42 | } 43 | 44 | /** 45 | * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that 46 | * tag, a chained action, consisting of the previous and the new {@link TagAction} is created. 47 | * 48 | * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case) 49 | * @param action The {@link TagAction} 50 | */ 51 | protected void addTagAction(final String tag, final TagAction action) { 52 | TagAction previousAction = get(tag); 53 | if (previousAction == null) { 54 | setTagAction(tag, action); 55 | } else { 56 | setTagAction(tag, new CommonTagActions.Chained(previousAction, action)); 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes related to parsing and producing HTML from/to Boilerpipe TextDocuments. 3 | */ 4 | package com.kohlschutter.boilerpipe.sax; 5 | 6 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/UnicodeTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * boilerpipe 3 | * 4 | * Copyright (c) 2009, 2014 Christian Kohlschütter 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.kohlschutter.boilerpipe.util; 19 | 20 | import java.util.regex.Pattern; 21 | 22 | /** 23 | * Tokenizes text according to Unicode word boundaries and strips off non-word characters. 24 | */ 25 | public class UnicodeTokenizer { 26 | private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b"); 27 | private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern 28 | .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*"); 29 | 30 | /** 31 | * Tokenizes the text and returns an array of tokens. 32 | * 33 | * @param text The text 34 | * @return The tokens 35 | */ 36 | public static String[] tokenize(final CharSequence text) { 37 | return PAT_NOT_WORD_BOUNDARY.matcher(PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063")) 38 | .replaceAll("$1").replaceAll("[ \u2063]+", " ").trim().split("[ ]+"); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Some helper classes. 
3 | */ 4 | package com.kohlschutter.boilerpipe.util; 5 | 6 | -------------------------------------------------------------------------------- /boilerpipe/nekohtml/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | cc-dbp-parent-pom 5 | com.ibm.research.ai.ki 6 | 1.0.0-SNAPSHOT 7 | ../../pom.xml 8 | 9 | 4.0.0 10 | nekohtml 11 | 1.9.13-SNAPSHOT 12 | 13 | 14 | 15 | maven-shade-plugin 16 | 2.3 17 | 18 | 19 | package 20 | 21 | shade 22 | 23 | 24 | 25 | 26 | net.sourceforge.nekohtml:nekohtml 27 | 28 | 29 | true 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | xerces 39 | xercesImpl 40 | 2.9.1 41 | compile 42 | 43 | 44 | xml-apis 45 | xml-apis 46 | 1.3.04 47 | compile 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /boilerpipe/nekohtml/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | 7 | com.ibm.research.ai.ki 8 | cc-dbp-parent-pom 9 | 1.0.0-SNAPSHOT 10 | ../.. 11 | 12 | 13 | nekohtml 14 | 1.9.13-SNAPSHOT 15 | 16 | 17 | 18 | 19 | maven-shade-plugin 20 | 2.3 21 | 22 | 23 | package 24 | 25 | shade 26 | 27 | 28 | 29 | 30 | net.sourceforge.nekohtml:nekohtml 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | net.sourceforge.nekohtml 44 | nekohtml 45 | 1.9.13 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CharsetDetect.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.corpora.crawl; 19 | 20 | import java.io.*; 21 | import java.nio.charset.*; 22 | 23 | import org.mozilla.universalchardet.*; 24 | 25 | public class CharsetDetect { 26 | static String mapCharset(String charsetName) { 27 | try { 28 | if (Charset.isSupported(charsetName)) 29 | return charsetName; 30 | String lc = charsetName.toLowerCase(); 31 | if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) { 32 | return "cp1252"; 33 | } 34 | return charsetName; 35 | } catch (Throwable t) { 36 | return "UTF-8"; 37 | } 38 | } 39 | 40 | public static String getCharsetFromBytes(byte buffer[]) throws IOException { 41 | UniversalDetector detector = new UniversalDetector(null); 42 | detector.handleData(buffer, 0, buffer.length); 43 | detector.dataEnd(); 44 | String charsetName = detector.getDetectedCharset(); 45 | detector.reset(); 46 | return mapCharset(charsetName); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CommonCrawlConfig.java: -------------------------------------------------------------------------------- 1 | package com.ibm.research.ai.ki.corpora.crawl; 2 | 3 | import com.ibm.research.ai.ki.util.*; 4 | 5 | public class CommonCrawlConfig extends PropertyStruct { 6 | private static final long serialVersionUID = 1L; 7 | 8 | /** 9 | * See https://github.com/optimaize/language-detector for langauge options 10 | */ 11 | public String language = "en"; 12 | /** 13 
| * The language detector is typically very confident, most values are close to one or zero 14 | */ 15 | public double minLanguageConfidence = 0.8; 16 | /** 17 | * Possible options are LinkAnnotation, SectionHeader, Paragraph and TextFormating. 18 | * LinkAnnotation retains the anchor tag information (which spans of text are links and where they link to). 19 | */ 20 | public String[] annotationTypes = new String[] {"LinkAnnotation"}; 21 | /** 22 | * Number of threads downloading parts of Common Crawl, also the number of part files that will be created. 23 | */ 24 | public int numThreads = 8; 25 | /** 26 | * URL prefix to add to the WARC file list 27 | */ 28 | public String urlPrefix = "https://commoncrawl.s3.amazonaws.com/"; 29 | 30 | /** 31 | * To download only a portion of common crawl, limited to this many files. 32 | */ 33 | public int warcFileLimit; 34 | } 35 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.corpus/src/main/resources/cc-dbp/cc-dbp.properties: -------------------------------------------------------------------------------- 1 | #CommonCrawlConfig 2 | 3 | language=en 4 | minLanguageConfidence=0.8 5 | numThreads=8 6 | annotationTypes = [LinkAnnotation] 7 | urlPrefix = https://commoncrawl.s3.amazonaws.com/ 8 | 9 | 10 | 11 | #support downloading only a portion with 12 | warcFileLimit=10 -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.corpus/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=WARN, stdout 3 | 4 | # Redirect log messages to console 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 
-------------------------------------------------------------------------------- /com.ibm.research.ai.ki.corpus/src/main/resources/simplelogger.properties: -------------------------------------------------------------------------------- 1 | # SLF4J's SimpleLogger configuration file 2 | # Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err. 3 | 4 | # Default logging detail level for all instances of SimpleLogger. 5 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 6 | # If not specified, defaults to "info". 7 | org.slf4j.simpleLogger.defaultLogLevel=warn 8 | 9 | # Logging detail level for a SimpleLogger instance named "xxxxx". 10 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 11 | # If not specified, the default logging detail level is used. 12 | #org.slf4j.simpleLogger.log.xxxxx= 13 | 14 | # Set to true if you want the current date and time to be included in output messages. 15 | # Default is false, and will output the number of milliseconds elapsed since startup. 16 | #org.slf4j.simpleLogger.showDateTime=false 17 | 18 | # The date and time format to be used in the output messages. 19 | # The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat. 20 | # If the format is not specified or is invalid, the default format is used. 21 | # The default format is yyyy-MM-dd HH:mm:ss:SSS Z. 22 | #org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z 23 | 24 | # Set to true if you want to output the current thread name. 25 | # Defaults to true. 26 | #org.slf4j.simpleLogger.showThreadName=true 27 | 28 | # Set to true if you want the Logger instance name to be included in output messages. 29 | # Defaults to true. 30 | #org.slf4j.simpleLogger.showLogName=true 31 | 32 | # Set to true if you want the last component of the name to be included in output messages. 33 | # Defaults to false. 
34 | #org.slf4j.simpleLogger.showShortLogName=false -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/README.md: -------------------------------------------------------------------------------- 1 | 1) ConvertDBpedia 2 | deal with the whole 'M' suffix thing 3 | 4 | Optional: get idCounts.tsv (only have spark version for this right now) this requires running BuildGazetteer on the unfiltered 5 | 6 | 2) BuildGroundTruth 7 | 3) BuildGazetteer 8 | 4) TypePairFilter 9 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | 5 | com.ibm.research.ai.ki 6 | cc-dbp-parent-pom 7 | 1.0.0-SNAPSHOT 8 | 9 | 10 | kb 11 | 1.0.0-SNAPSHOT 12 | 13 | 14 | 15 | com.ibm.research.ai.ki 16 | util 17 | 1.0.0-SNAPSHOT 18 | 19 | 20 | 21 | com.ibm.research.ai.ki 22 | nlp 23 | 1.0.0-SNAPSHOT 24 | 25 | 26 | 27 | com.ibm.research.ai.ki 28 | kbp 29 | 1.0.0-SNAPSHOT 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/ConfigureMinMaxEntityFreq.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */
package com.ibm.research.ai.ki.kb;

import java.io.*;
import java.util.*;

import com.ibm.research.ai.ki.util.*;

/**
 * Shows examples of entities that occur at different frequency ranges, so that a sensible maximum occurrence frequency can be selected,
 * and possibly a minimum occurrence frequency.
 *
 * Entities are grouped into 20 buckets by the truncated natural logarithm of their count, and a
 * sample of each bucket is printed to standard output.
 * @author mrglass
 *
 */
public class ConfigureMinMaxEntityFreq {
    /**
     * @param args args[0] is the kb directory; it must contain the file named by
     *             {@link KBFiles#idCountsTsv}. No check is made that the argument is present.
     */
    public static void main(String[] args) {
        String kbDir = args[0];

        // NOTE(review): generic type arguments seem to be missing in this copy of the file
        // (raw Map.Entry is used with ".value" below) -- confirm against the original source.
        // One sampler per bucket, each constructed with the argument 20.
        RandomUtil.Sample[] termsByFreq = new RandomUtil.Sample[20];
        for (int i = 0; i < termsByFreq.length; ++i) {
            termsByFreq[i] = new RandomUtil.Sample<>(20);
        }
        Map idCounts = SparseVectors.fromString(FileUtil.readFileAsString(new File(kbDir, KBFiles.idCountsTsv)));
        for (Map.Entry e : idCounts.entrySet()) {
            // Bucket index = natural log of the count, truncated toward zero by the int cast,
            // then clamped into [0, termsByFreq.length - 1].
            int bucket = (int)Math.log(e.getValue().value);
            if (bucket < 0) bucket = 0;
            if (bucket >= termsByFreq.length) bucket = termsByFreq.length-1;
            // Offer "<count passed through Lang.LPAD with 10> <id>" to the bucket's sampler.
            termsByFreq[bucket].maybeSave(Lang.LPAD(""+((int)e.getValue().value), 10)+" "+e.getKey());
        }
        // Print the buckets in ascending order, each preceded by a separator line.
        for (int i = 0; i < termsByFreq.length; ++i) {
            System.out.println("=======================================");
            System.out.println(Lang.stringList(termsByFreq[i], "\n"));
        }
    }
}
52 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/GroundTruthConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kb; 19 | 20 | import com.ibm.research.ai.ki.util.*; 21 | 22 | public class GroundTruthConfig extends PropertyStruct { 23 | private static final long serialVersionUID = 1L; 24 | 25 | 26 | public int minCorpusCount = 1; 27 | public int maxCorpusCount = 300000; 28 | public int minUnaryCount = 100; 29 | public boolean useRelationTaxonomy = true; 30 | 31 | //CONSIDER: also type selection config 32 | } 33 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBConfig.java: -------------------------------------------------------------------------------- 1 | package com.ibm.research.ai.ki.kb; 2 | 3 | import java.io.*; 4 | 5 | import com.ibm.research.ai.ki.util.*; 6 | 7 | public class KBConfig extends PropertyStruct { 8 | private static final long serialVersionUID = 1L; 9 | 10 | public String kbDir; 11 | 12 | /** 13 | * To avoid generic terms, we ignore terms that occur more than this many times. 14 | */ 15 | public int maxNodeCorpusCount = 3000000; 16 | /** 17 | * We can ignore rare terms if desired. 18 | */ 19 | public int minNodeCorpusCount = 1; 20 | /** 21 | * 22 | */ 23 | public int minUnaryCount = 100; 24 | /** 25 | * Whether to consider super-relations in the labels for context sets. 
26 | */ 27 | public boolean useRelationTaxonomy = true; 28 | 29 | //for the coarse-grained type system 30 | /** 31 | * A type must have this many instances for which it is the most specific type 32 | */ 33 | public int minTypeSize = 3000; 34 | /** 35 | * We will have no more than this many types in the coarse grained type system 36 | */ 37 | public int maxNumberOfTypes = 100; 38 | 39 | //for the type filter 40 | /** 41 | * If an unordered type-pair does not have at least this many triples, it will not have any contexts generated. 42 | * So if number-number relations never occur, we will never generated contexts for a number-number node-pair. 43 | */ 44 | public int minTypePairFreq = 1; 45 | 46 | public int minTypeFreqForUnary = 1; 47 | 48 | 49 | public File kbDir() { 50 | return new File(kbDir); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBFiles.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.kb; 19 | 20 | /** 21 | * Files that can be present in a kb directory 22 | * @author mrglass 23 | * 24 | */ 25 | public class KBFiles { 26 | public static final String triplesTsv = "triples.tsv"; 27 | public static final String labelsTsv = "labels.tsv"; 28 | public static final String relationTaxonomyTsv = "relationTaxonomy.tsv"; 29 | public static final String typesTsv = "types.tsv"; 30 | public static final String popularityTsv = "popularity.tsv"; 31 | //from DocEntityStats in ie.spark 32 | public static final String idCountsTsv = "idCounts.tsv"; 33 | } 34 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/conversion/DBpediaKBConfig.java: -------------------------------------------------------------------------------- 1 | package com.ibm.research.ai.ki.kb.conversion; 2 | 3 | import java.io.*; 4 | 5 | import com.ibm.research.ai.ki.kb.*; 6 | 7 | public class DBpediaKBConfig extends KBConfig { 8 | private static final long serialVersionUID = 1L; 9 | 10 | public String dbpediaOwlUrl; 11 | 12 | public String objectsUrl; 13 | 14 | public String literalsUrl; 15 | 16 | public String labelsUrl; 17 | 18 | public String typesUrl; 19 | 20 | /** 21 | * We can construct the KB without using idCounts.tsv if desired. Since getting idCounts.tsv requires running a gazetteer over the corpus and is potentially slow. 
22 | */ 23 | public boolean noNodeCorpusCounts; 24 | 25 | 26 | protected File file(String url) { 27 | return new File(kbDir, url.substring(url.lastIndexOf('/')+1)); 28 | } 29 | 30 | public File dbpediaOwlFile() { 31 | return file(dbpediaOwlUrl); 32 | } 33 | 34 | public File objectsFile() { 35 | return file(objectsUrl); 36 | } 37 | 38 | public File literalsFile() { 39 | return file(literalsUrl); 40 | } 41 | 42 | public File labelsFile() { 43 | return file(labelsUrl); 44 | } 45 | 46 | public File typesFile() { 47 | return file(typesUrl); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/explore/FilterByCorpusCount.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
*/
package com.ibm.research.ai.ki.kb.explore;

import java.io.*;
import java.util.*;

import com.ibm.research.ai.ki.formats.*;
import com.ibm.research.ai.ki.util.*;


/**
 * Command-line tool that copies labels.tsv and triples.tsv from one kb directory to another,
 * keeping only the rows whose ids reach a minimum count in idCounts.tsv.
 */
public class FilterByCorpusCount {

    /**
     * @param args args[0] source kb directory; args[1] destination kb directory;
     *             optional args[2] minimum count (defaults to 1)
     */
    public static void main(String[] args) {
        String kbDir = args[0];
        String kbDirFiltered = args[1];
        int minCount = (args.length > 2) ? Integer.parseInt(args[2]) : 1;

        // NOTE(review): the generic type arguments of Map appear to be missing in this copy
        File countsFile = new File(kbDir, "idCounts.tsv");
        Map idCounts = SparseVectors.fromString(FileUtil.readFileAsString(countsFile));

        // labels: a row is kept when its first column reaches minCount
        String labelsOut = new File(kbDirFiltered, "labels.tsv").getAbsolutePath();
        try (PrintStream out = FileUtil.getFilePrintStream(labelsOut)) {
            for (String[] lbl : new SimpleTsvIterable(new File(kbDir, "labels.tsv"))) {
                if (!(SparseVectors.getDefaultZero(idCounts, lbl[0]) >= minCount)) {
                    continue;
                }
                out.println(Lang.stringList(lbl, "\t"));
            }
        }

        // triples: a row is kept when both its first and third columns reach minCount
        String triplesOut = new File(kbDirFiltered, "triples.tsv").getAbsolutePath();
        try (PrintStream out = FileUtil.getFilePrintStream(triplesOut)) {
            for (String[] trip : new SimpleTsvIterable(new File(kbDir, "triples.tsv"))) {
                if (!(SparseVectors.getDefaultZero(idCounts, trip[0]) >= minCount
                        && SparseVectors.getDefaultZero(idCounts, trip[2]) >= minCount)) {
                    continue;
                }
                out.println(Lang.stringList(trip, "\t"));
            }
        }
    }
}
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/resources/dbpediaConfig.properties:
--------------------------------------------------------------------------------
dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2 6 | 7 | #for ground truth 8 | maxNodeCorpusCount = 300000 9 | minNodeCorpusCount = 1 10 | useRelationTaxonomy = True 11 | 12 | #for the coarse-grained type system 13 | minTypeSize = 3000 14 | maxNumberOfTypes = 100 15 | 16 | #for the type filter 17 | minTypePairFreq = 1 18 | 19 | noNodeCorpusCounts = False 20 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | 5 | com.ibm.research.ai.ki 6 | cc-dbp-parent-pom 7 | 1.0.0-SNAPSHOT 8 | 9 | 10 | kbp 11 | 1.0.0-SNAPSHOT 12 | 13 | 14 | 15 | com.ibm.research.ai.ki 16 | util 17 | 1.0.0-SNAPSHOT 18 | 19 | 20 | 21 | com.ibm.research.ai.ki 22 | nlp 23 | 1.0.0-SNAPSHOT 24 | 25 | 26 | 27 | 28 | org.apache.wink 29 | wink-json4j 30 | ${wink-json4j.version} 31 | 32 | 33 | 34 | com.google.guava 35 | guava 36 | ${guava.version} 37 | 38 | 39 | 40 | org.apache.commons 41 | commons-lang3 42 | ${commons-lang3.version} 43 | 44 | 45 | 46 | commons-cli 47 | commons-cli 48 | ${commons-cli.version} 49 | 50 | 51 | 52 | it.unimi.dsi 53 | fastutil 54 | 7.1.0 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/CoveredTextEntityId.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.util.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | import com.ibm.reseach.ai.ki.nlp.types.*; 24 | 25 | /** 26 | * For those entities without an id, we simply give them an id equal to the covered text, case normalized. 27 | * So it is a text-equals entity linker. 28 | * @author mrglass 29 | * 30 | */ 31 | public class CoveredTextEntityId implements IPostprocessEntityRecognition { 32 | private static final long serialVersionUID = 1L; 33 | 34 | @Override 35 | public void initialize(Properties config) {} 36 | 37 | @Override 38 | public void process(Document doc) { 39 | for (EntityWithId e : doc.getAnnotations(EntityWithId.class)) { 40 | if (e.id == null) 41 | e.id = e.coveredText(doc).toLowerCase().trim().replaceAll("\\s+", " "); 42 | } 43 | } 44 | 45 | @Override 46 | public void initialize(IGroundTruth gt, RelexConfig config) {} 47 | 48 | } 49 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/DocumentFeatureString.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import com.ibm.reseach.ai.ki.nlp.*; 21 | 22 | public class DocumentFeatureString implements DocumentStructure { 23 | private static final long serialVersionUID = 1L; 24 | 25 | public String featureString; 26 | 27 | public DocumentFeatureString(String featureString) { 28 | this.featureString = featureString; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/FilterEntsByGroundTruth.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.util.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | import com.ibm.reseach.ai.ki.nlp.types.*; 24 | 25 | public class FilterEntsByGroundTruth implements IPostprocessEntityRecognition { 26 | private static final long serialVersionUID = 1L; 27 | 28 | protected Set relevantUrls; 29 | 30 | @Override 31 | public void initialize(Properties config) {} 32 | 33 | @Override 34 | public void process(Document doc) { 35 | doc.removeAnnotations(EntityWithId.class, e -> !relevantUrls.contains(e.id)); 36 | } 37 | 38 | @Override 39 | public void initialize(IGroundTruth gt, RelexConfig config) { 40 | this.relevantUrls = gt.getRelevantIds(); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IEntityPairFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | 22 | /** 23 | * A class implementing this will be specified in the RelexConfig if some filtering of entity-pairs is desired. 24 | * Otherwise the tsv dataset will contain all pairs of EntityWithId that occur in the same sentence. 
25 | * @author mrglass 26 | * 27 | */ 28 | public interface IEntityPairFilter extends Serializable { 29 | /** 30 | * In Spark, initialize is called in the Spark head 31 | * @param gt 32 | * @param config 33 | */ 34 | public void initialize(GroundTruth gt, RelexConfig config); 35 | /** 36 | * Return true if the entity-pair is a good candidate 37 | * @param id1 38 | * @param type1 39 | * @param id2 40 | * @param type2 41 | * @return 42 | */ 43 | public boolean test(String id1, String type1, String id2, String type2); 44 | } 45 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IGroundTruth.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | /** 24 | * A generic ground truth interface, the methods needed for preprocessing. 
25 | * @author mrglass 26 | * 27 | */ 28 | public interface IGroundTruth extends Serializable { 29 | public String getType(String id); 30 | public Set getRelevantIds(); 31 | 32 | public Map buildEntitySetId2Relations(); 33 | } 34 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IPostprocessEntityRecognition.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import com.ibm.reseach.ai.ki.nlp.*; 21 | 22 | /** 23 | * An annotator that postprocesses the entity recognition and linking. 24 | * Often to remove entities not of interest, or to fill in type based on id or id for NIL entity linking. 
25 | * @author mrglass 26 | * 27 | */ 28 | public interface IPostprocessEntityRecognition extends Annotator { 29 | public void initialize(IGroundTruth gt, RelexConfig config); 30 | } 31 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexDatasetManager.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | 22 | import com.ibm.research.ai.ki.kbp.*; 23 | 24 | /** 25 | * Provides classes for representing and creating a dataset for training/evaluation/mass-apply of 26 | * a relational knowledge induction system. 
27 | * 28 | * @author mrglass 29 | * 30 | * @param 31 | */ 32 | public interface IRelexDatasetManager extends Serializable { 33 | 34 | public IRelexTsv getTsvMaker(); 35 | public IGroundTruth getGroundTruth(); 36 | public Class getMentionClass(); 37 | public IRelexTensors getTensorMaker(); 38 | 39 | /** 40 | * before this method is called, only getMentionClass is supposed to be called 41 | * @param config 42 | */ 43 | public void initialize(RelexConfig config); 44 | } -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexMention.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | /** 25 | * So we can unify the code for binary and unary relation mention 26 | * @author mrglass 27 | * 28 | */ 29 | public interface IRelexMention extends Serializable { 30 | //for reduce by key 31 | public String groupId(); 32 | public int groupSplit(int splitCount); 33 | 34 | //the canonically ordered list of ids, separated by '\t'; if group ids are enabled the group id is given here too 35 | public String entitySetId(); 36 | 37 | //downsampling and splitting train/validate/test 38 | public double getNegativeDownsamplePriority(); 39 | public double getDatasetSplitPosition(); 40 | //for negative downsampling 41 | public boolean isNegative(); 42 | 43 | //where the document the mention comes from appears in the x-axis of the document learning curve (0-1) 44 | public double getDocumentLearningCurvePosition(); 45 | 46 | //for vocab construction 47 | public String[] getTypes(); 48 | public String[] getRelations(); 49 | public String[] getTokens(Annotator tokenizer); 50 | 51 | //saving and loading from tsv 52 | public void fromString(String tsvLine); 53 | 54 | public String toString(); 55 | 56 | //to avoid duplicates in a mentionset, if non-null, two IRelexMentions that share a uniquenessString are duplicates. 57 | public String uniquenessString(); 58 | 59 | /** 60 | * A human readable format for showing the support for an extracted relation. 
61 | * @return 62 | */ 63 | public String toSupportString(); 64 | 65 | public void convertToPlaceholders(); 66 | } 67 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexTensors.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | import com.ibm.reseach.ai.ki.nlp.*; 24 | 25 | /** 26 | * Creates the deep learning input tensors from a set of RelexMentions 27 | * @author mrglass 28 | * 29 | * @param 30 | */ 31 | public interface IRelexTensors extends Serializable { 32 | public String[] getTypes(); 33 | public String[] getRelations(); 34 | /** 35 | * The first object is assumed to be the String groupId. 
36 | * @param tokenizer 37 | * @param fullMentionSet 38 | * @return 39 | */ 40 | public List makeInstances(Annotator tokenizer, Collection fullMentionSet); 41 | } 42 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexTsv.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | import com.ibm.reseach.ai.ki.nlp.*; 24 | 25 | /** 26 | * Pulls the IRelexMentions out of a Document that has EntityWithId, Token and Sentence annotations. 
* @author mrglass
 *
 * @param
 */
public interface IRelexTsv extends Serializable {
    /**
     * @param doc the annotated document
     * @return the mentions found in the document
     */
    List getMentions(Document doc);
}
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/NounPhraseEntityWithId.java:
--------------------------------------------------------------------------------
/**
 * cc-dbp-dataset
 *
 * Copyright (c) 2017 IBM
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.util.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | import com.ibm.reseach.ai.ki.nlp.types.*; 24 | 25 | public class NounPhraseEntityWithId implements Annotator { 26 | private static final long serialVersionUID = 1L; 27 | 28 | public static final String SOURCE = NounPhraseEntityWithId.class.getSimpleName(); 29 | 30 | @Override 31 | public void initialize(Properties config) {} 32 | 33 | //NPs with these as their first tokens are not entity terms 34 | protected Set ignoreFirstTokens = new HashSet<>(Arrays.asList( 35 | "the", 36 | "that", "these", "those", "this", 37 | "a", "an", 38 | "who", "which", "it", 39 | "its", "your", "our", "my", "their", 40 | "you", "me")); 41 | 42 | 43 | @Override 44 | public void process(Document doc) { 45 | for (Chunk c : doc.getAnnotations(Chunk.class)) { 46 | if ("NP".equals(c.tag)) { 47 | 48 | Token firstToken = doc.getAnnotations(Token.class, c).get(0); 49 | if (ignoreFirstTokens.contains(firstToken.coveredText(doc).toLowerCase())) 50 | continue; 51 | 52 | if (c.coveredText(doc).replaceAll("\\W+", "").isEmpty()) 53 | continue; 54 | 55 | doc.addAnnotation(new EntityWithId(SOURCE, 56 | c.start, c.end, 57 | GroundTruth.unknownType, c.coveredText(doc).toLowerCase())); 58 | } 59 | } 60 | //we could drop chunk annotations now 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/RelexDatasetFiles.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | /** 21 | * The files saved in the tsv dataset to tensor dataset conversion. 22 | * Also used when training and applying the model. 23 | * 24 | * @author mrglass 25 | * 26 | */ 27 | public class RelexDatasetFiles { 28 | //in the convert dir 29 | public static final String wordVectors = "wordVectors.ef"; 30 | public static final String groupSplits = "groupSplits.ser.gz"; 31 | public static final String tokenizerPipeline = "tokenizer.ser.gz"; 32 | public static final String typePairFilterFile = "typePairs.tsv"; 33 | public static final String typeFilterFile = "typeUnary.tsv"; 34 | /** 35 | * Created by DocEntityStats 36 | */ 37 | public static final String idCountsFile = "idCounts.tsv"; 38 | 39 | public static final String dataDirSuffix = "Dir"; 40 | 41 | //in the hdfsOutputDir 42 | public static final String hdfsMentions = "relexMentions.tsv"; 43 | public static final String hdfsTensors = "tensors.b64"; 44 | 45 | } 46 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/ShowExamples.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ibm.research.ai.ki.kbp;

import java.io.*;
import java.util.*;

import com.google.common.collect.*;

import com.ibm.research.ai.ki.util.*;
import com.ibm.research.ai.ki.util.RandomUtil.*;

/**
 * Reads the simple tsv format of RelexMention.Reader/Writer. And shows examples of 'interesting' entity-pair mention sets.
 * @author mrglass
 *
 */
public class ShowExamples {
    /**
     * A mention set is interesting when it holds more than one mention and its first mention
     * is not negative.
     *
     * @param m the mention set
     * @return whether the set should be considered for display
     */
    // NOTE(review): the generic type arguments of List / Sample appear to be missing in this copy
    static boolean isInteresting(List m) {
        if (m.size() <= 1) {
            return false;
        }
        return !m.get(0).isNegative();
    }

    /**
     * Example args:
     * simpleFormat/train.tsv
     *
     * @param args args[0] is the tsv file to read
     */
    public static void main(String[] args) {
        File tsvFile = new File(args[0]);

        RandomUtil.Sample sample = new RandomUtil.Sample(20);
        for (List m : RelexMentionReader.getSetReader(tsvFile, RelexMention.class)) {
            // shouldSave() is only consulted for interesting sets, as before
            if (!isInteresting(m) || !sample.shouldSave()) {
                continue;
            }
            RelexMention m1 = m.get(0);
            String firstArg = m1.span1.substring(m1.sentence);
            String secondArg = m1.span2.substring(m1.sentence);
            String relations = Lang.stringList(m1.relTypes, ",");
            String sentences = Lang.stringList(Iterables.transform(m, mi -> mi.sentence), "\n ");
            sample.save(firstArg + "\t" + secondArg + "\t" + relations + "\n " + sentences);
        }

        System.out.println(Lang.stringList(sample, "\n\n=======================\n"));
    }
}
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/Tokenizer.java:
--------------------------------------------------------------------------------
/**
 * cc-dbp-dataset
 *
 * Copyright (c) 2017 IBM
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ibm.research.ai.ki.kbp;

import java.io.*;
import java.util.Properties;

import com.ibm.reseach.ai.ki.nlp.*;
import com.ibm.research.ai.ki.kbp.*;
import com.ibm.research.ai.ki.nlp.parse.*;
import com.ibm.research.ai.ki.util.*;

/**
 * Holder of the single, lazily created tokenizer annotator shared by the whole JVM.
 */
public abstract class Tokenizer {
    private static Annotator tokenizer = null;

    /**
     * Returns the shared tokenizer, creating and initializing it on the first call. The source
     * is chosen in this order: {@code config.tokenizerPipelineFile} if set; else the saved
     * pipeline file in {@code config.convertDir} if it exists; else a new ClearNLP pipeline.
     * Once created, the same instance is returned regardless of the config passed later.
     *
     * @param config the relex configuration; only consulted while no tokenizer exists yet
     * @return the shared, initialized tokenizer
     */
    public static Annotator getTokenizer(RelexConfig config) {
        synchronized (Tokenizer.class) {
            if (tokenizer == null) {
                Annotator created;
                if (config.tokenizerPipelineFile != null) {
                    created = FileUtil.loadObjectFromFile(config.tokenizerPipelineFile);
                } else {
                    File savedPipeline = new File(config.convertDir, RelexDatasetFiles.tokenizerPipeline);
                    if (savedPipeline.exists()) {
                        created = FileUtil.loadObjectFromFile(savedPipeline);
                    } else {
                        created = new Pipeline(
                                new ClearNLPTokenize()
                                //, new DigitSequenceTokenize() //add some special tokenization for digit groups
                        );
                    }
                }
                created.initialize(new Properties());
                // Publish only after initialize() succeeded: if it throws, the field stays null
                // and the next call retries instead of handing out an uninitialized tokenizer.
                tokenizer = created;
            }
            return tokenizer;
        }
    }
}
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/TypePairEntityPairFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | import com.ibm.research.ai.ki.formats.*; 24 | 25 | public class TypePairEntityPairFilter implements IEntityPairFilter { 26 | private static final long serialVersionUID = 1L; 27 | 28 | protected Set typePairs = new HashSet<>(); 29 | 30 | @Override 31 | public void initialize(GroundTruth gt, RelexConfig config) { 32 | if (!new File(config.convertDir, "typePairs.tsv").exists()) 33 | throw new IllegalArgumentException("No typePairs.tsv file in convertDir"); 34 | for (String[] parts : new SimpleTsvIterable(new File(config.convertDir, RelexDatasetFiles.typePairFilterFile))) { 35 | String t1 = parts[0]; 36 | String t2 = parts[1]; 37 | typePairs.add(t1+'\t'+t2); 38 | } 39 | } 40 | 41 | @Override 42 | public boolean test(String id1, String type1, String id2, String type2) { 43 | String tp = null; 44 | if (type1.compareTo(type2) <= 0) { 45 | tp = type1+'\t'+type2; 46 | } else { 47 | tp = type2+'\t'+type1; 48 | } 49 | 50 | return typePairs.contains(tp); 51 | } 52 | 53 | } 54 | 
-------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/unary/IEntityFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp.unary; 19 | 20 | import java.io.*; 21 | 22 | import com.ibm.research.ai.ki.kbp.*; 23 | 24 | 25 | public interface IEntityFilter extends Serializable { 26 | public void initialize(UnaryGroundTruth gt, RelexConfig config); 27 | public boolean test(String docId, String id, String type); 28 | } 29 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/unary/RelexDatasetManagerUnary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.kbp.unary; 19 | 20 | import java.io.*; 21 | 22 | import com.ibm.research.ai.ki.kbp.*; 23 | import com.ibm.research.ai.ki.util.*; 24 | 25 | public class RelexDatasetManagerUnary implements IRelexDatasetManager { 26 | private static final long serialVersionUID = 1L; 27 | 28 | RelexConfig config; 29 | UnaryGroundTruth gt; 30 | 31 | @Override 32 | public IRelexTsv getTsvMaker() { 33 | if (gt == null && new File(config.groundTruthFile).exists()) 34 | this.gt = FileUtil.loadObjectFromFile(config.groundTruthFile); 35 | return new UnaryRelexTsvDataset(gt, config); 36 | } 37 | 38 | @Override 39 | public IGroundTruth getGroundTruth() { 40 | if (gt == null && new File(config.groundTruthFile).exists()) 41 | this.gt = FileUtil.loadObjectFromFile(config.groundTruthFile); 42 | return gt; 43 | } 44 | 45 | @Override 46 | public Class getMentionClass() { 47 | return UnaryRelexMention.class; 48 | } 49 | 50 | @Override 51 | public IRelexTensors getTensorMaker() { 52 | return new UnaryRelexTensors(config); 53 | } 54 | 55 | @Override 56 | public void initialize(RelexConfig config) { 57 | this.config = config; 58 | 59 | } 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.kbp/src/main/resources/relexConfigNonSpark.properties: -------------------------------------------------------------------------------- 1 | documentSampleFraction = 1.0 2 | negativeExampleSampleFraction = 0.05 3 | targetNegativeToPositveRatio = -1 4 | directionStyle 
= bothWays 5 | titleContext = False 6 | sectionContext = False 7 | limitEntitiesToGroundTruth = False 8 | gtTypes = False 9 | vocabLimit = 2000000 10 | vocabMinCount = 2 11 | initialEmbeddingsFile = TODO/wordvectorFileInEmbeddingFormat.ef 12 | minMentionSet = 1 13 | maxMentionSet = 100 14 | maxMentionGroups = 5 15 | maxPositionEmbeddings = 80 16 | typeStyle = single 17 | groundTruthFile = TODO/gt.ser.gz 18 | convertDir = TODO/ 19 | datasetSplitNames = [train, validate, test] 20 | datasetSpitFractions = [0.8, 0.1, 0.1] 21 | #typePairs.tsv expected to exist in convertDir 22 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/Annotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | /** 24 | * NOTE: Annotators are expected to be threadsafe. 
25 | * @author mrglass 26 | * 27 | */ 28 | public interface Annotator extends Serializable { 29 | public void initialize(Properties config); 30 | public void process(Document doc); 31 | } 32 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/DocumentStructure.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Marker interface for information that is like an 'annotation' but has no meaningful Span.
 * Examples:
 * <ul>
 *   <li>coref chains</li>
 *   <li>relations or frames inferred from multiple sentences</li>
 *   <li>date of document</li>
 *   <li>annotation worker that produced an annotated document</li>
 *   <li>document category</li>
 * </ul>
 *
 * @author mrglass
 */
public interface DocumentStructure extends Serializable {
    //CONSIDER: public String getSource();
}
30 | * @author mrglass 31 | * 32 | */ 33 | public class DocumentWriter2 extends MultiFileWriter { 34 | public DocumentWriter2(File rootDir) { 35 | super(rootDir); 36 | } 37 | 38 | public DocumentWriter2(File rootDir, int itemsPerFile, boolean overwrite) { 39 | super(rootDir, itemsPerFile, overwrite); 40 | } 41 | 42 | @Override 43 | protected String getExt() { 44 | return ".ser.gz"; 45 | } 46 | 47 | @Override 48 | protected void write(ObjectOutputStream stream, Document obj) throws IOException { 49 | stream.writeObject(obj); 50 | } 51 | 52 | @Override 53 | protected ObjectOutputStream getStream(File f) throws IOException { 54 | return new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(f), 2 << 16)); 55 | } 56 | 57 | 58 | public synchronized void write(Document doc) { 59 | try { 60 | super.write(doc); 61 | } catch (Exception e) { 62 | Lang.error(e); 63 | } 64 | } 65 | 66 | @Override 67 | protected void deepenDirectories() { 68 | super.deepenDirectories(); 69 | } 70 | 71 | @Override 72 | public synchronized void close() { 73 | super.close(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Author.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Author implements DocumentStructure { 25 | private static final long serialVersionUID = 1L; 26 | public String id; 27 | @JsonCreator 28 | public Author(@JsonProperty("id") String id) { 29 | this.id = id; 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Categories.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import java.util.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | import com.ibm.research.ai.ki.util.*; 24 | 25 | /** 26 | * The categories that a Document belongs to (like Wikipedia categories) 27 | * @author mrglass 28 | * 29 | */ 30 | public class Categories extends HashSet implements DocumentStructure { 31 | private static final long serialVersionUID = 1L; 32 | /** 33 | * add a category to a Document 34 | * @param doc 35 | * @param category 36 | */ 37 | public static void addCategory(Document doc, String category) { 38 | Categories cats = doc.getDocumentStructure(Categories.class); 39 | if (cats == null) { 40 | cats = new Categories(); 41 | doc.setDocumentStructure(cats); 42 | } 43 | cats.add(category); 44 | } 45 | /** 46 | * unmodifiable set of categories 47 | * @param doc 48 | * @return 49 | */ 50 | public static Set getCategories(Document doc) { 51 | return Collections.unmodifiableSet(Lang.NVL(doc.getDocumentStructure(Categories.class), (Set)Collections.EMPTY_SET)); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Chunk.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Chunk extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | public String tag; //NP, VP, PP 28 | 29 | @JsonCreator 30 | public Chunk(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("tag") String tag) { 31 | super(source, start, end); 32 | this.tag = tag; 33 | } 34 | 35 | @Override 36 | public String highlightLabel() { 37 | return tag; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocDate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import java.util.*; 21 | 22 | import com.fasterxml.jackson.annotation.*; 23 | 24 | import com.ibm.reseach.ai.ki.nlp.*; 25 | 26 | /** 27 | * Date the document was written or created 28 | * @author mrglass 29 | * 30 | */ 31 | public class DocDate implements DocumentStructure { 32 | private static final long serialVersionUID = 1L; 33 | public Date date; 34 | @JsonCreator 35 | public DocDate(@JsonProperty("date") Date date) { 36 | this.date = date; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocumentContentType.java: -------------------------------------------------------------------------------- 1 | package com.ibm.reseach.ai.ki.nlp.types; 2 | 3 | import com.ibm.reseach.ai.ki.nlp.*; 4 | 5 | import com.fasterxml.jackson.annotation.*; 6 | 7 | public class DocumentContentType implements DocumentStructure { 8 | 9 | private static final long serialVersionUID = 1L; 10 | public String contentType; 11 | @JsonCreator 12 | public DocumentContentType(@JsonProperty("contentType") String contentType) { 13 | this.contentType = contentType; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocumentSource.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.ibm.reseach.ai.ki.nlp.*; 21 | 22 | import com.fasterxml.jackson.annotation.*; 23 | 24 | public class DocumentSource implements DocumentStructure { 25 | private static final long serialVersionUID = 1L; 26 | public String source; 27 | @JsonCreator 28 | public DocumentSource(@JsonProperty("source") String source) { 29 | this.source = source; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Entity.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Entity extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | public String type; 28 | 29 | @JsonCreator 30 | public Entity(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("type") String type) { 31 | super(source, start, end); 32 | this.type = type; 33 | } 34 | 35 | public Entity(Entity e) { 36 | this(e.source, e.start, e.end, e.type); 37 | } 38 | 39 | @Override 40 | public String highlightLabel() { 41 | return type; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/EntityWithId.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.ibm.research.ai.ki.util.*; 21 | 22 | import com.fasterxml.jackson.annotation.*; 23 | 24 | /** 25 | * Like a LinkedEntity but with just a single string for the id. 
26 | * @author mrglass 27 | * 28 | */ 29 | public class EntityWithId extends Entity { 30 | private static final long serialVersionUID = 1L; 31 | 32 | public String id; 33 | 34 | public EntityWithId(Entity e) { 35 | this(e.source, e.start, e.end, e.type, 36 | (e instanceof EntityWithId) ? ((EntityWithId)e).id : null); 37 | } 38 | 39 | public EntityWithId( 40 | @JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, 41 | @JsonProperty("type") String type, @JsonProperty("id") String id) 42 | { 43 | super(source, start, end, type); 44 | this.id = id; 45 | } 46 | 47 | @Override 48 | public String highlightLabel() { 49 | return Lang.NVL(type,"unk")+":"+id; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Event.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import java.util.*; 21 | 22 | import com.fasterxml.jackson.annotation.*; 23 | 24 | import com.ibm.reseach.ai.ki.nlp.*; 25 | import com.ibm.research.ai.ki.util.*; 26 | 27 | /** 28 | * The span of the Event is something like the event extent and is usually not very meaningful. 29 | * Most of the semantics come from the eventType and the argument mentions and roles. 30 | * @author mrglass 31 | * 32 | */ 33 | public class Event extends Annotation { 34 | private static final long serialVersionUID = 1L; 35 | 36 | public Event(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("type") String type) { 37 | super(source, start, end); 38 | this.type = type; 39 | } 40 | 41 | public void addArgument(Document doc, String role, Entity entity) { 42 | arguments.add(Pair.of(role, doc.getAnnoRef(entity))); 43 | } 44 | 45 | //the type of the event 46 | public String type; 47 | 48 | //CONSIDER: maybe allow arguments to be Annotation in general rather than requiring Entity 49 | //Pair is role name and entity, role name may be null 50 | public List>> arguments = new ArrayList<>(); 51 | 52 | //CONSIDER: pull the trigger into an EventTrigger annotation class? 53 | //the span of the trigger, may be null 54 | public Span trigger; 55 | 56 | @Override 57 | public String highlightLabel() { 58 | return type; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/LinkAnnotation.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | /** 25 | * Annotation for a link (like wikilink or web link) in text 26 | * @author mrglass 27 | * 28 | */ 29 | public class LinkAnnotation extends Annotation { 30 | private static final long serialVersionUID = 1L; 31 | 32 | public String target; 33 | 34 | @JsonCreator 35 | public LinkAnnotation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("target") String target) { 36 | super(source, start, end); 37 | this.target = target; 38 | } 39 | 40 | @Override 41 | public String highlightLabel() { 42 | return "A:"+target; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/ListAnnotation.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import java.util.*; 21 | import java.util.stream.*; 22 | 23 | import com.fasterxml.jackson.annotation.*; 24 | import com.google.common.collect.*; 25 | 26 | import com.ibm.reseach.ai.ki.nlp.*; 27 | 28 | /** 29 | * Annotation describing a list in text 30 | * @author mrglass 31 | * 32 | */ 33 | public class ListAnnotation extends Annotation { 34 | private static final long serialVersionUID = 1L; 35 | 36 | @JsonCreator 37 | public ListAnnotation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 38 | super(source, start, end); 39 | } 40 | 41 | public List> items = new ArrayList<>(); 42 | 43 | public void addListItem(Document doc, ListItem item) { 44 | doc.addAnnotation(item); 45 | items.add(doc.getAnnoRef(item)); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/ListItem.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class ListItem extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | @JsonCreator 28 | public ListItem(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 29 | super(source, start, end); 30 | } 31 | public ListItem(Document doc, ListAnnotation list, String source, int first, int last) { 32 | super(source, first, last); 33 | list.items.add(doc.getAnnoRef(this)); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Paragraph.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Paragraph extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | @JsonCreator 28 | public Paragraph(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 29 | super(source, start, end); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Relation.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Relation extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | public String relationType; 28 | protected AnnoRef arg1; 29 | protected AnnoRef arg2; 30 | //CONSIDER: type parameter rather than general 'Annotation' 31 | public Annotation getArg1() { 32 | return arg1.get(); 33 | } 34 | public Annotation getArg2() { 35 | return arg2.get(); 36 | } 37 | 38 | //to support subclasses of Relation that name their arguments 39 | public String getArg1Name() { 40 | return "arg1"; 41 | } 42 | public String getArg2Name() { 43 | return "arg2"; 44 | } 45 | 46 | @JsonCreator 47 | public Relation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 48 | super(source, start, end); 49 | } 50 | 51 | 52 | public Relation(String source, Document doc, Annotation arg1, Annotation arg2, String relationType) { 53 | super(source, Math.min(arg1.start, arg2.start), Math.max(arg1.end, arg2.end)); 54 | this.arg1 = doc.getAnnoRef(arg1); 55 | this.arg2 = doc.getAnnoRef(arg2); 56 | this.relationType = relationType; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Section.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Section extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | @JsonCreator 28 | public Section(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 29 | super(source, start, end); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/SectionHeader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Annotation for a section header. It carries references to two other
 * annotations: the section body and a "super section".
 */
public class SectionHeader extends Annotation {
    private static final long serialVersionUID = 1L;

    /**
     * Creates a section header annotation; also used by Jackson when deserializing.
     * All three values are passed through to the {@link Annotation} constructor.
     */
    @JsonCreator
    public SectionHeader(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
        super(source, start, end);
    }

    // NOTE(review): the generic type arguments of these two references appear to
    // have been lost in this copy of the file -- restore them from version control.
    // Neither field is assigned anywhere in this class.
    public AnnoRef sectionBody;
    public AnnoRef superSection;

}

// ---- Sentence.java (package com.ibm.reseach.ai.ki.nlp.types) ----

/**
 * Annotation type marking a sentence; it adds no state or behavior of its own
 * beyond {@link Annotation}.
 */
public class Sentence extends Annotation {

    private static final long serialVersionUID = 1L;

    /**
     * Creates a sentence annotation; also used by Jackson when deserializing.
     * All three values are passed through to the {@link Annotation} constructor.
     */
    @JsonCreator
    public Sentence(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
        super(source, start, end);
    }
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class TextFormatting extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | public enum Format {bold, italic}; 28 | 29 | public final Format format; 30 | 31 | @JsonCreator 32 | public TextFormatting(@JsonProperty("source") String source, 33 | @JsonProperty("start") int start, @JsonProperty("end") int end, 34 | @JsonProperty("format") Format format) 35 | { 36 | super(source, start, end); 37 | this.format = format; 38 | } 39 | 40 | @Override 41 | public String highlightLabel() { 42 | return format == Format.italic ? "i" : "b"; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Title.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.reseach.ai.ki.nlp.types; 19 | 20 | import com.fasterxml.jackson.annotation.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | 24 | public class Title extends Annotation { 25 | private static final long serialVersionUID = 1L; 26 | 27 | @JsonCreator 28 | public Title(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) { 29 | super(source, start, end); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/DigitSequenceTokenize.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.nlp.parse; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | import java.util.Properties; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | 26 | import com.ibm.reseach.ai.ki.nlp.*; 27 | import com.ibm.reseach.ai.ki.nlp.types.*; 28 | import com.ibm.research.ai.ki.util.*; 29 | 30 | /** 31 | * Adds tokens for each digit sequence (\b[0-9]+\b) 32 | * @author mrglass 33 | * 34 | */ 35 | public class DigitSequenceTokenize implements Annotator { 36 | private static final long serialVersionUID = 1L; 37 | 38 | public static final String SOURCE = "DST"; 39 | 40 | protected Pattern digitSeq = Pattern.compile("\\b[0-9]+\\b"); 41 | 42 | @Override 43 | public void initialize(Properties config) {} 44 | 45 | @Override 46 | public void process(Document doc) { 47 | Matcher m = digitSeq.matcher(doc.text); 48 | NonOverlappingSpans nos = new NonOverlappingSpans(); 49 | List toAdd = new ArrayList<>(); 50 | while (m.find()) { 51 | Token t = new Token(SOURCE, m.start(), m.end()); 52 | t.lemma = t.coveredText(doc); 53 | t.pos = "CD"; 54 | if (!nos.addSpan(t)) { 55 | throw new Error("Span overlaps?? "+doc.toSimpleInlineMarkup()); 56 | } 57 | toAdd.add(t); 58 | } 59 | //remove tokens that overlap with our new ones 60 | doc.removeAnnotations(Token.class, t -> nos.overlaps(t)); 61 | for (Token t : toAdd) 62 | doc.addAnnotation(t); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/NormalizeTextTransform.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.nlp.parse; 19 | 20 | import com.ibm.reseach.ai.ki.nlp.*; 21 | 22 | /** 23 | * Like the text normalization in Google's w2v but with flag for [0-9] -> ' ' 24 | * @author mrglass 25 | * 26 | */ 27 | public class NormalizeTextTransform extends TransformString { 28 | private static final long serialVersionUID = 1L; 29 | 30 | protected boolean removeDigits; 31 | 32 | public NormalizeTextTransform(boolean removeDigits) { 33 | super("com/ibm/research/ai/ki/nlp/parse/normalizeText-replace.tsv"); 34 | this.removeDigits = removeDigits; 35 | } 36 | 37 | @Override 38 | public String transform(String text, OffsetCorrection trans2orig, OffsetCorrection orig2trans) { 39 | String result = super.transform(text,trans2orig,orig2trans).toLowerCase(); 40 | if (removeDigits) 41 | result = result.replaceAll("[0-9]", " "); 42 | return result; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/RegexTokenize.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.nlp.parse; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | import java.util.regex.*; 23 | 24 | import com.ibm.reseach.ai.ki.nlp.*; 25 | import com.ibm.reseach.ai.ki.nlp.types.*; 26 | import com.ibm.research.ai.ki.util.*; 27 | 28 | public class RegexTokenize implements Annotator { 29 | private static final long serialVersionUID = 1L; 30 | 31 | public static final String REGEX_KEY = "tokenRegex"; 32 | //whitespace or punctuation delimits tokens, and are themselves not tokens 33 | public static final String DEFAULT = "[\\p{Punct}\\s]+"; 34 | 35 | public static final String WHITESPACE = Lang.pWhite_Space+"+"; 36 | 37 | public static final String SOURCE = RegexTokenize.class.getSimpleName(); 38 | 39 | protected Pattern tokenRegex; 40 | 41 | @Override 42 | public void initialize(Properties config) { 43 | tokenRegex = Pattern.compile(Lang.NVL(config.getProperty(REGEX_KEY), DEFAULT)); 44 | } 45 | 46 | @Override 47 | public void process(Document doc) { 48 | for (Annotation seg : doc.getSegmentation(Sentence.class, Paragraph.class)) { 49 | Matcher m = tokenRegex.matcher(doc.coveredText(seg)); 50 | int prevStart = 0; 51 | while (m.find()) { 52 | if (m.start() > prevStart) { 53 | doc.addAnnotation(new Token(SOURCE, seg.start + prevStart, seg.start + m.start())); 54 | } 55 | prevStart = m.end(); 56 | } 57 | if (prevStart != seg.length()) //may end with whitespace 58 | doc.addAnnotation(new Token(SOURCE, seg.start + prevStart, seg.end)); 59 | } 60 | } 61 | 62 | } 63 | 
-------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/resources/com/ibm/research/ai/ki/nlp/parse/clearNLP-replace.tsv: -------------------------------------------------------------------------------- 1 | \u02BA " 2 | \u2036 " 3 | \u2033 " 4 | \u201C " 5 | \u201D " 6 | \u201E " 7 | \u201F " 8 | \u275D " 9 | \u275E " 10 | \u3003 " 11 | \u301F " 12 | \u301D " 13 | \u301E " 14 | \uFF02 " 15 | \u00B4 ' 16 | \u02B9 ' 17 | \u02BC ' 18 | \u02C8 ' 19 | \u0301 ' 20 | \u2018 ' 21 | \u2019 ' 22 | \u201B ' 23 | \u2032 ' 24 | \u275B ' 25 | \u275C ' 26 | \u02CB ` 27 | \u0300 ` 28 | \u2035 ` 29 | \u2037 ''' 30 | \u2010 - 31 | \u2011 - 32 | \u2012 - 33 | \u2013 - 34 | \u2014 - 35 | \u2015 - 36 | \u0335 - 37 | \u0336 - 38 | \u2016 || 39 | \u2017 _ 40 | \u02CD _ 41 | \u0331 _ 42 | \u0332 _ 43 | \u0333 _ 44 | \u02DC ~ 45 | \u0303 ~ 46 | \u0330 ~ 47 | \u2053 ~ 48 | \u223C ~ 49 | \u301C ~ 50 | \u0334 ~ 51 | \u02C2 < 52 | \u02C3 > 53 | \u27EA < 54 | \u27EB > 55 | \u2039 < 56 | \u203A > 57 | \u27E8 < 58 | \u27E9 > 59 | \u3008 < 60 | \u3009 > 61 | \u27E6 [ 62 | \u27E7 ] 63 | \u3014 [ 64 | \u3015 ] 65 | \u3016 [ 66 | \u3017 ] 67 | \u3018 [ 68 | \u3019 ] 69 | \u301A [ 70 | \u301B ] 71 | \u2983 { 72 | \u2984 } 73 | \u02C4 ^ 74 | \u02C6 ^ 75 | \u0302 ^ 76 | \u2038 ^ 77 | \u2303 ^ 78 | \u01C0 | 79 | \u05C0 | 80 | \u2223 | 81 | \u2758 | 82 | \u00F7 / 83 | \u2044 / 84 | \u2215 / 85 | \u0337 / 86 | \u0338 / 87 | \u20E5 \ 88 | \u2216 \ 89 | \u066D * 90 | \u204E * 91 | \u2217 * 92 | \u2731 * 93 | \u0589 : 94 | \u05C3 : 95 | \u2236 : 96 | \u2264 <= 97 | \u2265 >= 98 | \u2266 <= 99 | \u2267 >= 100 | \u066A % 101 | \u2052 % 102 | \u01C3 ! 103 | \u2762 ! 104 | \u266F # 105 | \u201A , 106 | \u203D ? 107 | \u2025 .. 108 | \u2026 ... 
109 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/main/resources/com/ibm/research/ai/ki/nlp/parse/normalizeText-replace.tsv: -------------------------------------------------------------------------------- 1 | ’ ' 2 | ′ ' 3 | '' 4 | ' ' 5 | “ " 6 | ” " 7 | " " 8 | . . 9 |
#!/bin/bash

# Downloads the OpenNLP 1.5 English model files listed below into the current
# directory, skipping every file that already exists there.

models=( "en-token.bin" "en-sent.bin" "en-pos-maxent.bin" "en-chunker.bin" "en-parser-chunking.bin" )
nermodels=( "en-ner-date.bin" "en-ner-location.bin" "en-ner-money.bin" "en-ner-organization.bin" "en-ner-percentage.bin" "en-ner-person.bin" "en-ner-time.bin" )

allmodels=("${models[@]}" "${nermodels[@]}")

#download the models from http://opennlp.sourceforge.net/models-1.5/
for file in "${allmodels[@]}"
do
  # quoted so the name is never word-split or glob-expanded
  if [ ! -f "$file" ]; then
    wget "http://opennlp.sourceforge.net/models-1.5/$file"
  fi
done


#CONSIDER: instead use download-maven-plugin (https://stackoverflow.com/questions/2741806/maven-downloading-files-from-url) in the nlp project
#  run in the validate phase (first phase)
#  to download each OpenNLP model file to ${project.basedir}/src/main/resources
log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/OverlappingSpansTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.nlp; 19 | 20 | import java.util.*; 21 | 22 | import com.google.common.collect.*; 23 | 24 | import com.ibm.reseach.ai.ki.nlp.*; 25 | import com.ibm.research.ai.ki.util.*; 26 | 27 | public class OverlappingSpansTest { 28 | public void validate(Document doc) { 29 | OverlappingSpans ospans = new OverlappingSpans(doc.getAnnotations(Annotation.class)); 30 | List sample = RandomUtil.getSample(doc.getAnnotations(Annotation.class), 100); 31 | for (Annotation a : sample) { 32 | validate(ospans, doc, a); 33 | } 34 | } 35 | protected void validate(OverlappingSpans ospans, Document doc, Annotation a) { 36 | Set ores = ospans.getSpansOverlapping(a); 37 | Set linearRes = new HashSet<>(); 38 | for (Annotation oa : doc.getAnnotations(Annotation.class)) { 39 | if (oa.overlaps(a)) 40 | linearRes.add(oa); 41 | } 42 | int matchSize = Sets.intersection(ores, linearRes).size(); 43 | if (matchSize != ores.size()) 44 | throw new Error("fail"); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/TransformStringTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Placeholder for a TransformString validation; nothing is checked yet.
 */
public class TransformStringTest {
    /**
     * Currently a no-op: the body holds only the notes below.
     *
     * @param doc document that would be validated (unused for now)
     */
    public void validate(Document doc) {
        //TODO: transform the doc and transform it back?
        //offset correction testing:
        //string normalize gazetteer and text, indexOf, denormalize, then check that all span matches are still normalized matches
    }
}
17 | */ 18 | package com.ibm.research.ai.ki.nlp.parse; 19 | 20 | import java.io.File; 21 | import java.util.Properties; 22 | 23 | import com.ibm.reseach.ai.ki.nlp.*; 24 | import com.ibm.research.ai.ki.nlp.parse.*; 25 | import com.ibm.research.ai.ki.util.*; 26 | 27 | public class TestClearNLP { 28 | public static void main(String[] args) { 29 | String testDir = args[0]; 30 | 31 | //ClearNLPTransform transform = new ClearNLPTransform(); 32 | //System.out.println(transform.transform("The man ran ( ; ) and so ( ' ).", null, null)); 33 | //System.out.println(transform.transform("Alhazen\n\n(;   ), also known by the Lat", null, null)); 34 | Pipeline p = new Pipeline(new ClearNLPSentence(), new ClearNLPPOS(), new ClearNLPParse()); 35 | p.initialize(new Properties()); 36 | p.enableProfiling(); 37 | PeriodicChecker report = new PeriodicChecker(100); 38 | int docNum = 0; 39 | for (Document doc : new PipelinedDocuments(p, new DocumentReader(new File(testDir)))) { 40 | ++docNum; 41 | if (report.isTime()) { 42 | System.out.println("On document "+docNum); 43 | System.out.println(p.stringProfiling()); 44 | } 45 | } 46 | System.out.println(p.stringProfiling()); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/parse/TestNER.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.nlp.parse; 19 | 20 | import java.io.*; 21 | 22 | import com.ibm.reseach.ai.ki.nlp.*; 23 | import com.ibm.reseach.ai.ki.nlp.types.*; 24 | import com.ibm.research.ai.ki.nlp.parse.*; 25 | import com.ibm.research.ai.ki.util.*; 26 | import com.ibm.research.ai.ki.util.RandomUtil.*; 27 | 28 | import java.util.*; 29 | 30 | 31 | public class TestNER { 32 | public static void main(String[] args) { 33 | File docDir = new File(args[0]); 34 | Pipeline p = new Pipeline( 35 | new ResettingAnnotator(), new OpenNLPSentence(), 36 | new ClearNLPTokenize(), new ClearNLPPOS(), 37 | new ClearNLPNER(), new OpenNLPNER()); 38 | p.initialize(new Properties()); 39 | p.enableProfiling(); 40 | Iterable docs = new PipelinedDocuments(p, new DocumentReader(docDir)); 41 | Map typeCounts = new HashMap<>(); 42 | RandomUtil.Sample sampled = new RandomUtil.Sample<>(50); 43 | for (Document doc : docs) { 44 | for (Entity e : doc.getAnnotations(Entity.class)) { 45 | SparseVectors.increase(typeCounts, e.type+"-"+e.source, 1.0); 46 | if (sampled.shouldSave()) 47 | sampled.save(e.coveredText(doc)+" :: "+e.type+"-"+e.source); 48 | } 49 | } 50 | System.out.println(SparseVectors.toString(typeCounts)); 51 | System.out.println(Lang.stringList(sampled, "\n")); 52 | System.out.println(p.annotatorListing()); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.spark/src/main/resources/relexConfig.properties: 
-------------------------------------------------------------------------------- 1 | documentSampleFraction = 1.0 2 | negativeExampleSampleFraction = 0.05 3 | targetNegativeToPositveRatio = -1 4 | directionStyle = bothWays 5 | titleContext = False 6 | sectionContext = False 7 | limitEntitiesToGroundTruth = False 8 | gtTypes = False 9 | vocabLimit = 2000000 10 | vocabMinCount = 2 11 | initialEmbeddingsFile = TODO/wordvectorFileInEmbeddingFormat.ef 12 | minMentionSet = 1 13 | maxMentionSet = 100 14 | maxMentionGroups = 5 15 | maxPositionEmbeddings = 80 16 | typeStyle = single 17 | groundTruthFile = TODO/gt.ser.gz 18 | convertDir = TODO/ 19 | datasetSplitNames = [train, validate, test] 20 | datasetSpitFractions = [0.8, 0.1, 0.1] 21 | #typePairs.tsv expected to exist in convertDir 22 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | 5 | com.ibm.research.ai.ki 6 | cc-dbp-parent-pom 7 | 1.0.0-SNAPSHOT 8 | 9 | 10 | util 11 | 1.0.0-SNAPSHOT 12 | 13 | 14 | 15 | com.google.guava 16 | guava 17 | ${guava.version} 18 | 19 | 20 | 21 | org.apache.commons 22 | commons-math3 23 | ${commons-math3.version} 24 | 25 | 26 | 27 | org.apache.commons 28 | commons-lang3 29 | ${commons-lang3.version} 30 | 31 | 32 | 33 | org.apache.commons 34 | commons-compress 35 | ${commons-compress.version} 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/formats/SimpleTsvIterable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you 
may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.formats; 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | 23 | import com.google.common.collect.*; 24 | 25 | import com.ibm.research.ai.ki.util.*; 26 | 27 | public class SimpleTsvIterable implements Iterable { 28 | protected String filename; 29 | protected boolean skipHeader; 30 | 31 | public SimpleTsvIterable(File file) { 32 | this(file.getAbsolutePath(),false); 33 | } 34 | public SimpleTsvIterable(File file, boolean skipHeader) { 35 | this(file.getAbsolutePath(),skipHeader); 36 | } 37 | 38 | public SimpleTsvIterable(String filename) { 39 | this(filename,false); 40 | } 41 | public SimpleTsvIterable(String filename, boolean skipHeader) { 42 | this.filename = filename; 43 | this.skipHeader = skipHeader; 44 | } 45 | 46 | @Override 47 | public Iterator iterator() { 48 | Iterator lineIter = Iterators.filter( 49 | FileUtil.getRawLines(filename).iterator(), 50 | s -> !s.isEmpty()); 51 | if (skipHeader && lineIter.hasNext()) 52 | lineIter.next(); 53 | return new NextOnlyIterator() { 54 | @Override 55 | protected String[] getNext() { 56 | if (!lineIter.hasNext()) 57 | return null; 58 | return lineIter.next().split("\t"); 59 | } 60 | 61 | }; 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/CombinedSpans.java: -------------------------------------------------------------------------------- 1 
| /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import com.google.common.collect.*; 21 | 22 | public class CombinedSpans { 23 | public CombinedSpans() { 24 | spans = TreeRangeSet.create(); 25 | } 26 | protected CombinedSpans(RangeSet spans) { 27 | this.spans = spans; 28 | } 29 | protected RangeSet spans; 30 | 31 | public void add(Span s) { 32 | spans.add(Range.closedOpen(s.start, s.end)); 33 | } 34 | 35 | public boolean contains(Span s) { 36 | return spans.encloses(Range.closedOpen(s.start, s.end)); 37 | } 38 | 39 | public boolean contains(int position) { 40 | return spans.contains(position); 41 | } 42 | 43 | public CombinedSpans complement() { 44 | return new CombinedSpans(spans.complement()); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/FirstPairComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import java.io.*; 21 | import java.util.Comparator; 22 | 23 | /** 24 | * orders Pairs based on their first element 25 | * @author partha 26 | * 27 | */ 28 | public class FirstPairComparator implements Comparator, Serializable { 29 | 30 | private Comparator comparator; 31 | boolean reverse; 32 | 33 | /** 34 | * default constructor that assumes the first element is Comparable 35 | */ 36 | public FirstPairComparator() { 37 | 38 | } 39 | /** 40 | * constructor that takes a Comparator where null means treat the elements as Comparable 41 | * @param comp 42 | */ 43 | public FirstPairComparator(Comparator comparator) { 44 | this.comparator = comparator; 45 | } 46 | 47 | /** 48 | * reverses the order 49 | */ 50 | public void setReverseOrdering() { 51 | this.reverse = true; 52 | } 53 | 54 | @Override 55 | public int compare(Pair p1, Pair p2) { 56 | if (p1 == null) throw new IllegalArgumentException(); 57 | if (p2 == null) throw new IllegalArgumentException(); 58 | if (comparator != null) { 59 | return reverse?comparator.compare(p2.first, p1.first):comparator.compare(p1.first, p2.first); 60 | } else { 61 | return reverse?((Comparable) p2.first).compareTo(p1.first): ((Comparable) p1.first).compareTo(p2.first); //null means treat the elements as Comparable 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/LogLinear.java: 
/**
 * Static helpers for moving between probabilities and log-odds.
 */
public class LogLinear {
    /**
     * Log-odds of x: the inverse of {@link #logistic(double)}.
     * @param x a probability
     * @return log(x / (1 - x))
     */
    public static double logit(double x) {
        double odds = x / (1-x);
        return Math.log(odds);
    }

    /**
     * The sigmoid function.
     * @param x any real value
     * @return 1 / (1 + e^-x)
     */
    public static double logistic(double x) {
        double denominator = 1.0+Math.exp(-x);
        return 1.0/denominator;
    }

    /**
     * Linearly squeezes x away from zero and one: a value that was in [0,1]
     * ends up in [smoothby, 1-smoothby].
     * @param x the value to smooth
     * @param smoothby the margin kept at each end of the range
     * @return (1 - 2*smoothby) * x + smoothby
     */
    public static double smooth(double x, double smoothby) {
        double scale = 1-2*smoothby;
        return scale*x+smoothby;
    }
}
/**
 * A mutable holder for a double, ordered by its value.
 */
public class MutableDouble implements Cloneable, Comparable<MutableDouble>, Serializable {

    private static final long serialVersionUID = 1L;

    /** The current value; public so callers can update it in place. */
    public double value;

    /** Creates a holder with value 0. */
    public MutableDouble() {
        value = 0;
    }

    /**
     * @param value the initial value
     */
    public MutableDouble(double value) {
        this.value = value;
    }

    /**
     * Orders by numeric value.
     * @param that the holder to compare with
     * @return negative, zero or positive as this value is less than, equal to
     *         or greater than the other
     */
    @Override
    public int compareTo(MutableDouble that) {
        // == keeps 0.0 and -0.0 equal, as before.
        if (this.value == that.value) {
            return 0;
        }
        // Double.compare gives NaN a consistent position (above everything,
        // equal to itself); the plain </== test reported "greater" in both
        // directions whenever one side was NaN.
        return Double.compare(this.value, that.value);
    }

    @Override
    public String toString() {
        return String.valueOf(this.value);
    }

    /**
     * Orders holders by the absolute value they contain.
     */
    public static class AbsValueComparator implements Comparator<MutableDouble> {
        /**
         * @return 0 if either argument is null, otherwise the comparison of the absolute values
         */
        @Override
        public int compare(MutableDouble o1, MutableDouble o2) {
            if (o1 == null || o2 == null) return 0;
            return Double.compare(Math.abs(o1.value), Math.abs(o2.value));
        }
    }
}
/**
 * A mutable holder for an int, ordered by its value.
 */
public class MutableInteger implements Cloneable, Comparable<MutableInteger>, Serializable {

    private static final long serialVersionUID = 1L;

    /** The current value; public so callers can update it in place. */
    public int value;

    /** Creates a holder with value 0. */
    public MutableInteger() {
        value = 0;
    }

    /**
     * @param value the initial value
     */
    public MutableInteger(int value) {
        this.value = value;
    }

    /**
     * Orders by numeric value.
     * @param that the holder to compare with
     * @return -1, 0 or 1 as this value is less than, equal to or greater than the other
     */
    @Override
    public int compareTo(MutableInteger that) {
        if (this.value < that.value) {
            return -1;
        }
        return this.value == that.value ? 0 : 1;
    }

    @Override
    public String toString() {
        return String.valueOf(value);
    }

}
/**
 * Skeleton iterator for sources that can only say "give me the next item".
 * Subclasses implement {@link #getNext()}, returning null once the source is
 * exhausted; consequently null can never be a real element.
 *
 * {@link #close()} is invoked when exhaustion is first observed. Note that
 * {@link #next()} returns null (it does not throw NoSuchElementException)
 * when there are no more elements.
 *
 * @param <T> the element type
 */
public abstract class NextOnlyIterator<T> implements Iterator<T>, AutoCloseable {
    /** Element fetched by hasNext() and not yet handed out; null if none is buffered. */
    private T next;
    /** Set once getNext() has returned null. */
    private boolean done = false;

    /**
     * @return the next element, or null when there are no more
     */
    abstract protected T getNext();

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Fetches and buffers the next element if none is buffered yet.
     * @return true if an element is available
     */
    @Override
    public boolean hasNext() {
        if (done)
            return false;
        if (next == null)
            next = getNext();
        done = next == null;
        if (done)
            close();
        return !done;
    }

    /**
     * @return the next element, or null if the iterator is exhausted
     */
    @Override
    public T next() {
        // hasNext() does the fetching, exhaustion bookkeeping and close().
        if (!hasNext())
            return null;
        T toRet = next;
        next = null;
        return toRet;
    }

    /**
     * Hook for releasing resources; the default does nothing.
     */
    @Override
    public void close() {}

    /**
     * Safety net: closes the iterator if it was dropped before being exhausted.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            if (!done)
                close();
        } finally {
            super.finalize();
        }
    }
}
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util.eval; 19 | 20 | import java.util.*; 21 | 22 | import com.google.common.collect.*; 23 | 24 | import com.ibm.research.ai.ki.util.eval.PrecisionRecall.*; 25 | 26 | public class MultiPrecisionRecall { 27 | public static final String ALL = "ALL"; 28 | 29 | public Map prs; 30 | 31 | public MultiPrecisionRecall() { 32 | prs = new HashMap<>(); 33 | prs.put(ALL, new PrecisionRecall()); 34 | } 35 | 36 | public void addAnswered(String id, double score, boolean relevant, double weight, String... tags) { 37 | Instance inst = new Instance(id, score, relevant, weight); 38 | for (String t : tags) { 39 | if (t == null) 40 | continue; 41 | if (t.equals(ALL)) 42 | throw new IllegalArgumentException("The tag '"+ALL+"' is reserved"); 43 | //CONSIDER: check for duplicate tags? 44 | prs.computeIfAbsent(t, s -> new PrecisionRecall()).addAnswered(inst); 45 | } 46 | prs.get(ALL).addAnswered(inst); 47 | } 48 | 49 | public void addOutOfRecall(int outOfRecallCount, String... tags) { 50 | for (String t : tags) { 51 | if (t.equals(ALL)) 52 | throw new IllegalArgumentException("The tag '"+ALL+"' is reserved"); 53 | //CONSIDER: check for duplicate tags? 
54 | prs.computeIfAbsent(t, s -> new PrecisionRecall()).addOutOfRecall(outOfRecallCount); 55 | } 56 | prs.get(ALL).addOutOfRecall(outOfRecallCount); 57 | } 58 | 59 | public Map computeSummaryScores() { 60 | return Maps.transformValues(prs, pr -> pr.computeSummaryScores()); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/graphs/GraphAlgorithms.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util.graphs; 19 | 20 | import java.util.*; 21 | import java.util.function.*; 22 | 23 | import com.ibm.research.ai.ki.util.*; 24 | 25 | public class GraphAlgorithms { 26 | /** 27 | * Construct the transitive closure of things related to nodes. 
28 | * @param nodes 29 | * @param getRelated get the nodes related to a given node 30 | * @return mapping from the members of nodes, to nodes that they are related to, and nodes those are transitively related to 31 | */ 32 | public static > Map> transitiveClosure(Iterable nodes, Function getRelated) { 33 | Map> tc = new HashMap<>(); 34 | for (Node ni : nodes) { 35 | Set rel = tc.computeIfAbsent(ni, k -> new HashSet<>()); 36 | IN rn = getRelated.apply(ni); 37 | if (rn != null) { 38 | for (Node ri : rn) { 39 | rel.add(ri); 40 | } 41 | } 42 | } 43 | 44 | boolean changed; 45 | Set toAdd = new HashSet<>(); 46 | do { 47 | changed = false; 48 | for (Map.Entry> e : tc.entrySet()) { 49 | toAdd.clear(); 50 | for (Node r : e.getValue()) { 51 | toAdd.addAll(Lang.NVL(tc.get(r),Collections.EMPTY_SET)); 52 | } 53 | if (e.getValue().addAll(toAdd)) 54 | changed = true; 55 | } 56 | } while (changed); 57 | 58 | return tc; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/io/OldVersionOf.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Used by RefactoringObjectInputStream
 * When breaking serialization compatibility for a class Foo
 *  1) copy the old version to a new class name FooV1.
 *  2) have FooV1 implement OldVersionOf Foo
 *  3) change Foo in the way desired and write the convert function for FooV1
 *  4) update the serialVersionUID in Foo
 *  5) create a mapping in serializedMappings.properties: com.ibm.Foo:oldSerialVersionId -> com.ibm.FooV1
 * @author mrglass
 *
 * @param <T> the class that it is an old version of
 */
public interface OldVersionOf<T> {
    /**
     * @return this old-format object converted to the current class
     */
    T convert();
}
/**
 * Minimal contract for running tasks, possibly in parallel, and waiting for them.
 */
public interface ISimpleExecutor {
    /**
     * @return the number of threads that this executor uses
     */
    int getNumProcessors();

    /**
     * Blocks until every task has finished, checking every milliPoll milliseconds.
     * @param milliPoll polling interval in milliseconds
     */
    void awaitFinishing(long milliPoll);

    /**
     * Blocks until every task has finished.
     */
    void awaitFinishing();

    /**
     * Adds the task to the work to be executed in parallel.
     * @param task the work to run
     */
    void execute(Runnable task);

    /**
     * Stops accepting new tasks and waits for all submitted tasks to finish.
     * The executor cannot be used after this is executed.
     */
    void shutdown();

    /**
     * @return true if all submitted tasks have finished
     */
    boolean isFinished();
}
17 | */ 18 | package com.ibm.research.ai.ki.util.parallel; 19 | 20 | public class SingleThreadedExecutor implements ISimpleExecutor { 21 | 22 | @Override 23 | public int getNumProcessors() { 24 | return 1; 25 | } 26 | 27 | @Override 28 | public void awaitFinishing(long milliPoll) { 29 | } 30 | 31 | @Override 32 | public void awaitFinishing() { 33 | } 34 | 35 | @Override 36 | public void execute(Runnable task) { 37 | task.run(); 38 | } 39 | 40 | @Override 41 | public void shutdown() { 42 | } 43 | 44 | public boolean isFinished() { 45 | return true; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/parallel/StreamEater.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.util.parallel; 19 | 20 | import java.io.*; 21 | import java.util.function.*; 22 | 23 | import com.ibm.research.ai.ki.util.*; 24 | 25 | public class StreamEater extends Thread { 26 | public static StreamEater eatStream(BufferedReader in, Consumer lineHandler) { 27 | StreamEater e = new StreamEater(in, lineHandler); 28 | e.setDaemon(true); 29 | e.start(); 30 | return e; 31 | } 32 | 33 | private StreamEater(BufferedReader in, Consumer lineHandler) { 34 | this.in = in; 35 | this.lineHandler = lineHandler; 36 | } 37 | private BufferedReader in; 38 | private Consumer lineHandler; 39 | @Override 40 | public void run() { 41 | try { 42 | String line = null; 43 | while ((line = in.readLine()) != null) { 44 | if (lineHandler != null) 45 | lineHandler.accept(line); 46 | } 47 | in.close(); 48 | } catch (Exception e) { 49 | throw new Error(e); 50 | } 51 | } 52 | } -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/main/resources/com/ibm/research/ai/ki/util/serializedMappings.properties: -------------------------------------------------------------------------------- 1 | #old class name = new class name 2 | 3 | #example: 4 | #com.ibm.research.ai.ki.Anchor=com.ibm.research.ai.ki.kbp.Anchor -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/BjUtilTestCounter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
/**
 * Small stand-alone program used as a child process in tests: prints the
 * numbers 0 through 1999, one per line, pausing 10 ms before each.
 */
public class BjUtilTestCounter {
    public static void main(String[] args) throws InterruptedException {
        int count = 0;
        while (count < 2000) {
            Thread.sleep(10);
            System.out.println(count);
            count++;
        }
    }
}
17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | 23 | import com.ibm.research.ai.ki.util.*; 24 | 25 | public class ExecuteJavaProc { 26 | private ExecuteJavaProc() { 27 | } 28 | 29 | public static int exec(Class klass) throws IOException, InterruptedException { 30 | String javaHome = System.getProperty("java.home"); 31 | String javaBin = javaHome + File.separator + "bin" + File.separator + "java"; 32 | String classpath = System.getProperty("java.class.path"); 33 | String className = klass.getCanonicalName(); 34 | ProcessBuilder builder = new ProcessBuilder(javaBin, "-cp", classpath, className); 35 | Process process = builder.start(); 36 | FileUtil.readProcessAsString(process); 37 | process.waitFor(); 38 | return process.exitValue(); 39 | } 40 | 41 | public static void main(String[] args) throws IOException, InterruptedException { 42 | exec(BjUtilTestCounter.class); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/NBestTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | 22 | import java.util.List; 23 | 24 | import com.ibm.research.ai.ki.util.*; 25 | 26 | import org.junit.Test; 27 | 28 | public class NBestTest { 29 | 30 | @Test 31 | public void testAddTAndEmpty() { 32 | int limit = 5; 33 | NBest nBest = new NBest(limit); 34 | int max = 0; 35 | for (int i=0; i< 20; i++){ 36 | int rand = 1 + (int)(Math.random() * ((100 - 1) + 1)); 37 | nBest.add(new Person("Name_"+i, rand)); 38 | max = max people = nBest.empty(); 41 | assertEquals(limit, people.size()); 42 | assertEquals(max, people.get(0).age); 43 | assertEquals(0, nBest.empty().size()); 44 | } 45 | 46 | class Person implements Comparable{ 47 | private String name; 48 | private int age; 49 | 50 | public Person(String name, int age) { 51 | this.name = name; 52 | this.age = age; 53 | } 54 | 55 | @Override 56 | public int compareTo(Person other) { 57 | if (this.age==other.age){ 58 | return 0; 59 | } 60 | return this.age > other.age? 1:-1; 61 | } 62 | 63 | @Override 64 | public String toString() { 65 | return name+" : "+age; 66 | } 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/NonOverlappingTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import static org.junit.Assert.*; 21 | 22 | import java.util.*; 23 | 24 | import com.ibm.research.ai.ki.util.*; 25 | 26 | import org.junit.*; 27 | 28 | public class NonOverlappingTest { 29 | @Test 30 | public void vsLinearScan() { 31 | Random rand = new Random(123); 32 | for (int scale : new int[] {1, 10, 100}) { 33 | long nosTime = 0; 34 | long lsTime = 0; 35 | for (int testi = 0; testi < 100; ++testi) { 36 | List spans = SpanTest.randomSpans(rand, scale); 37 | if (rand.nextBoolean()) { 38 | Collections.sort(spans, new Span.LengthComparator().reversed()); 39 | } 40 | 41 | List nonOverlapping = new ArrayList<>(); 42 | NonOverlappingSpans nos = new NonOverlappingSpans(); 43 | for (Span s : spans) { 44 | long start = System.nanoTime(); 45 | boolean lsOk = true; 46 | for (Span n : nonOverlapping) 47 | if (n.overlaps(s)) { 48 | lsOk = false; 49 | break; 50 | } 51 | if (lsOk) 52 | nonOverlapping.add(s); 53 | lsTime += System.nanoTime() - start; 54 | 55 | start = System.nanoTime(); 56 | boolean nosOk = nos.addSpan(s); 57 | nosTime += System.nanoTime() - start; 58 | 59 | assertEquals(lsOk, nosOk); 60 | } 61 | } 62 | //System.out.println("Speedup = "+(double)lsTime/(double)nosTime); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/PropertyLoaderTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | 22 | import com.ibm.research.ai.ki.util.*; 23 | 24 | import org.junit.Test; 25 | 26 | public class PropertyLoaderTest { 27 | 28 | 29 | @Test 30 | public void testLoadProperties() { 31 | assertEquals("value", PropertyLoader.loadProperties("com.ibm.research.ai.ki.util.1").get("name")); 32 | assertEquals("value", PropertyLoader.loadProperties("/com/ibm/research/ai/ki/util/1").get("name")); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/RandomUtilTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * cc-dbp-dataset 3 | * 4 | * Copyright (c) 2017 IBM 5 | * 6 | * The author licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package com.ibm.research.ai.ki.util; 19 | 20 | import static org.junit.Assert.*; 21 | 22 | import java.util.HashMap; 23 | import java.util.HashSet; 24 | import java.util.Map; 25 | import java.util.Set; 26 | 27 | import com.ibm.research.ai.ki.util.*; 28 | 29 | import org.junit.Test; 30 | 31 | public class RandomUtilTest { 32 | 33 | @Test 34 | public void testRandomInt() { 35 | for (int i = 0; i < 100; i++) { 36 | assertEquals(0, RandomUtil.randomInt(0, 1)); 37 | } 38 | } 39 | 40 | @Test 41 | public void testRandomMemberAndRemove() { 42 | Set integers = new HashSet(); 43 | for (int i = 0; i < 10000; i++) { 44 | integers.add(i); 45 | } 46 | for (int i = 0; i < 10000; i++) { 47 | assertTrue(integers.contains(RandomUtil.randomMember(integers))); 48 | } 49 | 50 | for (int i = 0; i < 100000; i++) { 51 | assertTrue(!integers.contains(RandomUtil.removeRandom(integers))); 52 | } 53 | } 54 | 55 | @Test 56 | public void testRandomEntry() { 57 | Map integers = new HashMap(); 58 | for (int i = 0; i < 10000; i++) { 59 | integers.put(i,i); 60 | } 61 | for (int i = 0; i < 10000; i++) { 62 | assertTrue(integers.containsKey(RandomUtil.randomEntry(integers).getKey())); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /com.ibm.research.ai.ki.util/src/test/resources/com/ibm/research/ai/ki/util/1.properties: -------------------------------------------------------------------------------- 1 | name=value 2 | -------------------------------------------------------------------------------- /config.properties: -------------------------------------------------------------------------------- 1 | #CommonCrawlConfig 2 | 3 | language=en 4 | minLanguageConfidence=0.8 5 | numThreads=8 6 | #save these types as offset annotation in the corpus 7 | annotationTypes = [LinkAnnotation] 8 | urlPrefix = https://commoncrawl.s3.amazonaws.com/ 9 | 10 | #support downloading only a portion of Common Crawl 11 | #warcFileLimit=1000 12 | 13 | 14 
| #DBpediaKBConfig
15 | #download locations of the DBpedia 2016-10 files: ontology (owl), English mapping-based objects and literals, labels, transitive instance types
16 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
17 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
18 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
19 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
20 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
21 |
22 | #for ground truth
23 | maxNodeCorpusCount = 500000
24 | minNodeCorpusCount = 1
25 | useRelationTaxonomy = True
26 |
27 | #for the coarse-grained type system
28 | minTypeSize = 3000
29 | maxNumberOfTypes = 100
30 |
31 | #for the type filter
32 | minTypePairFreq = 1
33 |
34 | noNodeCorpusCounts = False
35 |
36 |
37 | #RelexConfig
38 | #NOTE(review): the key names targetNegativeToPositveRatio and datasetSpitFractions look misspelled ('Positve', 'Spit'); they are presumably matched by name when the config is loaded, so do not rename them here without checking the reader
39 | documentSampleFraction = 1.0
40 | negativeExampleSampleFraction = 0.05
41 | targetNegativeToPositveRatio = -1
42 | directionStyle = bothWays
43 | titleContext = False
44 | sectionContext = False
45 | limitEntitiesToGroundTruth = False
46 | gtTypes = False
47 | vocabLimit = 2000000
48 | vocabMinCount = 2
49 | minMentionSet = 1
50 | maxMentionSet = 100
51 | maxMentionGroups = 5
52 | maxPositionEmbeddings = 80
53 | typeStyle = single
54 | datasetSplitNames = [train, validate, test]
55 | datasetSpitFractions = [0.8, 0.1, 0.1]
56 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter
--------------------------------------------------------------------------------
/configSmall-de.properties:
--------------------------------------------------------------------------------
1 | #CommonCrawlConfig
2 |
3 | #German language (changed en->de)
4 | language=de
5 |
6 | minLanguageConfidence=0.8
7 | numThreads=8
8 | #save these types as offset annotation in the corpus
9 | annotationTypes = [LinkAnnotation]
10 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
11 |
12 | #support downloading only a portion of Common Crawl
13 | warcFileLimit=1000
14 |
15 |
16 | #DBpediaKBConfig
17 | #download location of the DBpedia 2016-10 ontology (owl)
18 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
19 |
20 | #German language: the URLs below point to the de files instead of the en files
21 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/mappingbased_objects_de.ttl.bz2
22 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/mappingbased_literals_de.ttl.bz2
23 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/labels_de.ttl.bz2
24 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/instance_types_transitive_de.ttl.bz2
25 |
26 | #for ground truth
27 | #German language: lowered (50000 -> 5000) because there is less German text
28 | maxNodeCorpusCount = 5000
29 | minNodeCorpusCount = 1
30 | useRelationTaxonomy = True
31 |
32 | #for the coarse-grained type system
33 | #German language: lowered (3000 -> 1000) because the German KB is smaller
34 | minTypeSize = 1000
35 | maxNumberOfTypes = 100
36 |
37 | #for the type filter
38 | minTypePairFreq = 1
39 |
40 | noNodeCorpusCounts = False
41 |
42 |
43 | #RelexConfig
44 | #NOTE(review): the key names targetNegativeToPositveRatio and datasetSpitFractions look misspelled ('Positve', 'Spit'); they are presumably matched by name when the config is loaded, so do not rename them here without checking the reader
45 | documentSampleFraction = 1.0
46 | negativeExampleSampleFraction = 0.05
47 | targetNegativeToPositveRatio = -1
48 | directionStyle = bothWays
49 | titleContext = False
50 | sectionContext = False
51 | limitEntitiesToGroundTruth = False
52 | gtTypes = False
53 | vocabLimit = 2000000
54 | vocabMinCount = 2
55 | minMentionSet = 1
56 | maxMentionSet = 100
57 | maxMentionGroups = 5
58 | maxPositionEmbeddings = 80
59 | typeStyle = single
60 | datasetSplitNames = [train, validate, test]
61 | datasetSpitFractions = [0.8, 0.1, 0.1]
62 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter
63 |
--------------------------------------------------------------------------------
/configSmall.properties:
--------------------------------------------------------------------------------
1 | #CommonCrawlConfig
2 |
3 | language=en
4 | minLanguageConfidence=0.8
5 | numThreads=8
6 | #save these types as offset annotation in the corpus
7 | annotationTypes = [LinkAnnotation]
8 |
urlPrefix = https://commoncrawl.s3.amazonaws.com/
9 |
10 | #support downloading only a portion of Common Crawl
11 | warcFileLimit=1000
12 |
13 |
14 | #DBpediaKBConfig
15 | #download locations of the DBpedia 2016-10 files: ontology (owl), English mapping-based objects and literals, labels, transitive instance types
16 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
17 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
18 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
19 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
20 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
21 |
22 | #for ground truth
23 | maxNodeCorpusCount = 50000
24 | minNodeCorpusCount = 1
25 | useRelationTaxonomy = True
26 |
27 | #for the coarse-grained type system
28 | minTypeSize = 3000
29 | maxNumberOfTypes = 100
30 |
31 | #for the type filter
32 | minTypePairFreq = 1
33 |
34 | noNodeCorpusCounts = False
35 |
36 |
37 | #RelexConfig
38 | #NOTE(review): the key names targetNegativeToPositveRatio and datasetSpitFractions look misspelled ('Positve', 'Spit'); they are presumably matched by name when the config is loaded, so do not rename them here without checking the reader
39 | documentSampleFraction = 1.0
40 | negativeExampleSampleFraction = 0.05
41 | targetNegativeToPositveRatio = -1
42 | directionStyle = bothWays
43 | titleContext = False
44 | sectionContext = False
45 | limitEntitiesToGroundTruth = False
46 | gtTypes = False
47 | vocabLimit = 2000000
48 | vocabMinCount = 2
49 | minMentionSet = 1
50 | maxMentionSet = 100
51 | maxMentionGroups = 5
52 | maxPositionEmbeddings = 80
53 | typeStyle = single
54 | datasetSplitNames = [train, validate, test]
55 | datasetSpitFractions = [0.8, 0.1, 0.1]
56 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter
--------------------------------------------------------------------------------
/createSmall-de.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #stop at first error, unset variables are errors
4 | set -o nounset
5 | set -o errexit
6 |
7 | if [ "$#" -ne 1 ]; then
8 | echo "Please supply a single argument, the directory to save the dataset"
9 |     exit 1
10 | fi
11 | #quote $1 so a dataset directory containing spaces is passed as a single argument
12 | ./create.sh "$1" configSmall-de.properties
13 |
--------------------------------------------------------------------------------
/createSmall.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #usage: ./createSmall.sh <dataset-dir>  -- runs ./create.sh with configSmall.properties
3 | #stop at first error, unset variables are errors
4 | set -o nounset
5 | set -o errexit
6 |
7 | if [ "$#" -ne 1 ]; then
8 |     echo "Please supply a single argument, the directory to save the dataset"
9 |     exit 1
10 | fi
11 | #quote $1 so a dataset directory containing spaces is passed as a single argument
12 | ./create.sh "$1" configSmall.properties
--------------------------------------------------------------------------------
/unaryConfig.properties:
--------------------------------------------------------------------------------
1 | #RelexConfig
2 |
3 | documentSampleFraction = 1.0
4 | negativeExampleSampleFraction = 1.0
5 | targetNegativeToPositveRatio = -1
6 | vocabLimit = 2000000
7 | vocabMinCount = 2
8 | minMentionSet = 1
9 | maxMentionSet = 100
10 | maxMentionGroups = 5
11 | maxPositionEmbeddings = 80
12 | typeStyle = single
13 | datasetSplitNames = [train, validate, test]
14 | datasetSpitFractions = [0.8, 0.1, 0.1]
15 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.unary.DownsampleEntityFilter
16 | relexManagerClass = com.ibm.research.ai.ki.kbp.unary.RelexDatasetManagerUnary
--------------------------------------------------------------------------------
/unaryCreate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #stop at first error, unset variables are errors
4 | set -o nounset
5 | set -o errexit
6 |
7 | scriptDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
8 | #NOTE(review): scriptDir is not referenced again in this script; the jar path below is relative to the current working directory
9 | # Base directory to save the cc-dbp dataset in
10 | baseDir=$1
11 | # Configuration file to use
12 | config=${2:-unaryConfig.properties}
13 |
14 | # baseline context set construction
15 | java -Xmx8G -cp com.ibm.research.ai.ki.kbp/target/kbp-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
16 |
com.ibm.research.ai.ki.kbp.KBPBuildDataset -unaryConfig "$config" -in "$baseDir/docs-gaz.json.gz.b64" -out "$baseDir/dataset" -kb "$baseDir/kb"
17 | # (expansions are quoted so that a base directory or config path containing spaces stays one argument)
18 | # show sample of positive context sets: rows whose 6th tab-separated field is non-empty
19 | awk -F $'\t' '$6!=""' "$baseDir/dataset/unaryContextSets/contexts-part0.tsv" | head
20 |
--------------------------------------------------------------------------------