├── .gitignore
├── LICENSE
├── README.md
├── boilerpipe
├── boilerpipe-common
│ ├── LICENSE
│ ├── README.md
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── com
│ │ └── kohlschutter
│ │ └── boilerpipe
│ │ ├── BoilerpipeDocumentSource.java
│ │ ├── BoilerpipeExtractor.java
│ │ ├── BoilerpipeFilter.java
│ │ ├── BoilerpipeInput.java
│ │ ├── BoilerpipeProcessingException.java
│ │ ├── conditions
│ │ └── TextBlockCondition.java
│ │ ├── document
│ │ ├── BPAnnotation.java
│ │ ├── HeaderAnnotation.java
│ │ ├── Image.java
│ │ ├── Link.java
│ │ ├── ParagraphAnnotation.java
│ │ ├── TextBlock.java
│ │ ├── TextDocument.java
│ │ ├── TextDocumentStatistics.java
│ │ ├── TextFormatAnnotation.java
│ │ └── package-info.java
│ │ ├── estimators
│ │ └── SimpleEstimator.java
│ │ ├── extractors
│ │ ├── ArticleExtractor.java
│ │ ├── ArticleSentencesExtractor.java
│ │ ├── CanolaExtractor.java
│ │ ├── CommonExtractors.java
│ │ ├── DefaultExtractor.java
│ │ ├── ExtractorBase.java
│ │ ├── KeepEverythingExtractor.java
│ │ ├── KeepEverythingWithMinKWordsExtractor.java
│ │ ├── LargestContentExtractor.java
│ │ ├── NumWordsRulesExtractor.java
│ │ └── package-info.java
│ │ ├── filters
│ │ ├── debug
│ │ │ └── PrintDebugFilter.java
│ │ ├── english
│ │ │ ├── DensityRulesClassifier.java
│ │ │ ├── HeuristicFilterBase.java
│ │ │ ├── IgnoreBlocksAfterContentFilter.java
│ │ │ ├── IgnoreBlocksAfterContentFromEndFilter.java
│ │ │ ├── KeepLargestFulltextBlockFilter.java
│ │ │ ├── MinFulltextWordsFilter.java
│ │ │ ├── NumWordsRulesClassifier.java
│ │ │ ├── TerminatingBlocksFinder.java
│ │ │ └── package-info.java
│ │ ├── heuristics
│ │ │ ├── AddPrecedingLabelsFilter.java
│ │ │ ├── ArticleMetadataFilter.java
│ │ │ ├── BlockProximityFusion.java
│ │ │ ├── ContentFusion.java
│ │ │ ├── DocumentTitleMatchClassifier.java
│ │ │ ├── ExpandTitleToContentFilter.java
│ │ │ ├── KeepLargestBlockFilter.java
│ │ │ ├── LabelFusion.java
│ │ │ ├── LargeBlockSameTagLevelToContentFilter.java
│ │ │ ├── ListAtEndFilter.java
│ │ │ ├── SimpleBlockFusionProcessor.java
│ │ │ ├── TrailingHeadlineToBoilerplateFilter.java
│ │ │ └── package-info.java
│ │ └── simple
│ │ │ ├── BoilerplateBlockFilter.java
│ │ │ ├── InvertedFilter.java
│ │ │ ├── LabelToBoilerplateFilter.java
│ │ │ ├── LabelToContentFilter.java
│ │ │ ├── MarkEverythingBoilerplateFilter.java
│ │ │ ├── MarkEverythingContentFilter.java
│ │ │ ├── MinClauseWordsFilter.java
│ │ │ ├── MinWordsFilter.java
│ │ │ ├── SplitParagraphBlocksFilter.java
│ │ │ ├── SurroundingToContentFilter.java
│ │ │ └── package-info.java
│ │ ├── labels
│ │ ├── ConditionalLabelAction.java
│ │ ├── DefaultLabels.java
│ │ └── LabelAction.java
│ │ ├── package-info.java
│ │ ├── sax
│ │ ├── BoilerpipeHTMLContentHandler.java
│ │ ├── BoilerpipeHTMLParser.java
│ │ ├── BoilerpipeSAXInput.java
│ │ ├── CommonTagActions.java
│ │ ├── DefaultTagActionMap.java
│ │ ├── HTMLDocument.java
│ │ ├── HTMLFetcher.java
│ │ ├── HTMLHighlighter.java
│ │ ├── ImageExtractor.java
│ │ ├── InputSourceable.java
│ │ ├── MarkupTagAction.java
│ │ ├── TagAction.java
│ │ ├── TagActionMap.java
│ │ └── package-info.java
│ │ └── util
│ │ ├── UnicodeTokenizer.java
│ │ └── package-info.java
└── nekohtml
│ ├── dependency-reduced-pom.xml
│ ├── pom.xml
│ └── src
│ └── main
│ └── java
│ └── org
│ └── cyberneko
│ └── html
│ ├── HTMLElements.java
│ └── HTMLTagBalancer.java
├── com.ibm.research.ai.ki.corpus
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── corpora
│ │ └── crawl
│ │ ├── CharsetDetect.java
│ │ ├── CommonCrawlConfig.java
│ │ ├── HtmlToDocument.java
│ │ ├── LanguageScorer.java
│ │ ├── SaveCommonCrawl.java
│ │ ├── SaveCommonCrawlBase.java
│ │ └── SaveCommonCrawlHdfs.java
│ └── resources
│ ├── cc-dbp
│ └── cc-dbp.properties
│ ├── log4j.properties
│ └── simplelogger.properties
├── com.ibm.research.ai.ki.kb
├── README.md
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── kb
│ │ ├── BuildGazetteer.java
│ │ ├── BuildGroundTruth.java
│ │ ├── ConfigureMinMaxEntityFreq.java
│ │ ├── FindUnary.java
│ │ ├── GroundTruthConfig.java
│ │ ├── KBConfig.java
│ │ ├── KBFiles.java
│ │ ├── NodePopularity.java
│ │ ├── RelationTaxonomy.java
│ │ ├── SelectTypes.java
│ │ ├── TypePairFilter.java
│ │ ├── conversion
│ │ ├── ConvertDBpedia.java
│ │ ├── DBpediaKBConfig.java
│ │ ├── MergeNodesDBpedia.java
│ │ ├── SelectRelations.java
│ │ └── SummaryCharts.java
│ │ └── explore
│ │ ├── CheckLabelCollisions.java
│ │ └── FilterByCorpusCount.java
│ └── resources
│ ├── dbpediaConfig.properties
│ └── relationSample.txt
├── com.ibm.research.ai.ki.kbp
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── kbp
│ │ ├── CoveredTextEntityId.java
│ │ ├── CreateTsvDataset.java
│ │ ├── CreateTsvDatasetTokenWindow.java
│ │ ├── DocumentFeatureString.java
│ │ ├── DocumentPreprocessing.java
│ │ ├── FilterEntsByGroundTruth.java
│ │ ├── GazetteerEDL.java
│ │ ├── GroundTruth.java
│ │ ├── GroupRelexMentionTsvDataset.java
│ │ ├── IEntityPairFilter.java
│ │ ├── IGroundTruth.java
│ │ ├── IPostprocessEntityRecognition.java
│ │ ├── IRelexDatasetManager.java
│ │ ├── IRelexMention.java
│ │ ├── IRelexTensors.java
│ │ ├── IRelexTsv.java
│ │ ├── KBPBuildDataset.java
│ │ ├── NounPhraseEntityWithId.java
│ │ ├── RelexConfig.java
│ │ ├── RelexDatasetFiles.java
│ │ ├── RelexDatasetManagerBinary.java
│ │ ├── RelexMention.java
│ │ ├── RelexMentionReader.java
│ │ ├── RelexStats.java
│ │ ├── RelexTensors.java
│ │ ├── RelexVocab.java
│ │ ├── ShowExamples.java
│ │ ├── Tokenizer.java
│ │ ├── TypePairEntityPairFilter.java
│ │ ├── baselines
│ │ └── NREConvert.java
│ │ ├── embeddings
│ │ ├── EmbeddingFormat.java
│ │ └── Word2VecConverter.java
│ │ └── unary
│ │ ├── DownsampleEntityFilter.java
│ │ ├── IEntityFilter.java
│ │ ├── RelexDatasetManagerUnary.java
│ │ ├── UnaryGroundTruth.java
│ │ ├── UnaryRelexMention.java
│ │ ├── UnaryRelexTensors.java
│ │ └── UnaryRelexTsvDataset.java
│ └── resources
│ └── relexConfigNonSpark.properties
├── com.ibm.research.ai.ki.nlp
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── ibm
│ │ │ ├── reseach
│ │ │ └── ai
│ │ │ │ └── ki
│ │ │ │ └── nlp
│ │ │ │ ├── AnnoRef.java
│ │ │ │ ├── Annotation.java
│ │ │ │ ├── Annotator.java
│ │ │ │ ├── Document.java
│ │ │ │ ├── DocumentJSONDeserializer.java
│ │ │ │ ├── DocumentJSONSerializer.java
│ │ │ │ ├── DocumentReader.java
│ │ │ │ ├── DocumentSerialize.java
│ │ │ │ ├── DocumentStructure.java
│ │ │ │ ├── DocumentWriter.java
│ │ │ │ ├── DocumentWriter2.java
│ │ │ │ ├── OffsetCorrection.java
│ │ │ │ ├── Pipeline.java
│ │ │ │ ├── PipelinedDocuments.java
│ │ │ │ ├── ResettingAnnotator.java
│ │ │ │ ├── TransformBase.java
│ │ │ │ ├── TransformRegex.java
│ │ │ │ ├── TransformString.java
│ │ │ │ ├── conversion
│ │ │ │ └── NIFSerialization.java
│ │ │ │ └── types
│ │ │ │ ├── Author.java
│ │ │ │ ├── Categories.java
│ │ │ │ ├── Chunk.java
│ │ │ │ ├── CorefIndex.java
│ │ │ │ ├── DocDate.java
│ │ │ │ ├── DocRelations.java
│ │ │ │ ├── DocumentContentType.java
│ │ │ │ ├── DocumentSource.java
│ │ │ │ ├── Entity.java
│ │ │ │ ├── EntityWithId.java
│ │ │ │ ├── Event.java
│ │ │ │ ├── LinkAnnotation.java
│ │ │ │ ├── LinkedEntity.java
│ │ │ │ ├── ListAnnotation.java
│ │ │ │ ├── ListItem.java
│ │ │ │ ├── Paragraph.java
│ │ │ │ ├── Relation.java
│ │ │ │ ├── Section.java
│ │ │ │ ├── SectionHeader.java
│ │ │ │ ├── Sentence.java
│ │ │ │ ├── TextFormatting.java
│ │ │ │ ├── Title.java
│ │ │ │ ├── Token.java
│ │ │ │ └── XmlTag.java
│ │ │ └── research
│ │ │ └── ai
│ │ │ └── ki
│ │ │ └── nlp
│ │ │ └── parse
│ │ │ ├── ClearNLPNER.java
│ │ │ ├── ClearNLPPOS.java
│ │ │ ├── ClearNLPParse.java
│ │ │ ├── ClearNLPSentence.java
│ │ │ ├── ClearNLPTokenize.java
│ │ │ ├── ClearNLPTransform.java
│ │ │ ├── DigitSequenceTokenize.java
│ │ │ ├── EntityToOccurrences.java
│ │ │ ├── GazetteerMatcher.java
│ │ │ ├── NormalizeTextTransform.java
│ │ │ ├── OpenNLPChunk.java
│ │ │ ├── OpenNLPNER.java
│ │ │ ├── OpenNLPPOS.java
│ │ │ ├── OpenNLPSentence.java
│ │ │ ├── OpenNLPTokenize.java
│ │ │ ├── RegexParagraph.java
│ │ │ ├── RegexTokenize.java
│ │ │ └── TokensSnapToEntities.java
│ └── resources
│ │ ├── com
│ │ └── ibm
│ │ │ └── research
│ │ │ └── ai
│ │ │ └── ki
│ │ │ └── nlp
│ │ │ └── parse
│ │ │ ├── clearNLP-replace.tsv
│ │ │ └── normalizeText-replace.tsv
│ │ ├── downloadOpenNLPModels.sh
│ │ ├── en-sent.bin
│ │ └── log4j.properties
│ └── test
│ └── java
│ └── com
│ └── ibm
│ └── research
│ └── ai
│ └── ki
│ └── nlp
│ ├── OverlappingSpansTest.java
│ ├── TestJSON.java
│ ├── TransformStringTest.java
│ └── parse
│ ├── TestClearNLP.java
│ ├── TestGazetteerMatcher.java
│ └── TestNER.java
├── com.ibm.research.ai.ki.spark
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── spark
│ │ ├── Base64ToBinary.java
│ │ ├── CorpusStatistics.java
│ │ ├── CreateW2VFile.java
│ │ ├── DocEntityStats.java
│ │ ├── GatherRelexStats.java
│ │ ├── GatherRelexVocab.java
│ │ ├── GazetteerPreprocess.java
│ │ ├── NonSparkGatherVocab.java
│ │ ├── RelexBuildDataset.java
│ │ ├── RelexTensorDataset.java
│ │ ├── RelexTsvDataset.java
│ │ ├── RunPipelineSpark.java
│ │ └── SimpleSparkJob.java
│ ├── resources
│ └── relexConfig.properties
│ └── scripts
│ ├── java-viacloud
│ └── java-viaspark
├── com.ibm.research.ai.ki.util
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── ibm
│ │ │ └── research
│ │ │ └── ai
│ │ │ └── ki
│ │ │ ├── formats
│ │ │ ├── ArchiveEntryIterable.java
│ │ │ ├── NTriples.java
│ │ │ └── SimpleTsvIterable.java
│ │ │ └── util
│ │ │ ├── BlockShuffler.java
│ │ │ ├── CollectionUtil.java
│ │ │ ├── CombinedSpans.java
│ │ │ ├── DenseVectors.java
│ │ │ ├── Distribution.java
│ │ │ ├── FileUtil.java
│ │ │ ├── FirstPairComparator.java
│ │ │ ├── HashMapUtil.java
│ │ │ ├── Lang.java
│ │ │ ├── LogLinear.java
│ │ │ ├── MutableDouble.java
│ │ │ ├── MutableInteger.java
│ │ │ ├── NBest.java
│ │ │ ├── NestedIterable.java
│ │ │ ├── NextOnlyIterator.java
│ │ │ ├── NonOverlappingSpans.java
│ │ │ ├── OverlappingSpans.java
│ │ │ ├── Pair.java
│ │ │ ├── PeriodicChecker.java
│ │ │ ├── PropertyLoader.java
│ │ │ ├── PropertyStruct.java
│ │ │ ├── RandomUtil.java
│ │ │ ├── SecondPairComparator.java
│ │ │ ├── Span.java
│ │ │ ├── SparseVectors.java
│ │ │ ├── ThreadedLoopIterator.java
│ │ │ ├── Warnings.java
│ │ │ ├── eval
│ │ │ ├── BootstrappingConfidenceInterval.java
│ │ │ ├── MultiPrecisionRecall.java
│ │ │ ├── PrecisionRecall.java
│ │ │ └── SamplingPermutationTest.java
│ │ │ ├── graphs
│ │ │ ├── GraphAlgorithms.java
│ │ │ ├── SnowballSampler.java
│ │ │ └── TreeAlgorithms.java
│ │ │ ├── io
│ │ │ ├── DataIO.java
│ │ │ ├── MultiFileWriter.java
│ │ │ ├── OldVersionOf.java
│ │ │ ├── RefactoringObjectInputStream.java
│ │ │ ├── TensorFileReader.java
│ │ │ └── TensorFileWriter.java
│ │ │ └── parallel
│ │ │ ├── BlockingThreadedExecutor.java
│ │ │ ├── ISimpleExecutor.java
│ │ │ ├── PollingThreadedExecutor.java
│ │ │ ├── SingleThreadedExecutor.java
│ │ │ └── StreamEater.java
│ └── resources
│ │ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── util
│ │ └── serializedMappings.properties
│ └── test
│ ├── java
│ └── com
│ │ └── ibm
│ │ └── research
│ │ └── ai
│ │ └── ki
│ │ └── util
│ │ ├── BjUtilTestCounter.java
│ │ ├── ExecuteJavaProc.java
│ │ ├── FileIteratorTest.java
│ │ ├── FileUtilTest.java
│ │ ├── HashMapUtilTest.java
│ │ ├── LangTest.java
│ │ ├── NBestTest.java
│ │ ├── NonOverlappingTest.java
│ │ ├── OverlappingSpansTest.java
│ │ ├── PrecisionRecallTest.java
│ │ ├── PropertyLoaderTest.java
│ │ ├── RandomUtilTest.java
│ │ ├── SpanTest.java
│ │ ├── SparseVectorsTest.java
│ │ └── TestTreeAlgorithms.java
│ └── resources
│ └── com
│ └── ibm
│ └── research
│ └── ai
│ └── ki
│ └── util
│ └── 1.properties
├── config.properties
├── configSmall-de.properties
├── configSmall.properties
├── create.sh
├── createSmall-de.sh
├── createSmall.sh
├── pom.xml
├── unaryConfig.properties
└── unaryCreate.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings
4 | **/.classpath
5 | **/.project
6 | **/.settings
7 | target/**
8 | */target/**
9 | */target/*
10 | .metadata
11 | clientdb.xml
12 | release.properties
13 | pom.xml.releaseBackup
14 | *~
15 |
16 |
17 | # User specified git ignore directories (works recursively).
18 | *.DS_Store
19 | .metadata
20 | .recommenders
21 |
22 | .idea/
23 | **/.idea/
24 |
25 |
26 | # Byte-compiled / optimized / DLL files
27 | __pycache__/
28 | *.py[cod]
29 | *$py.class
30 |
31 | # C extensions
32 | *.so
33 |
34 | # Distribution / packaging
35 | .Python
36 | env/
37 | build/
38 | develop-eggs/
39 | dist/
40 | downloads/
41 | eggs/
42 | .eggs/
43 | lib/
44 | lib64/
45 | parts/
46 | sdist/
47 | var/
48 | *.egg-info/
49 | .installed.cfg
50 | *.egg
51 |
52 | # PyInstaller
53 | # Usually these files are written by a python script from a template
54 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
55 | *.manifest
56 | *.spec
57 |
58 | # Installer logs
59 | pip-log.txt
60 | pip-delete-this-directory.txt
61 |
62 | # Unit test / coverage reports
63 | htmlcov/
64 | .tox/
65 | .coverage
66 | .coverage.*
67 | .cache
68 | nosetests.xml
69 | coverage.xml
70 | *,cover
71 | .hypothesis/
72 |
73 | # Translations
74 | *.mo
75 | *.pot
76 |
77 | # Django stuff:
78 | *.log
79 | local_settings.py
80 |
81 | # Flask stuff:
82 | instance/
83 | .webassets-cache
84 |
85 | # Scrapy stuff:
86 | .scrapy
87 |
88 | # Sphinx documentation
89 | docs/_build/
90 |
91 | # PyBuilder
92 | target/
93 |
94 | # IPython Notebook
95 | .ipynb_checkpoints
96 |
97 | # pyenv
98 | .python-version
99 |
100 | # celery beat schedule file
101 | celerybeat-schedule
102 |
103 | # dotenv
104 | .env
105 |
106 | # virtualenv
107 | venv/
108 | ENV/
109 |
110 | # Spyder project settings
111 | .spyderproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/LICENSE:
--------------------------------------------------------------------------------
1 | boilerpipe
2 |
3 | Copyright (c) 2009, 2014 Christian Kohlschütter
4 |
5 | The author licenses this file to You under the Apache License, Version 2.0
6 | (the "License"); you may not use this file except in compliance with
7 | the License. You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/README.md:
--------------------------------------------------------------------------------
1 | Fork of boilerpipe from https://github.com/kohlschutter/boilerpipe.
2 |
3 | This version produces offset annotations for links in the extracted TextBlocks.
4 | It also places a double newline between disconnected text blocks to help with paragraph and sentence segmentation.
5 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 |
7 | com.ibm.research.ai.ki
8 | cc-dbp-parent-pom
9 | 1.0.0-SNAPSHOT
10 | ../..
11 |
12 |
13 | boilerpipe-common
14 | 1.0.0-SNAPSHOT
15 |
16 |
17 |
18 | com.ibm.research.ai.ki
19 | nekohtml
20 | 1.9.13-SNAPSHOT
21 |
22 |
23 |
24 | xerces
25 | xercesImpl
26 | 2.12.0
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeDocumentSource.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 |
22 | /**
23 | * Something that can be represented as a {@link TextDocument}; {@link #toTextDocument()} yields that representation.
24 | */
25 | public interface BoilerpipeDocumentSource {
26 | TextDocument toTextDocument() throws BoilerpipeProcessingException;
27 | }
28 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 |
22 | /**
23 | * A generic processing step: takes a {@link TextDocument}, processes it and reports whether it changed it.
24 | */
25 | public interface BoilerpipeFilter {
26 | /**
27 | * Processes the given document {@code doc}.
28 | *
29 | * @param doc The {@link TextDocument} that is to be processed.
30 | * @return true if changes have been made to the {@link TextDocument}.
31 | * @throws BoilerpipeProcessingException if the implementation fails to process the document.
32 | */
33 | boolean process(final TextDocument doc) throws BoilerpipeProcessingException;
34 | }
35 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeInput.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 |
22 | /**
23 | * A source that returns {@link TextDocument}s.
24 | */
25 | public interface BoilerpipeInput {
26 | /**
27 | * Returns a {@link TextDocument}; how it is obtained is left to the implementation.
28 | *
29 | * @return A {@link TextDocument}.
30 | * @throws BoilerpipeProcessingException if the implementation fails to provide the document.
31 | */
32 | TextDocument getTextDocument() throws BoilerpipeProcessingException;
33 | }
34 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeProcessingException.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe;
19 |
20 | /**
21 | * Checked exception that signals a failure somewhere in the processing pipeline.
22 | */
23 | public class BoilerpipeProcessingException extends Exception {
24 | private static final long serialVersionUID = 1L;
25 |
26 | /** Creates an exception that carries neither a detail message nor a cause. */
27 | public BoilerpipeProcessingException() {
28 | // The implicit super() call is all that is needed here.
29 | }
30 |
31 | /** Creates an exception with the given detail message. */
32 | public BoilerpipeProcessingException(String message) {
33 | super(message);
34 | }
35 |
36 | /** Creates an exception that wraps the given cause. */
37 | public BoilerpipeProcessingException(Throwable cause) {
38 | super(cause);
39 | }
40 |
41 | /** Creates an exception with the given detail message and cause. */
42 | public BoilerpipeProcessingException(String message, Throwable cause) {
43 | super(message, cause);
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/conditions/TextBlockCondition.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.conditions;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 | import com.kohlschutter.boilerpipe.labels.ConditionalLabelAction;
22 |
23 | /**
24 | * Evaluates whether a given {@link TextBlock} meets a certain condition.
25 | *
26 | * Useful in combination with {@link ConditionalLabelAction}.
27 | */
28 | public interface TextBlockCondition {
29 | /**
30 | * Returns true iff the given {@link TextBlock} {@code tb} meets the defined condition.
31 | *
32 | * @param tb The {@link TextBlock} to evaluate.
33 | * @return true iff the condition is met.
34 | */
35 | boolean meetsCondition(final TextBlock tb);
36 | }
37 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/BPAnnotation.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 |
3 | /**
4 | * Used to represent structured elements of the html page that will be retained as offset annotations on the document.
5 | * <p>
6 | * A fresh annotation has {@code start} greater than {@code end}, so {@link #isValid()} is false
7 | * until both offsets have been assigned.
8 | *
9 | * @author mrglass
10 | */
11 | public abstract class BPAnnotation implements Cloneable {
12 | public static final boolean debug = false;
13 |
14 | //CONSIDER: tag type? like 'a' or 'h1' or 'b'
15 | /** Start offset of the annotation. */
16 | public int start;
17 | /** End offset of the annotation. */
18 | public int end;
19 |
20 | /** Element name, lower-cased independently of the default locale. */
21 | public final String localName;
22 |
23 | /**
24 | * Creates an annotation that has no usable span yet.
25 | *
26 | * @param localName element name; must not be null
27 | */
28 | protected BPAnnotation(String localName) {
29 | // Sentinels with start > end: the annotation is invalid until real offsets are set.
30 | this.start = 10000000;
31 | this.end = -10000000;
32 | // Locale.ROOT: under e.g. a Turkish default locale "TITLE" would otherwise become "tıtle".
33 | this.localName = localName.toLowerCase(java.util.Locale.ROOT);
34 | }
35 |
36 | /**
37 | * @return true if {@code end} is greater than {@code start}
38 | */
39 | public boolean isValid() {
40 | return end > start;
41 | }
42 |
43 | /**
44 | * Shifts both offsets by the same amount.
45 | *
46 | * @param offset value added to {@code start} and {@code end}; may be negative
47 | */
48 | public void addOffset(int offset) {
49 | this.start += offset;
50 | this.end += offset;
51 | }
52 |
53 | /**
54 | * @return a shallow copy of this annotation
55 | */
56 | @Override
57 | public BPAnnotation clone() {
58 | try {
59 | return (BPAnnotation) super.clone();
60 | } catch (CloneNotSupportedException e) {
61 | // Cannot happen: this class implements Cloneable.
62 | throw new AssertionError(e);
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/HeaderAnnotation.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 |
3 | /**
4 | * Offset annotation for an HTML h* header element.
5 | * The element name given to the constructor is passed on to {@link BPAnnotation}.
6 | * @author mrglass
7 | */
8 | public class HeaderAnnotation extends BPAnnotation {
9 | public HeaderAnnotation(String localName) {
10 | super(localName);
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/Link.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 |
3 | /**
4 | * HTML anchor tag as offset annotation.
5 | *
6 | * @author mrglass
7 | */
8 | public class Link extends BPAnnotation {
9 | /** Target of the anchor; a link without one is never {@link #isValid() valid}. */
10 | public String href;
11 |
12 | /**
13 | * Creates an annotation for an "a" element.
14 | *
15 | * @param href target of the anchor; may be null, in which case the link stays invalid unless
16 | * the field is assigned later
17 | */
18 | public Link(String href) {
19 | super("a");
20 | this.href = href;
21 | }
22 |
23 | /**
24 | * A link needs a valid span (see {@link BPAnnotation#isValid()}) and a non-null {@link #href}.
25 | *
26 | * @return true if both conditions hold
27 | */
28 | @Override
29 | public boolean isValid() {
30 | return super.isValid() && href != null;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/ParagraphAnnotation.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 |
3 | /**
4 | * Offset annotation for an HTML paragraph tag.
5 | * Always hands the element name "p" to {@link BPAnnotation}.
6 | * @author mrglass
7 | */
8 | public class ParagraphAnnotation extends BPAnnotation {
9 | public ParagraphAnnotation() {
10 | super("p");
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextDocumentStatistics.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.document;
19 |
20 | /**
21 | * Provides shallow statistics on a given {@link TextDocument}
22 | */
23 | public final class TextDocumentStatistics {
24 | private int numWords = 0;
25 | private int numBlocks = 0;
26 |
27 | /**
28 | * Computes statistics on a given {@link TextDocument}.
29 | *
30 | * @param doc The {@link TextDocument}.
31 | * @param contentOnly if true then o
32 | */
33 | public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) {
34 | for (TextBlock tb : doc.getTextBlocks()) {
35 | if (contentOnly && !tb.isContent()) {
36 | continue;
37 | }
38 |
39 | numWords += tb.getNumWords();
40 | numBlocks++;
41 | }
42 | }
43 |
44 | /**
45 | * Returns the average number of words at block-level (= overall number of words divided by the
46 | * number of blocks).
47 | *
48 | * @return Average
49 | */
50 | public float avgNumWords() {
51 | return numWords / (float) numBlocks;
52 | }
53 |
54 | /**
55 | * Returns the overall number of words in all blocks.
56 | *
57 | * @return Sum
58 | */
59 | public int getNumWords() {
60 | return numWords;
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextFormatAnnotation.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 | /** {@link BPAnnotation} subclass, presumably for text-formatting elements; the constructor passes the element name on to the superclass. */
3 | public class TextFormatAnnotation extends BPAnnotation {
4 | public TextFormatAnnotation(String localName) {
5 | super(localName);
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * The Boilerpipe document model.
3 | */
4 | package com.kohlschutter.boilerpipe.document;
5 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/estimators/SimpleEstimator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.estimators;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor;
21 | import com.kohlschutter.boilerpipe.document.TextDocumentStatistics;
22 | import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
23 | import com.kohlschutter.boilerpipe.extractors.DefaultExtractor;
24 |
25 | /**
26 | * Judges how well a {@link BoilerpipeExtractor} did on a document, based on word statistics.
27 | */
28 | public final class SimpleEstimator {
29 |
30 | /**
31 | * The singleton instance of {@link SimpleEstimator}.
32 | */
33 | public static final SimpleEstimator INSTANCE = new SimpleEstimator();
34 |
35 | private SimpleEstimator() {
36 | }
37 |
38 | /**
39 | * Decides, from the document statistics taken before and after a {@link BoilerpipeExtractor}
40 | * ran, whether the extraction quality has to be regarded as (too) low.
41 | *
42 | * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others.
43 | *
44 | * @param dsBefore statistics of the document before extraction
45 | * @param dsAfter statistics of the document after extraction
46 | * @return true if low quality is to be expected.
47 | */
48 | public boolean isLowQuality(final TextDocumentStatistics dsBefore,
49 | final TextDocumentStatistics dsAfter) {
50 | // Too few words overall, either going in or coming out.
51 | final boolean tooFewWords = dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70;
52 | // The average is only consulted when both word counts are acceptable.
53 | return tooFewWords || dsAfter.avgNumWords() < 25;
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/ArticleSentencesExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.simple.MinClauseWordsFilter;
23 | import com.kohlschutter.boilerpipe.filters.simple.SplitParagraphBlocksFilter;
24 |
25 | /**
26 | * A full-text extractor which is tuned towards extracting sentences from news articles.
27 | */
28 | public final class ArticleSentencesExtractor extends ExtractorBase {
29 | public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor();
30 |
31 | /**
32 | * Returns the singleton instance for {@link ArticleSentencesExtractor}.
33 | */
34 | public static ArticleSentencesExtractor getInstance() {
35 | return INSTANCE;
36 | }
37 |
38 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
39 | return
40 |
41 | ArticleExtractor.INSTANCE.process(doc) | SplitParagraphBlocksFilter.INSTANCE.process(doc)
42 | | MinClauseWordsFilter.INSTANCE.process(doc);
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/CommonExtractors.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor;
21 |
22 | /**
23 | * Provides quick access to common {@link BoilerpipeExtractor}s.
24 | */
25 | public final class CommonExtractors {
26 | private CommonExtractors() {
27 | }
28 |
29 | /**
30 | * Works very well for most types of Article-like HTML.
31 | */
32 | public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE;
33 |
34 | /**
35 | * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics.
36 | */
37 | public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE;
38 |
39 | /**
40 | * Like {@link DefaultExtractor}, but keeps the largest text block only.
41 | */
42 | public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR =
43 | LargestContentExtractor.INSTANCE;
44 |
45 | /**
46 | * Trained on krdwrd Canola (different definition of "boilerplate"). You may give it a try.
47 | */
48 | public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE;
49 |
50 | /**
51 | * Dummy Extractor; should return the input text. Use this to double-check that your problem is
52 | * within a particular {@link BoilerpipeExtractor}, or somewhere else.
53 | */
54 | public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR =
55 | KeepEverythingExtractor.INSTANCE;
56 | }
57 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/DefaultExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.DensityRulesClassifier;
23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion;
24 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
25 |
26 | /**
27 | * A quite generic full-text extractor.
28 | */
29 | public class DefaultExtractor extends ExtractorBase {
30 | public static final DefaultExtractor INSTANCE = new DefaultExtractor();
31 |
32 | /**
33 | * Returns the singleton instance for {@link DefaultExtractor}.
34 | */
35 | public static DefaultExtractor getInstance() {
36 | return INSTANCE;
37 | }
38 |
39 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
40 |
41 | return
42 |
43 | SimpleBlockFusionProcessor.INSTANCE.process(doc)
44 | | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
45 | | DensityRulesClassifier.INSTANCE.process(doc);
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter;
23 |
24 | /**
25 | * Marks everything as content.
26 | */
27 | public final class KeepEverythingExtractor extends ExtractorBase {
28 |
29 | public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor();
30 |
31 | private KeepEverythingExtractor() {
32 |
33 | }
34 |
35 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
36 | return MarkEverythingContentFilter.INSTANCE.process(doc);
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
23 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter;
24 | import com.kohlschutter.boilerpipe.filters.simple.MinWordsFilter;
25 |
26 | /**
27 | * A full-text extractor which extracts the largest text component of a page. For news articles, it
28 | * may perform better than the {@link DefaultExtractor}, but usually worse than
29 | * {@link ArticleExtractor}.
30 | */
31 | public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase {
32 |
33 | private final MinWordsFilter filter;
34 |
35 | public KeepEverythingWithMinKWordsExtractor(final int kMin) {
36 | this.filter = new MinWordsFilter(kMin);
37 | }
38 |
39 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
40 | return SimpleBlockFusionProcessor.INSTANCE.process(doc)
41 | | MarkEverythingContentFilter.INSTANCE.process(doc) | filter.process(doc);
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/LargestContentExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier;
23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion;
24 | import com.kohlschutter.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
25 |
26 | /**
27 | * A full-text extractor which extracts the largest text component of a page. For news articles, it
28 | * may perform better than the {@link DefaultExtractor}, but usually worse than
29 | * {@link ArticleExtractor}.
30 | */
31 | public final class LargestContentExtractor extends ExtractorBase {
32 | public static final LargestContentExtractor INSTANCE = new LargestContentExtractor();
33 |
34 | private LargestContentExtractor() {
35 | }
36 |
37 | /**
38 | * Returns the singleton instance for {@link LargestContentExtractor}.
39 | */
40 | public static LargestContentExtractor getInstance() {
41 | return INSTANCE;
42 | }
43 |
44 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
45 | return NumWordsRulesClassifier.INSTANCE.process(doc)
46 | | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
47 | | KeepLargestBlockFilter.INSTANCE.process(doc);
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/NumWordsRulesExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.extractors;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier;
23 |
24 | /**
25 | * A quite generic full-text extractor solely based upon the number of words per block (the current,
26 | * the previous and the next block).
27 | */
28 | public class NumWordsRulesExtractor extends ExtractorBase {
29 | public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor();
30 |
31 | /**
32 | * Returns the singleton instance for {@link NumWordsRulesExtractor}.
33 | */
34 | public static NumWordsRulesExtractor getInstance() {
35 | return INSTANCE;
36 | }
37 |
38 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
39 |
40 | return NumWordsRulesClassifier.INSTANCE.process(doc);
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Some standard extractors (i.e., completely piped BoilerpipeFilters)
3 | */
4 | package com.kohlschutter.boilerpipe.extractors;
5 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/HeuristicFilterBase.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.english;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 |
22 | /**
23 | * Base class for some heuristics that are used by boilerpipe filters.
24 | */
25 | abstract class HeuristicFilterBase {
26 |
27 | protected static int getNumFullTextWords(final TextBlock tb) {
28 | return getNumFullTextWords(tb, 9);
29 | }
30 |
31 | protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
32 | if (tb.getTextDensity() >= minTextDensity) {
33 | return tb.getNumWords();
34 | } else {
35 | return 0;
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/MinFulltextWordsFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.english;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 |
25 | /**
26 | * Keeps only those content blocks which contain at least k full-text words (measured by
27 | * {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default.
28 | */
29 | public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter {
30 | public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(30);
31 | private final int minWords;
32 |
33 | public static MinFulltextWordsFilter getDefaultInstance() {
34 | return DEFAULT_INSTANCE;
35 | }
36 |
37 | public MinFulltextWordsFilter(final int minWords) {
38 | this.minWords = minWords;
39 | }
40 |
41 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
42 |
43 | boolean changes = false;
44 |
45 | for (TextBlock tb : doc.getTextBlocks()) {
46 | if (!tb.isContent()) {
47 | continue;
48 | }
49 | if (getNumFullTextWords(tb) < minWords) {
50 | tb.setIsContent(false);
51 | changes = true;
52 | }
53 |
54 | }
55 |
56 | return changes;
57 |
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * These BoilerpipeFilters have only been tested on English text.
3 | *
4 | * That is, they will probably work with other Western languages, but maybe need some parameter tuning to perform well.
5 | */
6 | package com.kohlschutter.boilerpipe.filters.english;
7 |
8 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/ListAtEndFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.heuristics;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels;
25 |
26 | /**
27 | * Marks nested list-item blocks after the end of the main content.
28 | */
29 | public final class ListAtEndFilter implements BoilerpipeFilter {
30 | public static final ListAtEndFilter INSTANCE = new ListAtEndFilter();
31 |
32 | private ListAtEndFilter() {
33 | }
34 |
35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
36 |
37 | boolean changes = false;
38 |
39 | int tagLevel = Integer.MAX_VALUE;
40 | for (TextBlock tb : doc.getTextBlocks()) {
41 | if (tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
42 | tagLevel = tb.getTagLevel();
43 | } else {
44 | if (tb.getTagLevel() > tagLevel && tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
45 | && tb.hasLabel(DefaultLabels.LI) && tb.getLinkDensity() == 0) {
46 | tb.setIsContent(true);
47 | changes = true;
48 | } else {
49 | tagLevel = Integer.MAX_VALUE;
50 | }
51 | }
52 | }
53 |
54 | return changes;
55 |
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.heuristics;
19 |
20 | import java.util.Iterator;
21 | import java.util.List;
22 |
23 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
24 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
25 | import com.kohlschutter.boilerpipe.document.TextBlock;
26 | import com.kohlschutter.boilerpipe.document.TextDocument;
27 |
28 | /**
29 | * Merges two subsequent blocks if their text densities are equal.
30 | */
31 | public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
32 | public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();
33 |
34 | /**
35 | * Returns the singleton instance for BlockFusionProcessor.
36 | */
37 | public static SimpleBlockFusionProcessor getInstance() {
38 | return INSTANCE;
39 | }
40 |
41 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
42 | List textBlocks = doc.getTextBlocks();
43 | boolean changes = false;
44 |
45 | if (textBlocks.size() < 2) {
46 | return false;
47 | }
48 |
49 | TextBlock b1 = textBlocks.get(0);
50 | for (Iterator it = textBlocks.listIterator(1); it.hasNext();) {
51 | TextBlock b2 = it.next();
52 |
53 | final boolean similar = (b1.getTextDensity() == b2.getTextDensity());
54 |
55 | if (similar) {
56 | b1.mergeNext(b2);
57 | it.remove();
58 | changes = true;
59 | } else {
60 | b1 = b2;
61 | }
62 | }
63 |
64 | return changes;
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * These BoilerpipeFilters are pure heuristics.
3 | */
4 | package com.kohlschutter.boilerpipe.filters.heuristics;
5 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/InvertedFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import java.util.List;
21 |
22 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
23 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
24 | import com.kohlschutter.boilerpipe.document.TextBlock;
25 | import com.kohlschutter.boilerpipe.document.TextDocument;
26 |
27 | /**
28 | * Reverts the "isContent" flag for all {@link TextBlock}s
29 | */
30 | public final class InvertedFilter implements BoilerpipeFilter {
31 | public static final InvertedFilter INSTANCE = new InvertedFilter();
32 |
33 | private InvertedFilter() {
34 | }
35 |
36 | public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
37 |
38 | List tbs = doc.getTextBlocks();
39 | if (tbs.isEmpty()) {
40 | return false;
41 | }
42 | for (TextBlock tb : tbs) {
43 | tb.setIsContent(!tb.isContent());
44 | }
45 |
46 | return true;
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToBoilerplateFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels;
25 |
26 | /**
27 | * Marks all blocks that contain a given label as "boilerplate".
28 | */
29 | public final class LabelToBoilerplateFilter implements BoilerpipeFilter {
30 | public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT =
31 | new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT);
32 |
33 | private String[] labels;
34 |
35 | public LabelToBoilerplateFilter(final String... label) {
36 | this.labels = label;
37 | }
38 |
39 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
40 |
41 | boolean changes = false;
42 |
43 | BLOCK_LOOP : for (TextBlock tb : doc.getTextBlocks()) {
44 | if (tb.isContent()) {
45 | for (String label : labels) {
46 | if (tb.hasLabel(label)) {
47 | tb.setIsContent(false);
48 | changes = true;
49 | continue BLOCK_LOOP;
50 | }
51 | }
52 | }
53 | }
54 |
55 | return changes;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToContentFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 |
25 | /**
26 | * Marks all blocks that contain a given label as "content".
27 | */
28 | public final class LabelToContentFilter implements BoilerpipeFilter {
29 | private String[] labels;
30 |
31 | public LabelToContentFilter(final String... label) {
32 | this.labels = label;
33 | }
34 |
35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
36 |
37 | boolean changes = false;
38 |
39 | BLOCK_LOOP : for (TextBlock tb : doc.getTextBlocks()) {
40 | if (!tb.isContent()) {
41 | for (String label : labels) {
42 | if (tb.hasLabel(label)) {
43 | tb.setIsContent(true);
44 | changes = true;
45 | continue BLOCK_LOOP;
46 | }
47 | }
48 | }
49 | }
50 |
51 | return changes;
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 |
25 | /**
26 | * Marks all blocks as boilerplate.
27 | */
28 | public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter {
29 | public static final MarkEverythingBoilerplateFilter INSTANCE =
30 | new MarkEverythingBoilerplateFilter();
31 |
32 | private MarkEverythingBoilerplateFilter() {
33 | }
34 |
35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
36 |
37 | boolean changes = false;
38 |
39 | for (TextBlock tb : doc.getTextBlocks()) {
40 | if (tb.isContent()) {
41 | tb.setIsContent(false);
42 | changes = true;
43 | }
44 | }
45 |
46 | return changes;
47 |
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingContentFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 |
25 | /**
26 | * Marks all blocks as content.
27 | */
28 | public final class MarkEverythingContentFilter implements BoilerpipeFilter {
29 | public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();
30 |
31 | private MarkEverythingContentFilter() {
32 | }
33 |
34 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
35 |
36 | boolean changes = false;
37 |
38 | for (TextBlock tb : doc.getTextBlocks()) {
39 | if (!tb.isContent()) {
40 | tb.setIsContent(true);
41 | changes = true;
42 | }
43 | }
44 |
45 | return changes;
46 |
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MinWordsFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 |
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 |
25 | /**
26 | * Keeps only those content blocks which contain at least k words.
27 | */
28 | public final class MinWordsFilter implements BoilerpipeFilter {
29 | private final int minWords;
30 |
31 | public MinWordsFilter(final int minWords) {
32 | this.minWords = minWords;
33 | }
34 |
35 | public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
36 |
37 | boolean changes = false;
38 |
39 | for (TextBlock tb : doc.getTextBlocks()) {
40 | if (!tb.isContent()) {
41 | continue;
42 | }
43 | if (tb.getNumWords() < minWords) {
44 | tb.setIsContent(false);
45 | changes = true;
46 | }
47 |
48 | }
49 |
50 | return changes;
51 |
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * These BoilerpipeFilters are straight-forward and probably not really specific to English.
3 | */
4 | package com.kohlschutter.boilerpipe.filters.simple;
5 |
6 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/ConditionalLabelAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.labels;
19 |
20 | import com.kohlschutter.boilerpipe.conditions.TextBlockCondition;
21 | import com.kohlschutter.boilerpipe.document.TextBlock;
22 |
23 | /**
24 | * Adds labels to a {@link TextBlock} if the given criteria are met.
25 | */
26 | public final class ConditionalLabelAction extends LabelAction {
27 |
28 | private final TextBlockCondition condition;
29 |
30 | public ConditionalLabelAction(TextBlockCondition condition, String... labels) {
31 | super(labels);
32 | this.condition = condition;
33 | }
34 |
35 | public void addTo(final TextBlock tb) {
36 | if (condition.meetsCondition(tb)) {
37 | addLabelsTo(tb);
38 | }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/DefaultLabels.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.labels;
19 |
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 |
/**
 * Catalogue of predefined label strings, meant to be passed to
 * {@link TextBlock#addLabel(String)} and {@link TextBlock#hasLabel(String)}.
 *
 * <p>
 * This class only holds constants and cannot be instantiated.
 */
public final class DefaultLabels {
  private DefaultLabels() {
    // constants holder; no instances
  }

  // Labels named after heading elements.
  public static final String HEADING = "de.l3s.boilerpipe/HEADING";
  public static final String H1 = "de.l3s.boilerpipe/H1";
  public static final String H2 = "de.l3s.boilerpipe/H2";
  public static final String H3 = "de.l3s.boilerpipe/H3";

  // Labels named after other structural elements.
  public static final String HR = "de.l3s.boilerpipe/HR";
  public static final String LI = "de.l3s.boilerpipe/LI";

  // Labels describing the role of a block.
  public static final String TITLE = "de.l3s.boilerpipe/TITLE";
  public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
  public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";

  // Labels expressing how likely a block is to be content.
  public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
  public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
  public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";

  // Prefix constant for markup labels; not used within this class.
  public static final String MARKUP_PREFIX = "<";
}
47 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/LabelAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.labels;
19 |
20 | import java.util.Arrays;
21 |
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 |
24 | /**
25 | * Helps adding labels to {@link TextBlock}s.
26 | *
27 | * @see ConditionalLabelAction
28 | */
29 | public class LabelAction {
30 | protected final String[] labels;
31 |
32 | public LabelAction(String... labels) {
33 | this.labels = labels;
34 | }
35 |
36 | public void addTo(final TextBlock tb) {
37 | addLabelsTo(tb);
38 | }
39 |
40 | protected final void addLabelsTo(final TextBlock tb) {
41 | tb.addLabels(labels);
42 | }
43 |
44 | public String toString() {
45 | return super.toString() + "{" + Arrays.asList(labels) + "}";
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * The Boilerpipe top-level package.
3 | */
4 | package com.kohlschutter.boilerpipe;
5 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/HTMLDocument.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.sax;
19 |
20 | import java.io.ByteArrayInputStream;
21 | import java.nio.charset.Charset;
22 |
23 | import org.xml.sax.InputSource;
24 |
25 | /**
26 | * An {@link InputSourceable} for {@link HTMLFetcher}.
27 | */
28 | public class HTMLDocument implements InputSourceable {
29 | private final Charset charset;
30 | private final byte[] data;
31 |
32 | public HTMLDocument(final byte[] data, final Charset charset) {
33 | this.data = data;
34 | this.charset = charset;
35 | }
36 |
37 | public HTMLDocument(final String data) {
38 | Charset cs = Charset.forName("utf-8");
39 | this.data = data.getBytes(cs);
40 | this.charset = cs;
41 | }
42 |
43 | public Charset getCharset() {
44 | return charset;
45 | }
46 |
47 | public byte[] getData() {
48 | return data;
49 | }
50 |
51 | public InputSource toInputSource() {
52 | final InputSource is = new InputSource(new ByteArrayInputStream(data));
53 | is.setEncoding(charset.name());
54 | return is;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/InputSourceable.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.sax;
19 |
20 | import org.xml.sax.InputSource;
21 |
/**
 * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given
 * document.
 */
public interface InputSourceable {
  /**
   * Produces an {@link InputSource} for the underlying document.
   *
   * @return An input source for the document; per the contract described on this interface, each
   *         call is expected to hand out a new instance
   */
  InputSource toInputSource();
}
29 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.sax;
19 |
20 | import org.xml.sax.Attributes;
21 | import org.xml.sax.SAXException;
22 |
23 | /**
24 | * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing.
25 | */
26 | public interface TagAction {
27 |
28 | boolean start(final BoilerpipeHTMLContentHandler instance, final String localName,
29 | final String qName, final Attributes atts) throws SAXException;
30 |
31 | boolean end(final BoilerpipeHTMLContentHandler instance, final String localName,
32 | final String qName) throws SAXException;
33 |
34 | boolean changesTagLevel();
35 | }
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagActionMap.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.sax;
19 |
20 | import java.util.HashMap;
21 |
22 | /**
23 | * Base class for definition a set of {@link TagAction}s that are to be used for the HTML parsing
24 | * process.
25 | *
26 | * @see DefaultTagActionMap
27 | */
28 | public abstract class TagActionMap extends HashMap {
29 | private static final long serialVersionUID = 1L;
30 |
31 | /**
32 | * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag will
33 | * be removed and overwritten.
34 | *
35 | * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
36 | * @param action The {@link TagAction}
37 | */
38 | protected void setTagAction(final String tag, final TagAction action) {
39 | put(tag.toUpperCase(), action);
40 | put(tag.toLowerCase(), action);
41 | put(tag, action);
42 | }
43 |
44 | /**
45 | * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that
46 | * tag, a chained action, consisting of the previous and the new {@link TagAction} is created.
47 | *
48 | * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
49 | * @param action The {@link TagAction}
50 | */
51 | protected void addTagAction(final String tag, final TagAction action) {
52 | TagAction previousAction = get(tag);
53 | if (previousAction == null) {
54 | setTagAction(tag, action);
55 | } else {
56 | setTagAction(tag, new CommonTagActions.Chained(previousAction, action));
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Classes related to parsing and producing HTML from/to Boilerpipe TextDocuments.
3 | */
4 | package com.kohlschutter.boilerpipe.sax;
5 |
6 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/UnicodeTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * boilerpipe
3 | *
4 | * Copyright (c) 2009, 2014 Christian Kohlschütter
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.kohlschutter.boilerpipe.util;
19 |
20 | import java.util.regex.Pattern;
21 |
/**
 * Tokenizes text according to Unicode word boundaries and strips off non-word characters.
 *
 * <p>
 * All regular expressions are compiled once and reused across calls.
 */
public class UnicodeTokenizer {
  /** Matches every word boundary position. */
  private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b");
  /** Matches one punctuation character together with the boundary markers around it. */
  private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern
      .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*");
  /** Matches a run of spaces and/or boundary markers. */
  private static final Pattern PAT_SEPARATOR_RUN = Pattern.compile("[ \u2063]+");
  /** Matches a run of spaces; used to split the normalized text into tokens. */
  private static final Pattern PAT_SPACES = Pattern.compile("[ ]+");

  /**
   * Tokenizes the text and returns an array of tokens.
   *
   * <p>
   * Punctuation listed in the punctuation pattern stays attached to its neighbouring word
   * characters (for example {@code "x-y"} is a single token). Empty input yields an array holding
   * one empty string.
   *
   * @param text The text
   * @return The tokens
   */
  public static String[] tokenize(final CharSequence text) {
    // 1. Put an invisible separator (U+2063) at every word boundary.
    final String marked = PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063");
    // 2. Drop the separators directly next to punctuation so it is not split off.
    final String rejoined = PAT_NOT_WORD_BOUNDARY.matcher(marked).replaceAll("$1");
    // 3. Collapse the remaining separators and spaces into single spaces.
    final String normalized = PAT_SEPARATOR_RUN.matcher(rejoined).replaceAll(" ").trim();
    // 4. Split on the spaces.
    return PAT_SPACES.split(normalized);
  }
}
41 |
--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Some helper classes.
3 | */
4 | package com.kohlschutter.boilerpipe.util;
5 |
6 |
--------------------------------------------------------------------------------
/boilerpipe/nekohtml/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | cc-dbp-parent-pom
5 | com.ibm.research.ai.ki
6 | 1.0.0-SNAPSHOT
7 | ../../pom.xml
8 |
9 | 4.0.0
10 | nekohtml
11 | 1.9.13-SNAPSHOT
12 |
13 |
14 |
15 | maven-shade-plugin
16 | 2.3
17 |
18 |
19 | package
20 |
21 | shade
22 |
23 |
24 |
25 |
26 | net.sourceforge.nekohtml:nekohtml
27 |
28 |
29 | true
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | xerces
39 | xercesImpl
40 | 2.9.1
41 | compile
42 |
43 |
44 | xml-apis
45 | xml-apis
46 | 1.3.04
47 | compile
48 |
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/boilerpipe/nekohtml/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 |
7 | com.ibm.research.ai.ki
8 | cc-dbp-parent-pom
9 | 1.0.0-SNAPSHOT
10 | ../..
11 |
12 |
13 | nekohtml
14 | 1.9.13-SNAPSHOT
15 |
16 |
17 |
18 |
19 | maven-shade-plugin
20 | 2.3
21 |
22 |
23 | package
24 |
25 | shade
26 |
27 |
28 |
29 |
30 | net.sourceforge.nekohtml:nekohtml
31 |
32 |
33 | true
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | net.sourceforge.nekohtml
44 | nekohtml
45 | 1.9.13
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CharsetDetect.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.corpora.crawl;
19 |
20 | import java.io.*;
21 | import java.nio.charset.*;
22 |
23 | import org.mozilla.universalchardet.*;
24 |
25 | public class CharsetDetect {
26 | static String mapCharset(String charsetName) {
27 | try {
28 | if (Charset.isSupported(charsetName))
29 | return charsetName;
30 | String lc = charsetName.toLowerCase();
31 | if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
32 | return "cp1252";
33 | }
34 | return charsetName;
35 | } catch (Throwable t) {
36 | return "UTF-8";
37 | }
38 | }
39 |
40 | public static String getCharsetFromBytes(byte buffer[]) throws IOException {
41 | UniversalDetector detector = new UniversalDetector(null);
42 | detector.handleData(buffer, 0, buffer.length);
43 | detector.dataEnd();
44 | String charsetName = detector.getDetectedCharset();
45 | detector.reset();
46 | return mapCharset(charsetName);
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CommonCrawlConfig.java:
--------------------------------------------------------------------------------
1 | package com.ibm.research.ai.ki.corpora.crawl;
2 |
3 | import com.ibm.research.ai.ki.util.*;
4 |
/**
 * Settings for downloading and filtering Common Crawl data. Every field initializer below is the
 * default for that setting.
 * NOTE(review): the fields are presumably bound by name through {@link PropertyStruct} (see
 * cc-dbp.properties) - keep names and types stable; confirm against PropertyStruct.
 */
public class CommonCrawlConfig extends PropertyStruct {
  private static final long serialVersionUID = 1L;

  /**
   * See https://github.com/optimaize/language-detector for language options
   */
  public String language = "en";
  /**
   * The language detector is typically very confident, most values are close to one or zero
   */
  public double minLanguageConfidence = 0.8;
  /**
   * Possible options are LinkAnnotation, SectionHeader, Paragraph and TextFormating.
   * LinkAnnotation retains the anchor tag information (which spans of text are links and where they link to).
   */
  public String[] annotationTypes = new String[] {"LinkAnnotation"};
  /**
   * Number of threads downloading parts of Common Crawl, also the number of part files that will be created.
   */
  public int numThreads = 8;
  /**
   * URL prefix to add to the WARC file list
   */
  public String urlPrefix = "https://commoncrawl.s3.amazonaws.com/";

  /**
   * To download only a portion of common crawl, limited to this many files.
   * Has no initializer, so it is 0 unless set otherwise.
   * NOTE(review): how 0 is interpreted (presumably "no limit") is decided by the consumer - confirm.
   */
  public int warcFileLimit;
}
35 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/cc-dbp/cc-dbp.properties:
--------------------------------------------------------------------------------
1 | #CommonCrawlConfig
2 |
3 | language=en
4 | minLanguageConfidence=0.8
5 | numThreads=8
6 | annotationTypes = [LinkAnnotation]
7 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
8 |
9 |
10 |
11 | #support downloading only a portion with
12 | warcFileLimit=10
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=WARN, stdout
3 |
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/simplelogger.properties:
--------------------------------------------------------------------------------
1 | # SLF4J's SimpleLogger configuration file
2 | # Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err.
3 |
4 | # Default logging detail level for all instances of SimpleLogger.
5 | # Must be one of ("trace", "debug", "info", "warn", or "error").
6 | # If not specified, defaults to "info".
7 | org.slf4j.simpleLogger.defaultLogLevel=warn
8 |
9 | # Logging detail level for a SimpleLogger instance named "xxxxx".
10 | # Must be one of ("trace", "debug", "info", "warn", or "error").
11 | # If not specified, the default logging detail level is used.
12 | #org.slf4j.simpleLogger.log.xxxxx=
13 |
14 | # Set to true if you want the current date and time to be included in output messages.
15 | # Default is false, and will output the number of milliseconds elapsed since startup.
16 | #org.slf4j.simpleLogger.showDateTime=false
17 |
18 | # The date and time format to be used in the output messages.
19 | # The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat.
20 | # If the format is not specified or is invalid, the default format is used.
21 | # The default format is yyyy-MM-dd HH:mm:ss:SSS Z.
22 | #org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z
23 |
24 | # Set to true if you want to output the current thread name.
25 | # Defaults to true.
26 | #org.slf4j.simpleLogger.showThreadName=true
27 |
28 | # Set to true if you want the Logger instance name to be included in output messages.
29 | # Defaults to true.
30 | #org.slf4j.simpleLogger.showLogName=true
31 |
32 | # Set to true if you want the last component of the name to be included in output messages.
33 | # Defaults to false.
34 | #org.slf4j.simpleLogger.showShortLogName=false
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/README.md:
--------------------------------------------------------------------------------
1 | 1) ConvertDBpedia
2 | deal with the whole 'M' suffix thing
3 |
4 | Optional: get idCounts.tsv (only have spark version for this right now) this requires running BuildGazetteer on the unfiltered
5 |
6 | 2) BuildGroundTruth
7 | 3) BuildGazetteer
8 | 4) TypePairFilter
9 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 |
5 | com.ibm.research.ai.ki
6 | cc-dbp-parent-pom
7 | 1.0.0-SNAPSHOT
8 |
9 |
10 | kb
11 | 1.0.0-SNAPSHOT
12 |
13 |
14 |
15 | com.ibm.research.ai.ki
16 | util
17 | 1.0.0-SNAPSHOT
18 |
19 |
20 |
21 | com.ibm.research.ai.ki
22 | nlp
23 | 1.0.0-SNAPSHOT
24 |
25 |
26 |
27 | com.ibm.research.ai.ki
28 | kbp
29 | 1.0.0-SNAPSHOT
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/ConfigureMinMaxEntityFreq.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kb;
19 |
20 | import java.io.*;
21 | import java.util.*;
22 |
23 | import com.ibm.research.ai.ki.util.*;
24 |
/**
 * Shows examples of entities that occur at different frequency ranges, so that a sensible maximum occurrence frequency can be selected,
 * and possibly a minimum occurrence frequency.
 *
 * Entities are grouped into 20 buckets by the truncated natural logarithm of their count, and the
 * saved examples of each bucket are printed to standard output.
 * @author mrglass
 *
 */
public class ConfigureMinMaxEntityFreq {
  /**
   * @param args args[0] is the knowledge base directory that contains the id counts file
   *        ({@code KBFiles.idCountsTsv}); no argument check is performed
   */
  public static void main(String[] args) {
    String kbDir = args[0];

    // One sample collector per frequency bucket.
    // NOTE(review): the constructor argument 20 is presumably the sample capacity - confirm in RandomUtil.Sample.
    RandomUtil.Sample[] termsByFreq = new RandomUtil.Sample[20];
    for (int i = 0; i < termsByFreq.length; ++i) {
      termsByFreq[i] = new RandomUtil.Sample<>(20);
    }
    // Parse the whole id counts file into an id -> count map.
    Map idCounts = SparseVectors.fromString(FileUtil.readFileAsString(new File(kbDir, KBFiles.idCountsTsv)));
    for (Map.Entry e : idCounts.entrySet()) {
      // Bucket index is the natural log of the count, truncated and clamped to the valid range.
      int bucket = (int)Math.log(e.getValue().value);
      if (bucket < 0) bucket = 0;
      if (bucket >= termsByFreq.length) bucket = termsByFreq.length-1;
      // Offer "<count padded to width 10> <id>" to the bucket's sample.
      termsByFreq[bucket].maybeSave(Lang.LPAD(""+((int)e.getValue().value), 10)+" "+e.getKey());
    }
    // Print the buckets from lowest to highest frequency, separated by a rule.
    for (int i = 0; i < termsByFreq.length; ++i) {
      System.out.println("=======================================");
      System.out.println(Lang.stringList(termsByFreq[i], "\n"));
    }
  }
}
52 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/GroundTruthConfig.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kb;
19 |
20 | import com.ibm.research.ai.ki.util.*;
21 |
22 | public class GroundTruthConfig extends PropertyStruct {
23 | private static final long serialVersionUID = 1L;
24 |
25 |
26 | public int minCorpusCount = 1;
27 | public int maxCorpusCount = 300000;
28 | public int minUnaryCount = 100;
29 | public boolean useRelationTaxonomy = true;
30 |
31 | //CONSIDER: also type selection config
32 | }
33 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBConfig.java:
--------------------------------------------------------------------------------
1 | package com.ibm.research.ai.ki.kb;
2 |
3 | import java.io.*;
4 |
5 | import com.ibm.research.ai.ki.util.*;
6 |
7 | public class KBConfig extends PropertyStruct {
8 | private static final long serialVersionUID = 1L;
9 |
10 | public String kbDir;
11 |
12 | /**
13 | * To avoid generic terms, we ignore terms that occur more than this many times.
14 | */
15 | public int maxNodeCorpusCount = 3000000;
16 | /**
17 | * We can ignore rare terms if desired.
18 | */
19 | public int minNodeCorpusCount = 1;
20 | /**
21 | *
22 | */
23 | public int minUnaryCount = 100;
24 | /**
25 | * Whether to consider super-relations in the labels for context sets.
26 | */
27 | public boolean useRelationTaxonomy = true;
28 |
29 | //for the coarse-grained type system
30 | /**
31 | * A type must have this many instances for which it is the most specific type
32 | */
33 | public int minTypeSize = 3000;
34 | /**
35 | * We will have no more than this many types in the coarse grained type system
36 | */
37 | public int maxNumberOfTypes = 100;
38 |
39 | //for the type filter
40 | /**
41 | * If an unordered type-pair does not have at least this many triples, it will not have any contexts generated.
42 | * So if number-number relations never occur, we will never generated contexts for a number-number node-pair.
43 | */
44 | public int minTypePairFreq = 1;
45 |
46 | public int minTypeFreqForUnary = 1;
47 |
48 |
49 | public File kbDir() {
50 | return new File(kbDir);
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBFiles.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kb;
19 |
/**
 * Names of the files that can be present in a kb directory.
 *
 * @author mrglass
 */
public class KBFiles {
    /** File name for the triples table. */
    public static final String triplesTsv = "triples.tsv";

    /** File name for the labels table. */
    public static final String labelsTsv = "labels.tsv";

    /** File name for the relation taxonomy table. */
    public static final String relationTaxonomyTsv = "relationTaxonomy.tsv";

    /** File name for the types table. */
    public static final String typesTsv = "types.tsv";

    /** File name for the popularity table. */
    public static final String popularityTsv = "popularity.tsv";

    /** File name for the id counts table; produced by DocEntityStats in ie.spark. */
    public static final String idCountsTsv = "idCounts.tsv";
}
34 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/conversion/DBpediaKBConfig.java:
--------------------------------------------------------------------------------
1 | package com.ibm.research.ai.ki.kb.conversion;
2 |
3 | import java.io.*;
4 |
5 | import com.ibm.research.ai.ki.kb.*;
6 |
7 | public class DBpediaKBConfig extends KBConfig {
8 | private static final long serialVersionUID = 1L;
9 |
10 | public String dbpediaOwlUrl;
11 |
12 | public String objectsUrl;
13 |
14 | public String literalsUrl;
15 |
16 | public String labelsUrl;
17 |
18 | public String typesUrl;
19 |
20 | /**
21 | * We can construct the KB without using idCounts.tsv if desired. Since getting idCounts.tsv requires running a gazetteer over the corpus and is potentially slow.
22 | */
23 | public boolean noNodeCorpusCounts;
24 |
25 |
26 | protected File file(String url) {
27 | return new File(kbDir, url.substring(url.lastIndexOf('/')+1));
28 | }
29 |
30 | public File dbpediaOwlFile() {
31 | return file(dbpediaOwlUrl);
32 | }
33 |
34 | public File objectsFile() {
35 | return file(objectsUrl);
36 | }
37 |
38 | public File literalsFile() {
39 | return file(literalsUrl);
40 | }
41 |
42 | public File labelsFile() {
43 | return file(labelsUrl);
44 | }
45 |
46 | public File typesFile() {
47 | return file(typesUrl);
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/explore/FilterByCorpusCount.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kb.explore;
19 |
20 | import java.io.*;
21 | import java.util.*;
22 |
23 | import com.ibm.research.ai.ki.formats.*;
24 | import com.ibm.research.ai.ki.util.*;
25 |
26 |
27 | public class FilterByCorpusCount {
28 |
29 | public static void main(String[] args) {
30 | String kbDir = args[0];
31 | String kbDirFiltered = args[1];
32 | int minCount = 1;
33 | if (args.length > 2)
34 | minCount = Integer.parseInt(args[2]);
35 | Map idCounts = SparseVectors.fromString(FileUtil.readFileAsString(new File(kbDir, "idCounts.tsv")));
36 | try (PrintStream out = FileUtil.getFilePrintStream(new File(kbDirFiltered, "labels.tsv").getAbsolutePath())) {
37 | for (String[] lbl : new SimpleTsvIterable(new File(kbDir, "labels.tsv"))) {
38 | if (SparseVectors.getDefaultZero(idCounts, lbl[0]) >= minCount) {
39 | out.println(Lang.stringList(lbl, "\t"));
40 | }
41 | }
42 | }
43 | try (PrintStream out = FileUtil.getFilePrintStream(new File(kbDirFiltered, "triples.tsv").getAbsolutePath())) {
44 | for (String[] trip : new SimpleTsvIterable(new File(kbDir, "triples.tsv"))) {
45 | if (SparseVectors.getDefaultZero(idCounts, trip[0]) >= minCount && SparseVectors.getDefaultZero(idCounts, trip[2]) >= minCount) {
46 | out.println(Lang.stringList(trip, "\t"));
47 | }
48 | }
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/resources/dbpediaConfig.properties:
--------------------------------------------------------------------------------
1 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
2 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
3 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
4 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
5 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
6 |
7 | #for ground truth
8 | maxNodeCorpusCount = 300000
9 | minNodeCorpusCount = 1
10 | useRelationTaxonomy = True
11 |
12 | #for the coarse-grained type system
13 | minTypeSize = 3000
14 | maxNumberOfTypes = 100
15 |
16 | #for the type filter
17 | minTypePairFreq = 1
18 |
19 | noNodeCorpusCounts = False
20 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 |
5 | com.ibm.research.ai.ki
6 | cc-dbp-parent-pom
7 | 1.0.0-SNAPSHOT
8 |
9 |
10 | kbp
11 | 1.0.0-SNAPSHOT
12 |
13 |
14 |
15 | com.ibm.research.ai.ki
16 | util
17 | 1.0.0-SNAPSHOT
18 |
19 |
20 |
21 | com.ibm.research.ai.ki
22 | nlp
23 | 1.0.0-SNAPSHOT
24 |
25 |
26 |
27 |
28 | org.apache.wink
29 | wink-json4j
30 | ${wink-json4j.version}
31 |
32 |
33 |
34 | com.google.guava
35 | guava
36 | ${guava.version}
37 |
38 |
39 |
40 | org.apache.commons
41 | commons-lang3
42 | ${commons-lang3.version}
43 |
44 |
45 |
46 | commons-cli
47 | commons-cli
48 | ${commons-cli.version}
49 |
50 |
51 |
52 | it.unimi.dsi
53 | fastutil
54 | 7.1.0
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/CoveredTextEntityId.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.util.*;
21 |
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 |
25 | /**
26 | * For those entities without an id, we simply give them an id equal to the covered text, case normalized.
27 | * So it is a text-equals entity linker.
28 | * @author mrglass
29 | *
30 | */
31 | public class CoveredTextEntityId implements IPostprocessEntityRecognition {
32 | private static final long serialVersionUID = 1L;
33 |
34 | @Override
35 | public void initialize(Properties config) {}
36 |
37 | @Override
38 | public void process(Document doc) {
39 | for (EntityWithId e : doc.getAnnotations(EntityWithId.class)) {
40 | if (e.id == null)
41 | e.id = e.coveredText(doc).toLowerCase().trim().replaceAll("\\s+", " ");
42 | }
43 | }
44 |
45 | @Override
46 | public void initialize(IGroundTruth gt, RelexConfig config) {}
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/DocumentFeatureString.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 |
22 | public class DocumentFeatureString implements DocumentStructure {
23 | private static final long serialVersionUID = 1L;
24 |
25 | public String featureString;
26 |
27 | public DocumentFeatureString(String featureString) {
28 | this.featureString = featureString;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/FilterEntsByGroundTruth.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.util.*;
21 |
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 |
25 | public class FilterEntsByGroundTruth implements IPostprocessEntityRecognition {
26 | private static final long serialVersionUID = 1L;
27 |
28 | protected Set relevantUrls;
29 |
30 | @Override
31 | public void initialize(Properties config) {}
32 |
33 | @Override
34 | public void process(Document doc) {
35 | doc.removeAnnotations(EntityWithId.class, e -> !relevantUrls.contains(e.id));
36 | }
37 |
38 | @Override
39 | public void initialize(IGroundTruth gt, RelexConfig config) {
40 | this.relevantUrls = gt.getRelevantIds();
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IEntityPairFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.io.*;
21 |
22 | /**
23 | * A class implementing this will be specified in the RelexConfig if some filtering of entity-pairs is desired.
24 | * Otherwise the tsv dataset will contain all pairs of EntityWithId that occur in the same sentence.
25 | * @author mrglass
26 | *
27 | */
28 | public interface IEntityPairFilter extends Serializable {
29 | /**
30 | * In Spark, initialize is called in the Spark head
31 | * @param gt
32 | * @param config
33 | */
34 | public void initialize(GroundTruth gt, RelexConfig config);
35 | /**
36 | * Return true if the entity-pair is a good candidate
37 | * @param id1
38 | * @param type1
39 | * @param id2
40 | * @param type2
41 | * @return
42 | */
43 | public boolean test(String id1, String type1, String id2, String type2);
44 | }
45 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IGroundTruth.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.io.*;
21 | import java.util.*;
22 |
/**
 * A generic ground truth interface, the methods needed for preprocessing.
 *
 * @author mrglass
 */
public interface IGroundTruth extends Serializable {

    /**
     * @param id an entity id
     * @return the type recorded for that id
     */
    String getType(String id);

    /**
     * @return the ids considered relevant; ids are Strings, matching {@link #getType(String)}
     */
    Set<String> getRelevantIds();

    /**
     * Builds the mapping from entity-set id to relations.
     * NOTE(review): the key/value type arguments are not visible in this declaration, so the
     * return type is left raw here - parameterize once the intended types are confirmed.
     *
     * @return the entity-set-id to relations map
     */
    Map buildEntitySetId2Relations();
}
34 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IPostprocessEntityRecognition.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 |
22 | /**
23 | * An annotator that postprocesses the entity recognition and linking.
24 | * Often to remove entities not of interest, or to fill in type based on id or id for NIL entity linking.
25 | * @author mrglass
26 | *
27 | */
28 | public interface IPostprocessEntityRecognition extends Annotator {
29 | public void initialize(IGroundTruth gt, RelexConfig config);
30 | }
31 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexDatasetManager.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.io.*;
21 |
22 | import com.ibm.research.ai.ki.kbp.*;
23 |
24 | /**
25 | * Provides classes for representing and creating a dataset for training/evaluation/mass-apply of
26 | * a relational knowledge induction system.
27 | *
28 | * @author mrglass
29 | *
30 | * @param
31 | */
32 | public interface IRelexDatasetManager extends Serializable {
33 |
34 | public IRelexTsv getTsvMaker();
35 | public IGroundTruth getGroundTruth();
36 | public Class getMentionClass();
37 | public IRelexTensors getTensorMaker();
38 |
39 | /**
40 | * before this method is called, only getMentionClass is supposed to be called
41 | * @param config
42 | */
43 | public void initialize(RelexConfig config);
44 | }
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexMention.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.io.*;
21 |
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 |
24 | /**
25 | * So we can unify the code for binary and unary relation mention
26 | * @author mrglass
27 | *
28 | */
29 | public interface IRelexMention extends Serializable {
30 | //for reduce by key
31 | public String groupId();
32 | public int groupSplit(int splitCount);
33 |
34 | //the canonically ordered list of ids, separated by '\t'; if group ids are enabled the group id is given here too
35 | public String entitySetId();
36 |
37 | //downsampling and splitting train/validate/test
38 | public double getNegativeDownsamplePriority();
39 | public double getDatasetSplitPosition();
40 | //for negative downsampling
41 | public boolean isNegative();
42 |
43 | //where the document the mention comes from appears in the x-axis of the document learning curve (0-1)
44 | public double getDocumentLearningCurvePosition();
45 |
46 | //for vocab construction
47 | public String[] getTypes();
48 | public String[] getRelations();
49 | public String[] getTokens(Annotator tokenizer);
50 |
51 | //saving and loading from tsv
52 | public void fromString(String tsvLine);
53 |
54 | public String toString();
55 |
56 | //to avoid duplicates in a mentionset, if non-null, two IRelexMentions that share a uniquenessString are duplicates.
57 | public String uniquenessString();
58 |
59 | /**
60 | * A human readable format for showing the support for an extracted relation.
61 | * @return
62 | */
63 | public String toSupportString();
64 |
65 | public void convertToPlaceholders();
66 | }
67 |
--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexTensors.java:
--------------------------------------------------------------------------------
1 | /**
2 | * cc-dbp-dataset
3 | *
4 | * Copyright (c) 2017 IBM
5 | *
6 | * The author licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.ibm.research.ai.ki.kbp;
19 |
20 | import java.io.*;
21 | import java.util.*;
22 |
23 | import com.ibm.reseach.ai.ki.nlp.*;
24 |
25 | /**
26 | * Creates the deep learning input tensors from a set of RelexMentions
27 | * @author mrglass
28 | *
29 | * @param
30 | */
31 | public interface IRelexTensors extends Serializable {
32 | public String[] getTypes();
33 | public String[] getRelations();
34 | /**
35 | * The first object is assumed to be the String groupId.
36 | * @param tokenizer
37 | * @param fullMentionSet
38 | * @return
39 | */
40 | public List