├── .gitignore
├── LICENSE
├── README.md
├── boilerpipe
    ├── boilerpipe-common
    │   ├── LICENSE
    │   ├── README.md
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── com
    │   │               └── kohlschutter
    │   │                   └── boilerpipe
    │   │                       ├── BoilerpipeDocumentSource.java
    │   │                       ├── BoilerpipeExtractor.java
    │   │                       ├── BoilerpipeFilter.java
    │   │                       ├── BoilerpipeInput.java
    │   │                       ├── BoilerpipeProcessingException.java
    │   │                       ├── conditions
    │   │                           └── TextBlockCondition.java
    │   │                       ├── document
    │   │                           ├── BPAnnotation.java
    │   │                           ├── HeaderAnnotation.java
    │   │                           ├── Image.java
    │   │                           ├── Link.java
    │   │                           ├── ParagraphAnnotation.java
    │   │                           ├── TextBlock.java
    │   │                           ├── TextDocument.java
    │   │                           ├── TextDocumentStatistics.java
    │   │                           ├── TextFormatAnnotation.java
    │   │                           └── package-info.java
    │   │                       ├── estimators
    │   │                           └── SimpleEstimator.java
    │   │                       ├── extractors
    │   │                           ├── ArticleExtractor.java
    │   │                           ├── ArticleSentencesExtractor.java
    │   │                           ├── CanolaExtractor.java
    │   │                           ├── CommonExtractors.java
    │   │                           ├── DefaultExtractor.java
    │   │                           ├── ExtractorBase.java
    │   │                           ├── KeepEverythingExtractor.java
    │   │                           ├── KeepEverythingWithMinKWordsExtractor.java
    │   │                           ├── LargestContentExtractor.java
    │   │                           ├── NumWordsRulesExtractor.java
    │   │                           └── package-info.java
    │   │                       ├── filters
    │   │                           ├── debug
    │   │                           │   └── PrintDebugFilter.java
    │   │                           ├── english
    │   │                           │   ├── DensityRulesClassifier.java
    │   │                           │   ├── HeuristicFilterBase.java
    │   │                           │   ├── IgnoreBlocksAfterContentFilter.java
    │   │                           │   ├── IgnoreBlocksAfterContentFromEndFilter.java
    │   │                           │   ├── KeepLargestFulltextBlockFilter.java
    │   │                           │   ├── MinFulltextWordsFilter.java
    │   │                           │   ├── NumWordsRulesClassifier.java
    │   │                           │   ├── TerminatingBlocksFinder.java
    │   │                           │   └── package-info.java
    │   │                           ├── heuristics
    │   │                           │   ├── AddPrecedingLabelsFilter.java
    │   │                           │   ├── ArticleMetadataFilter.java
    │   │                           │   ├── BlockProximityFusion.java
    │   │                           │   ├── ContentFusion.java
    │   │                           │   ├── DocumentTitleMatchClassifier.java
    │   │                           │   ├── ExpandTitleToContentFilter.java
    │   │                           │   ├── KeepLargestBlockFilter.java
    │   │                           │   ├── LabelFusion.java
    │   │                           │   ├── LargeBlockSameTagLevelToContentFilter.java
    │   │                           │   ├── ListAtEndFilter.java
    │   │                           │   ├── SimpleBlockFusionProcessor.java
    │   │                           │   ├── TrailingHeadlineToBoilerplateFilter.java
    │   │                           │   └── package-info.java
    │   │                           └── simple
    │   │                           │   ├── BoilerplateBlockFilter.java
    │   │                           │   ├── InvertedFilter.java
    │   │                           │   ├── LabelToBoilerplateFilter.java
    │   │                           │   ├── LabelToContentFilter.java
    │   │                           │   ├── MarkEverythingBoilerplateFilter.java
    │   │                           │   ├── MarkEverythingContentFilter.java
    │   │                           │   ├── MinClauseWordsFilter.java
    │   │                           │   ├── MinWordsFilter.java
    │   │                           │   ├── SplitParagraphBlocksFilter.java
    │   │                           │   ├── SurroundingToContentFilter.java
    │   │                           │   └── package-info.java
    │   │                       ├── labels
    │   │                           ├── ConditionalLabelAction.java
    │   │                           ├── DefaultLabels.java
    │   │                           └── LabelAction.java
    │   │                       ├── package-info.java
    │   │                       ├── sax
    │   │                           ├── BoilerpipeHTMLContentHandler.java
    │   │                           ├── BoilerpipeHTMLParser.java
    │   │                           ├── BoilerpipeSAXInput.java
    │   │                           ├── CommonTagActions.java
    │   │                           ├── DefaultTagActionMap.java
    │   │                           ├── HTMLDocument.java
    │   │                           ├── HTMLFetcher.java
    │   │                           ├── HTMLHighlighter.java
    │   │                           ├── ImageExtractor.java
    │   │                           ├── InputSourceable.java
    │   │                           ├── MarkupTagAction.java
    │   │                           ├── TagAction.java
    │   │                           ├── TagActionMap.java
    │   │                           └── package-info.java
    │   │                       └── util
    │   │                           ├── UnicodeTokenizer.java
    │   │                           └── package-info.java
    └── nekohtml
    │   ├── dependency-reduced-pom.xml
    │   ├── pom.xml
    │   └── src
    │       └── main
    │           └── java
    │               └── org
    │                   └── cyberneko
    │                       └── html
    │                           ├── HTMLElements.java
    │                           └── HTMLTagBalancer.java
├── com.ibm.research.ai.ki.corpus
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── com
    │           │   └── ibm
    │           │       └── research
    │           │           └── ai
    │           │               └── ki
    │           │                   └── corpora
    │           │                       └── crawl
    │           │                           ├── CharsetDetect.java
    │           │                           ├── CommonCrawlConfig.java
    │           │                           ├── HtmlToDocument.java
    │           │                           ├── LanguageScorer.java
    │           │                           ├── SaveCommonCrawl.java
    │           │                           ├── SaveCommonCrawlBase.java
    │           │                           └── SaveCommonCrawlHdfs.java
    │       └── resources
    │           ├── cc-dbp
    │               └── cc-dbp.properties
    │           ├── log4j.properties
    │           └── simplelogger.properties
├── com.ibm.research.ai.ki.kb
    ├── README.md
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── com
    │           │   └── ibm
    │           │       └── research
    │           │           └── ai
    │           │               └── ki
    │           │                   └── kb
    │           │                       ├── BuildGazetteer.java
    │           │                       ├── BuildGroundTruth.java
    │           │                       ├── ConfigureMinMaxEntityFreq.java
    │           │                       ├── FindUnary.java
    │           │                       ├── GroundTruthConfig.java
    │           │                       ├── KBConfig.java
    │           │                       ├── KBFiles.java
    │           │                       ├── NodePopularity.java
    │           │                       ├── RelationTaxonomy.java
    │           │                       ├── SelectTypes.java
    │           │                       ├── TypePairFilter.java
    │           │                       ├── conversion
    │           │                           ├── ConvertDBpedia.java
    │           │                           ├── DBpediaKBConfig.java
    │           │                           ├── MergeNodesDBpedia.java
    │           │                           ├── SelectRelations.java
    │           │                           └── SummaryCharts.java
    │           │                       └── explore
    │           │                           ├── CheckLabelCollisions.java
    │           │                           └── FilterByCorpusCount.java
    │       └── resources
    │           ├── dbpediaConfig.properties
    │           └── relationSample.txt
├── com.ibm.research.ai.ki.kbp
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── com
    │           │   └── ibm
    │           │       └── research
    │           │           └── ai
    │           │               └── ki
    │           │                   └── kbp
    │           │                       ├── CoveredTextEntityId.java
    │           │                       ├── CreateTsvDataset.java
    │           │                       ├── CreateTsvDatasetTokenWindow.java
    │           │                       ├── DocumentFeatureString.java
    │           │                       ├── DocumentPreprocessing.java
    │           │                       ├── FilterEntsByGroundTruth.java
    │           │                       ├── GazetteerEDL.java
    │           │                       ├── GroundTruth.java
    │           │                       ├── GroupRelexMentionTsvDataset.java
    │           │                       ├── IEntityPairFilter.java
    │           │                       ├── IGroundTruth.java
    │           │                       ├── IPostprocessEntityRecognition.java
    │           │                       ├── IRelexDatasetManager.java
    │           │                       ├── IRelexMention.java
    │           │                       ├── IRelexTensors.java
    │           │                       ├── IRelexTsv.java
    │           │                       ├── KBPBuildDataset.java
    │           │                       ├── NounPhraseEntityWithId.java
    │           │                       ├── RelexConfig.java
    │           │                       ├── RelexDatasetFiles.java
    │           │                       ├── RelexDatasetManagerBinary.java
    │           │                       ├── RelexMention.java
    │           │                       ├── RelexMentionReader.java
    │           │                       ├── RelexStats.java
    │           │                       ├── RelexTensors.java
    │           │                       ├── RelexVocab.java
    │           │                       ├── ShowExamples.java
    │           │                       ├── Tokenizer.java
    │           │                       ├── TypePairEntityPairFilter.java
    │           │                       ├── baselines
    │           │                           └── NREConvert.java
    │           │                       ├── embeddings
    │           │                           ├── EmbeddingFormat.java
    │           │                           └── Word2VecConverter.java
    │           │                       └── unary
    │           │                           ├── DownsampleEntityFilter.java
    │           │                           ├── IEntityFilter.java
    │           │                           ├── RelexDatasetManagerUnary.java
    │           │                           ├── UnaryGroundTruth.java
    │           │                           ├── UnaryRelexMention.java
    │           │                           ├── UnaryRelexTensors.java
    │           │                           └── UnaryRelexTsvDataset.java
    │       └── resources
    │           └── relexConfigNonSpark.properties
├── com.ibm.research.ai.ki.nlp
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── com
    │       │   │   └── ibm
    │       │   │       ├── reseach
    │       │   │           └── ai
    │       │   │           │   └── ki
    │       │   │           │       └── nlp
    │       │   │           │           ├── AnnoRef.java
    │       │   │           │           ├── Annotation.java
    │       │   │           │           ├── Annotator.java
    │       │   │           │           ├── Document.java
    │       │   │           │           ├── DocumentJSONDeserializer.java
    │       │   │           │           ├── DocumentJSONSerializer.java
    │       │   │           │           ├── DocumentReader.java
    │       │   │           │           ├── DocumentSerialize.java
    │       │   │           │           ├── DocumentStructure.java
    │       │   │           │           ├── DocumentWriter.java
    │       │   │           │           ├── DocumentWriter2.java
    │       │   │           │           ├── OffsetCorrection.java
    │       │   │           │           ├── Pipeline.java
    │       │   │           │           ├── PipelinedDocuments.java
    │       │   │           │           ├── ResettingAnnotator.java
    │       │   │           │           ├── TransformBase.java
    │       │   │           │           ├── TransformRegex.java
    │       │   │           │           ├── TransformString.java
    │       │   │           │           ├── conversion
    │       │   │           │               └── NIFSerialization.java
    │       │   │           │           └── types
    │       │   │           │               ├── Author.java
    │       │   │           │               ├── Categories.java
    │       │   │           │               ├── Chunk.java
    │       │   │           │               ├── CorefIndex.java
    │       │   │           │               ├── DocDate.java
    │       │   │           │               ├── DocRelations.java
    │       │   │           │               ├── DocumentContentType.java
    │       │   │           │               ├── DocumentSource.java
    │       │   │           │               ├── Entity.java
    │       │   │           │               ├── EntityWithId.java
    │       │   │           │               ├── Event.java
    │       │   │           │               ├── LinkAnnotation.java
    │       │   │           │               ├── LinkedEntity.java
    │       │   │           │               ├── ListAnnotation.java
    │       │   │           │               ├── ListItem.java
    │       │   │           │               ├── Paragraph.java
    │       │   │           │               ├── Relation.java
    │       │   │           │               ├── Section.java
    │       │   │           │               ├── SectionHeader.java
    │       │   │           │               ├── Sentence.java
    │       │   │           │               ├── TextFormatting.java
    │       │   │           │               ├── Title.java
    │       │   │           │               ├── Token.java
    │       │   │           │               └── XmlTag.java
    │       │   │       └── research
    │       │   │           └── ai
    │       │   │               └── ki
    │       │   │                   └── nlp
    │       │   │                       └── parse
    │       │   │                           ├── ClearNLPNER.java
    │       │   │                           ├── ClearNLPPOS.java
    │       │   │                           ├── ClearNLPParse.java
    │       │   │                           ├── ClearNLPSentence.java
    │       │   │                           ├── ClearNLPTokenize.java
    │       │   │                           ├── ClearNLPTransform.java
    │       │   │                           ├── DigitSequenceTokenize.java
    │       │   │                           ├── EntityToOccurrences.java
    │       │   │                           ├── GazetteerMatcher.java
    │       │   │                           ├── NormalizeTextTransform.java
    │       │   │                           ├── OpenNLPChunk.java
    │       │   │                           ├── OpenNLPNER.java
    │       │   │                           ├── OpenNLPPOS.java
    │       │   │                           ├── OpenNLPSentence.java
    │       │   │                           ├── OpenNLPTokenize.java
    │       │   │                           ├── RegexParagraph.java
    │       │   │                           ├── RegexTokenize.java
    │       │   │                           └── TokensSnapToEntities.java
    │       └── resources
    │       │   ├── com
    │       │       └── ibm
    │       │       │   └── research
    │       │       │       └── ai
    │       │       │           └── ki
    │       │       │               └── nlp
    │       │       │                   └── parse
    │       │       │                       ├── clearNLP-replace.tsv
    │       │       │                       └── normalizeText-replace.tsv
    │       │   ├── downloadOpenNLPModels.sh
    │       │   ├── en-sent.bin
    │       │   └── log4j.properties
    │   └── test
    │       └── java
    │           └── com
    │               └── ibm
    │                   └── research
    │                       └── ai
    │                           └── ki
    │                               └── nlp
    │                                   ├── OverlappingSpansTest.java
    │                                   ├── TestJSON.java
    │                                   ├── TransformStringTest.java
    │                                   └── parse
    │                                       ├── TestClearNLP.java
    │                                       ├── TestGazetteerMatcher.java
    │                                       └── TestNER.java
├── com.ibm.research.ai.ki.spark
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── com
    │           │   └── ibm
    │           │       └── research
    │           │           └── ai
    │           │               └── ki
    │           │                   └── spark
    │           │                       ├── Base64ToBinary.java
    │           │                       ├── CorpusStatistics.java
    │           │                       ├── CreateW2VFile.java
    │           │                       ├── DocEntityStats.java
    │           │                       ├── GatherRelexStats.java
    │           │                       ├── GatherRelexVocab.java
    │           │                       ├── GazetteerPreprocess.java
    │           │                       ├── NonSparkGatherVocab.java
    │           │                       ├── RelexBuildDataset.java
    │           │                       ├── RelexTensorDataset.java
    │           │                       ├── RelexTsvDataset.java
    │           │                       ├── RunPipelineSpark.java
    │           │                       └── SimpleSparkJob.java
    │       ├── resources
    │           └── relexConfig.properties
    │       └── scripts
    │           ├── java-viacloud
    │           └── java-viaspark
├── com.ibm.research.ai.ki.util
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── com
    │       │   │   └── ibm
    │       │   │       └── research
    │       │   │           └── ai
    │       │   │               └── ki
    │       │   │                   ├── formats
    │       │   │                       ├── ArchiveEntryIterable.java
    │       │   │                       ├── NTriples.java
    │       │   │                       └── SimpleTsvIterable.java
    │       │   │                   └── util
    │       │   │                       ├── BlockShuffler.java
    │       │   │                       ├── CollectionUtil.java
    │       │   │                       ├── CombinedSpans.java
    │       │   │                       ├── DenseVectors.java
    │       │   │                       ├── Distribution.java
    │       │   │                       ├── FileUtil.java
    │       │   │                       ├── FirstPairComparator.java
    │       │   │                       ├── HashMapUtil.java
    │       │   │                       ├── Lang.java
    │       │   │                       ├── LogLinear.java
    │       │   │                       ├── MutableDouble.java
    │       │   │                       ├── MutableInteger.java
    │       │   │                       ├── NBest.java
    │       │   │                       ├── NestedIterable.java
    │       │   │                       ├── NextOnlyIterator.java
    │       │   │                       ├── NonOverlappingSpans.java
    │       │   │                       ├── OverlappingSpans.java
    │       │   │                       ├── Pair.java
    │       │   │                       ├── PeriodicChecker.java
    │       │   │                       ├── PropertyLoader.java
    │       │   │                       ├── PropertyStruct.java
    │       │   │                       ├── RandomUtil.java
    │       │   │                       ├── SecondPairComparator.java
    │       │   │                       ├── Span.java
    │       │   │                       ├── SparseVectors.java
    │       │   │                       ├── ThreadedLoopIterator.java
    │       │   │                       ├── Warnings.java
    │       │   │                       ├── eval
    │       │   │                           ├── BootstrappingConfidenceInterval.java
    │       │   │                           ├── MultiPrecisionRecall.java
    │       │   │                           ├── PrecisionRecall.java
    │       │   │                           └── SamplingPermutationTest.java
    │       │   │                       ├── graphs
    │       │   │                           ├── GraphAlgorithms.java
    │       │   │                           ├── SnowballSampler.java
    │       │   │                           └── TreeAlgorithms.java
    │       │   │                       ├── io
    │       │   │                           ├── DataIO.java
    │       │   │                           ├── MultiFileWriter.java
    │       │   │                           ├── OldVersionOf.java
    │       │   │                           ├── RefactoringObjectInputStream.java
    │       │   │                           ├── TensorFileReader.java
    │       │   │                           └── TensorFileWriter.java
    │       │   │                       └── parallel
    │       │   │                           ├── BlockingThreadedExecutor.java
    │       │   │                           ├── ISimpleExecutor.java
    │       │   │                           ├── PollingThreadedExecutor.java
    │       │   │                           ├── SingleThreadedExecutor.java
    │       │   │                           └── StreamEater.java
    │       └── resources
    │       │   └── com
    │       │       └── ibm
    │       │           └── research
    │       │               └── ai
    │       │                   └── ki
    │       │                       └── util
    │       │                           └── serializedMappings.properties
    │   └── test
    │       ├── java
    │           └── com
    │           │   └── ibm
    │           │       └── research
    │           │           └── ai
    │           │               └── ki
    │           │                   └── util
    │           │                       ├── BjUtilTestCounter.java
    │           │                       ├── ExecuteJavaProc.java
    │           │                       ├── FileIteratorTest.java
    │           │                       ├── FileUtilTest.java
    │           │                       ├── HashMapUtilTest.java
    │           │                       ├── LangTest.java
    │           │                       ├── NBestTest.java
    │           │                       ├── NonOverlappingTest.java
    │           │                       ├── OverlappingSpansTest.java
    │           │                       ├── PrecisionRecallTest.java
    │           │                       ├── PropertyLoaderTest.java
    │           │                       ├── RandomUtilTest.java
    │           │                       ├── SpanTest.java
    │           │                       ├── SparseVectorsTest.java
    │           │                       └── TestTreeAlgorithms.java
    │       └── resources
    │           └── com
    │               └── ibm
    │                   └── research
    │                       └── ai
    │                           └── ki
    │                               └── util
    │                                   └── 1.properties
├── config.properties
├── configSmall-de.properties
├── configSmall.properties
├── create.sh
├── createSmall-de.sh
├── createSmall.sh
├── pom.xml
├── unaryConfig.properties
└── unaryCreate.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | .classpath
  2 | .project
  3 | .settings
  4 | **/.classpath
  5 | **/.project
  6 | **/.settings
  7 | target/**
  8 | */target/**
  9 | */target/*
 10 | .metadata
 11 | clientdb.xml
 12 | release.properties
 13 | pom.xml.releaseBackup
 14 | *~
 15 | 
 16 | 
 17 | # User specified git ignore directories (works recursively).
 18 | *.DS_Store
 19 | .metadata
 20 | .recommenders
 21 | 
 22 | .idea/
 23 | **/.idea/
 24 | 
 25 | 
 26 | # Byte-compiled / optimized / DLL files
 27 | __pycache__/
 28 | *.py[cod]
 29 | *$py.class
 30 | 
 31 | # C extensions
 32 | *.so
 33 | 
 34 | # Distribution / packaging
 35 | .Python
 36 | env/
 37 | build/
 38 | develop-eggs/
 39 | dist/
 40 | downloads/
 41 | eggs/
 42 | .eggs/
 43 | lib/
 44 | lib64/
 45 | parts/
 46 | sdist/
 47 | var/
 48 | *.egg-info/
 49 | .installed.cfg
 50 | *.egg
 51 | 
 52 | # PyInstaller
 53 | #  Usually these files are written by a python script from a template
 54 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 55 | *.manifest
 56 | *.spec
 57 | 
 58 | # Installer logs
 59 | pip-log.txt
 60 | pip-delete-this-directory.txt
 61 | 
 62 | # Unit test / coverage reports
 63 | htmlcov/
 64 | .tox/
 65 | .coverage
 66 | .coverage.*
 67 | .cache
 68 | nosetests.xml
 69 | coverage.xml
 70 | *,cover
 71 | .hypothesis/
 72 | 
 73 | # Translations
 74 | *.mo
 75 | *.pot
 76 | 
 77 | # Django stuff:
 78 | *.log
 79 | local_settings.py
 80 | 
 81 | # Flask stuff:
 82 | instance/
 83 | .webassets-cache
 84 | 
 85 | # Scrapy stuff:
 86 | .scrapy
 87 | 
 88 | # Sphinx documentation
 89 | docs/_build/
 90 | 
 91 | # PyBuilder
 92 | target/
 93 | 
 94 | # IPython Notebook
 95 | .ipynb_checkpoints
 96 | 
 97 | # pyenv
 98 | .python-version
 99 | 
100 | # celery beat schedule file
101 | celerybeat-schedule
102 | 
103 | # dotenv
104 | .env
105 | 
106 | # virtualenv
107 | venv/
108 | ENV/
109 | 
110 | # Spyder project settings
111 | .spyderproject
112 | 
113 | # Rope project settings
114 | .ropeproject
115 | 
116 | 
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/LICENSE:
--------------------------------------------------------------------------------
 1 | boilerpipe
 2 | 
 3 | Copyright (c) 2009, 2014 Christian Kohlschütter
 4 | 
 5 | The author licenses this file to You under the Apache License, Version 2.0
 6 | (the "License"); you may not use this file except in compliance with
 7 | the License.  You may obtain a copy of the License at
 8 | 
 9 |     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/README.md:
--------------------------------------------------------------------------------
1 | Fork of boilerpipe from https://github.com/kohlschutter/boilerpipe.
2 | 
3 | This version produces offset annotations for links in the extracted TextBlocks.
4 | It also places a double newline between disconnected text blocks to help with paragraph and sentence segmentation.
5 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 4 | 	<modelVersion>4.0.0</modelVersion>
 5 | 
 6 | 	<parent>
 7 | 		<groupId>com.ibm.research.ai.ki</groupId>
 8 | 		<artifactId>cc-dbp-parent-pom</artifactId>
 9 | 		<version>1.0.0-SNAPSHOT</version>
10 | 		<relativePath>../..</relativePath>
11 | 	</parent>
12 | 
13 | 	<artifactId>boilerpipe-common</artifactId>
14 | 	<version>1.0.0-SNAPSHOT</version>
15 | 
16 | 	<dependencies>
17 | 		<dependency>
18 | 			<groupId>com.ibm.research.ai.ki</groupId>
19 | 			<artifactId>nekohtml</artifactId>
20 | 			<version>1.9.13-SNAPSHOT</version>
21 | 		</dependency>
22 | 
23 | 		<dependency>
24 | 			<groupId>xerces</groupId>
25 | 			<artifactId>xercesImpl</artifactId>
26 | 			<version>2.12.0</version>
27 | 			<!-- Apache License, Version 2.0 -->
28 | 		</dependency>
29 | 
30 | 	</dependencies>
31 | </project>
32 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeDocumentSource.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 | 
 22 | /**
 23 |  * Something that can be represented as a {@link TextDocument}; {@link #toTextDocument()} performs the conversion and may throw a {@link BoilerpipeProcessingException}.
 24 |  */
 25 | public interface BoilerpipeDocumentSource {
 26 |   TextDocument toTextDocument() throws BoilerpipeProcessingException;
 27 | }
28 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 | 
22 | /**
23 |  * A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and processes it somehow.
24 |  */
25 | public interface BoilerpipeFilter {
26 |   /**
27 |    * Processes the given document <code>doc</code>.
28 |    * 
29 |    * @param doc The {@link TextDocument} that is to be processed.
30 |    * @return <code>true</code> if changes have been made to the {@link TextDocument}.
31 |    * @throws BoilerpipeProcessingException
32 |    */
33 |   boolean process(final TextDocument doc) throws BoilerpipeProcessingException;
34 | }
35 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeInput.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextDocument;
21 | 
22 | /**
23 |  * A source that returns {@link TextDocument}s.
24 |  */
25 | public interface BoilerpipeInput {
26 |   /**
27 |    * Returns (somehow) a {@link TextDocument}.
28 |    * 
29 |    * @return A {@link TextDocument}.
30 |    * @throws BoilerpipeProcessingException
31 |    */
32 |   TextDocument getTextDocument() throws BoilerpipeProcessingException;
33 | }
34 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/BoilerpipeProcessingException.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe;
19 | 
/**
 * Checked exception used to report a failure somewhere in the processing pipeline.
 *
 * <p>The four constructors mirror those of {@link Exception} one-to-one.
 */
public class BoilerpipeProcessingException extends Exception {
  private static final long serialVersionUID = 1L;

  /**
   * Creates an exception without detail message and without cause.
   */
  public BoilerpipeProcessingException() {
    // The superclass no-arg constructor is invoked implicitly.
  }

  /**
   * Creates an exception carrying only a detail message.
   *
   * @param message the detail message.
   */
  public BoilerpipeProcessingException(final String message) {
    super(message);
  }

  /**
   * Creates an exception that wraps another throwable.
   *
   * @param cause the underlying cause.
   */
  public BoilerpipeProcessingException(final Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception carrying both a detail message and the underlying cause.
   *
   * @param message the detail message.
   * @param cause the underlying cause.
   */
  public BoilerpipeProcessingException(final String message, final Throwable cause) {
    super(message, cause);
  }
}
42 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/conditions/TextBlockCondition.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.conditions;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 | import com.kohlschutter.boilerpipe.labels.ConditionalLabelAction;
22 | 
23 | /**
24 |  * Evaluates whether a given {@link TextBlock} meets a certain condition.
25 |  * 
26 |  * Useful in combination with {@link ConditionalLabelAction}.
27 |  */
28 | public interface TextBlockCondition {
29 |   /**
30 |    * Returns <code>true</code> iff the given {@link TextBlock} tb meets the defined condition.
31 |    * 
32 |    * @param tb
33 |    * @return <code><true</code> iff the condition is met.
34 |    */
35 |   boolean meetsCondition(final TextBlock tb);
36 | }
37 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/BPAnnotation.java:
--------------------------------------------------------------------------------
 1 | package com.kohlschutter.boilerpipe.document;
 2 | 
 3 | /**
 4 |  * Used to represent structured elements of the html page that will be retained as offset annotations on the document.
 5 |  * @author mrglass
 6 |  *
 7 |  */
 8 | public abstract class BPAnnotation implements Cloneable {
 9 |     public static final boolean debug = false;
10 |     
11 |     //CONSIDER: tag type? like 'a' or 'h1' or 'b'
12 |     public int start;
13 |     public int end;
14 |     
15 |     public final String localName;
16 |     
17 |     protected BPAnnotation(String localName) {
18 |         this.start = 10000000;
19 |         this.end = -10000000;
20 |         this.localName = localName.toLowerCase();
21 |     }
22 |     
23 |     public boolean isValid() {
24 |         return end > start;
25 |     }
26 | 
27 |     public void addOffset(int offset) {
28 |         this.start += offset;
29 |         this.end += offset;
30 |     }
31 |     
32 |     public BPAnnotation clone() {
33 |         try {
34 |             return (BPAnnotation)super.clone();
35 |         } catch (CloneNotSupportedException e) {
36 |            throw new Error(e);
37 |         }
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/HeaderAnnotation.java:
--------------------------------------------------------------------------------
 1 | package com.kohlschutter.boilerpipe.document;
 2 | 
 3 | /**
 4 |  * HTML h* header annotation.
 5 |  * @author mrglass
 6 |  *
 7 |  */
 8 | public class HeaderAnnotation extends BPAnnotation {
 9 |     public HeaderAnnotation(String localName) {
10 |         super(localName);
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/Link.java:
--------------------------------------------------------------------------------
 1 | package com.kohlschutter.boilerpipe.document;
 2 | 
 3 | /**
 4 |  * HTML anchor tag as offset annotation
 5 |  * @author mrglass
 6 |  *
 7 |  */
 8 | public class Link extends BPAnnotation {
 9 |     public String href;
10 |     
11 |     public Link(String href) {
12 |         super("a");
13 |         this.href = href;
14 |     }
15 |     
16 |     public boolean isValid() {
17 |         return start < end && href != null;
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/ParagraphAnnotation.java:
--------------------------------------------------------------------------------
 1 | package com.kohlschutter.boilerpipe.document;
 2 | 
 3 | /**
 4 |  * HTML paragraph tag
 5 |  * @author mrglass
 6 |  *
 7 |  */
 8 | public class ParagraphAnnotation extends BPAnnotation {
 9 |     public ParagraphAnnotation() {
10 |         super("p");
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextDocumentStatistics.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.document;
19 | 
20 | /**
21 |  * Provides shallow statistics on a given {@link TextDocument}
22 |  */
23 | public final class TextDocumentStatistics {
24 |   private int numWords = 0;
25 |   private int numBlocks = 0;
26 | 
27 |   /**
28 |    * Computes statistics on a given {@link TextDocument}.
29 |    * 
30 |    * @param doc The {@link TextDocument}.
31 |    * @param contentOnly if true then o
32 |    */
33 |   public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) {
34 |     for (TextBlock tb : doc.getTextBlocks()) {
35 |       if (contentOnly && !tb.isContent()) {
36 |         continue;
37 |       }
38 | 
39 |       numWords += tb.getNumWords();
40 |       numBlocks++;
41 |     }
42 |   }
43 | 
44 |   /**
45 |    * Returns the average number of words at block-level (= overall number of words divided by the
46 |    * number of blocks).
47 |    * 
48 |    * @return Average
49 |    */
50 |   public float avgNumWords() {
51 |     return numWords / (float) numBlocks;
52 |   }
53 | 
54 |   /**
55 |    * Returns the overall number of words in all blocks.
56 |    * 
57 |    * @return Sum
58 |    */
59 |   public int getNumWords() {
60 |     return numWords;
61 |   }
62 | }
63 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/TextFormatAnnotation.java:
--------------------------------------------------------------------------------
1 | package com.kohlschutter.boilerpipe.document;
2 | 
3 | public class TextFormatAnnotation extends BPAnnotation {
4 |     public TextFormatAnnotation(String localName) {
5 |         super(localName);
6 |     }
7 | }
8 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/document/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * The Boilerpipe document model.
3 |  */
4 | package com.kohlschutter.boilerpipe.document;
5 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/estimators/SimpleEstimator.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.estimators;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor;
21 | import com.kohlschutter.boilerpipe.document.TextDocumentStatistics;
22 | import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
23 | import com.kohlschutter.boilerpipe.extractors.DefaultExtractor;
24 | 
25 | /**
26 |  * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document.
27 |  */
28 | public final class SimpleEstimator {
29 | 
30 |   /**
31 |    * Returns the singleton instance of {@link SimpleEstimator}
32 |    */
33 |   public static final SimpleEstimator INSTANCE = new SimpleEstimator();
34 | 
35 |   private SimpleEstimator() {
36 |   }
37 | 
38 |   /**
39 |    * Given the statistics of the document before and after applying the {@link BoilerpipeExtractor},
40 |    * can we regard the extraction quality (too) low?
41 |    * 
42 |    * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others.
43 |    * 
44 |    * @param dsBefore
45 |    * @param dsAfter
46 |    * @return true if low quality is to be expected.
47 |    */
48 |   public boolean isLowQuality(final TextDocumentStatistics dsBefore,
49 |       final TextDocumentStatistics dsAfter) {
50 |     if (dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70) {
51 |       return true;
52 |     }
53 | 
54 |     if (dsAfter.avgNumWords() < 25) {
55 |       return true;
56 |     }
57 | 
58 |     return false;
59 |   }
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/ArticleSentencesExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.simple.MinClauseWordsFilter;
23 | import com.kohlschutter.boilerpipe.filters.simple.SplitParagraphBlocksFilter;
24 | 
25 | /**
26 |  * A full-text extractor which is tuned towards extracting sentences from news articles.
27 |  */
28 | public final class ArticleSentencesExtractor extends ExtractorBase {
29 |   public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor();
30 | 
31 |   /**
32 |    * Returns the singleton instance for {@link ArticleSentencesExtractor}.
33 |    */
34 |   public static ArticleSentencesExtractor getInstance() {
35 |     return INSTANCE;
36 |   }
37 | 
38 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
39 |     return
40 | 
41 |     ArticleExtractor.INSTANCE.process(doc) | SplitParagraphBlocksFilter.INSTANCE.process(doc)
42 |         | MinClauseWordsFilter.INSTANCE.process(doc);
43 |   }
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/CommonExtractors.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeExtractor;
21 | 
22 | /**
23 |  * Provides quick access to common {@link BoilerpipeExtractor}s.
24 |  */
25 | public final class CommonExtractors {
26 |   private CommonExtractors() {
27 |   }
28 | 
29 |   /**
30 |    * Works very well for most types of Article-like HTML.
31 |    */
32 |   public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE;
33 | 
34 |   /**
35 |    * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics.
36 |    */
37 |   public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE;
38 | 
39 |   /**
40 |    * Like {@link DefaultExtractor}, but keeps the largest text block only.
41 |    */
42 |   public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR =
43 |       LargestContentExtractor.INSTANCE;
44 | 
45 |   /**
46 |    * Trained on krdwrd Canola (different definition of "boilerplate"). You may give it a try.
47 |    */
48 |   public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE;
49 | 
50 |   /**
51 |    * Dummy Extractor; should return the input text. Use this to double-check that your problem is
52 |    * within a particular {@link BoilerpipeExtractor}, or somewhere else.
53 |    */
54 |   public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR =
55 |       KeepEverythingExtractor.INSTANCE;
56 | }
57 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/DefaultExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.DensityRulesClassifier;
23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion;
24 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
25 | 
26 | /**
27 |  * A quite generic full-text extractor.
28 |  */
29 | public class DefaultExtractor extends ExtractorBase {
30 |   public static final DefaultExtractor INSTANCE = new DefaultExtractor();
31 | 
32 |   /**
33 |    * Returns the singleton instance for {@link DefaultExtractor}.
34 |    */
35 |   public static DefaultExtractor getInstance() {
36 |     return INSTANCE;
37 |   }
38 | 
39 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
40 | 
41 |     return
42 | 
43 |     SimpleBlockFusionProcessor.INSTANCE.process(doc)
44 |         | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
45 |         | DensityRulesClassifier.INSTANCE.process(doc);
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter;
23 | 
24 | /**
25 |  * Marks everything as content.
26 |  */
27 | public final class KeepEverythingExtractor extends ExtractorBase {
28 | 
29 |   public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor();
30 | 
31 |   private KeepEverythingExtractor() {
32 | 
33 |   }
34 | 
35 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
36 |     return MarkEverythingContentFilter.INSTANCE.process(doc);
37 |   }
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor;
23 | import com.kohlschutter.boilerpipe.filters.simple.MarkEverythingContentFilter;
24 | import com.kohlschutter.boilerpipe.filters.simple.MinWordsFilter;
25 | 
26 | /**
27 |  * A full-text extractor which extracts the largest text component of a page. For news articles, it
28 |  * may perform better than the {@link DefaultExtractor}, but usually worse than
29 |  * {@link ArticleExtractor}.
30 |  */
31 | public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase {
32 | 
33 |   private final MinWordsFilter filter;
34 | 
35 |   public KeepEverythingWithMinKWordsExtractor(final int kMin) {
36 |     this.filter = new MinWordsFilter(kMin);
37 |   }
38 | 
39 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
40 |     return SimpleBlockFusionProcessor.INSTANCE.process(doc)
41 |         | MarkEverythingContentFilter.INSTANCE.process(doc) | filter.process(doc);
42 |   }
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/LargestContentExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier;
23 | import com.kohlschutter.boilerpipe.filters.heuristics.BlockProximityFusion;
24 | import com.kohlschutter.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
25 | 
26 | /**
27 |  * A full-text extractor which extracts the largest text component of a page. For news articles, it
28 |  * may perform better than the {@link DefaultExtractor}, but usually worse than
29 |  * {@link ArticleExtractor}.
30 |  */
31 | public final class LargestContentExtractor extends ExtractorBase {
32 |   public static final LargestContentExtractor INSTANCE = new LargestContentExtractor();
33 | 
34 |   private LargestContentExtractor() {
35 |   }
36 | 
37 |   /**
38 |    * Returns the singleton instance for {@link LargestContentExtractor}.
39 |    */
40 |   public static LargestContentExtractor getInstance() {
41 |     return INSTANCE;
42 |   }
43 | 
44 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
45 |     return NumWordsRulesClassifier.INSTANCE.process(doc)
46 |         | BlockProximityFusion.MAX_DISTANCE_1.process(doc)
47 |         | KeepLargestBlockFilter.INSTANCE.process(doc);
48 |   }
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/NumWordsRulesExtractor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.extractors;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
21 | import com.kohlschutter.boilerpipe.document.TextDocument;
22 | import com.kohlschutter.boilerpipe.filters.english.NumWordsRulesClassifier;
23 | 
24 | /**
25 |  * A quite generic full-text extractor solely based upon the number of words per block (the current,
26 |  * the previous and the next block).
27 |  */
28 | public class NumWordsRulesExtractor extends ExtractorBase {
29 |   public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor();
30 | 
31 |   /**
32 |    * Returns the singleton instance for {@link NumWordsRulesExtractor}.
33 |    */
34 |   public static NumWordsRulesExtractor getInstance() {
35 |     return INSTANCE;
36 |   }
37 | 
38 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
39 | 
40 |     return NumWordsRulesClassifier.INSTANCE.process(doc);
41 |   }
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/extractors/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Some standard extractors (i.e., completely piped BoilerpipeFilters)
3 |  */
4 | package com.kohlschutter.boilerpipe.extractors;
5 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/HeuristicFilterBase.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.english;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 | 
22 | /**
23 |  * Common base for filters built on the "full-text words" heuristic: the words of a block are
24 |  * only counted when the block's text density reaches a minimum.
25 |  */
26 | abstract class HeuristicFilterBase {
27 | 
28 |   /** Same as {@link #getNumFullTextWords(TextBlock, float)} with a minimum text density of 9. */
29 |   protected static int getNumFullTextWords(final TextBlock tb) {
30 |     return getNumFullTextWords(tb, 9f);
31 |   }
32 | 
33 |   /** Returns the block's word count, or 0 if its text density is below {@code minTextDensity}. */
34 |   protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) {
35 |     final boolean denseEnough = tb.getTextDensity() >= minTextDensity;
36 |     return denseEnough ? tb.getNumWords() : 0;
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/MinFulltextWordsFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.english;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | 
25 | /**
26 |  * Turns content blocks with fewer than k full-text words into boilerplate; words are counted by
27 |  * {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}. The default instance uses k = 30.
28 |  */
29 | public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter {
30 |   public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter(30);
31 |   private final int minWords;
32 | 
33 |   /** Returns {@link #DEFAULT_INSTANCE}. */
34 |   public static MinFulltextWordsFilter getDefaultInstance() {
35 |     return DEFAULT_INSTANCE;
36 |   }
37 | 
38 |   /** Creates a filter that requires at least {@code minWords} full-text words per block. */
39 |   public MinFulltextWordsFilter(final int minWords) {
40 |     this.minWords = minWords;
41 |   }
42 | 
43 |   /**
44 |    * Clears the content flag of every content block that is below the word threshold.
45 |    *
46 |    * @return {@code true} if at least one block was changed
47 |    */
48 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
49 |     boolean modified = false;
50 |     for (final TextBlock block : doc.getTextBlocks()) {
51 |       final boolean tooShort = block.isContent() && getNumFullTextWords(block) < minWords;
52 |       if (tooShort) {
53 |         block.setIsContent(false);
54 |         modified = true;
55 |       }
56 |     }
57 |     return modified;
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/english/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * These BoilerpipeFilters have only been tested on English text.
3 |  * 
4 |  * That is, they will probably work with other Western languages, but may need some parameter tuning to perform well.
5 |  */
6 | package com.kohlschutter.boilerpipe.filters.english;
7 | 
8 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/ListAtEndFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.heuristics;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels;
25 | 
26 | /**
27 |  * Marks list-item blocks as content when they follow a very likely content block and are nested
28 |  * deeper than it (i.e. lists at the end of the main content).
29 |  */
30 | public final class ListAtEndFilter implements BoilerpipeFilter {
31 |   public static final ListAtEndFilter INSTANCE = new ListAtEndFilter();
32 | 
33 |   private ListAtEndFilter() {
34 |   }
35 | 
36 |   /** Flags qualifying list items as content; returns whether any block was flagged. */
37 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
38 |     boolean modified = false;
39 |     int anchorLevel = Integer.MAX_VALUE; // MAX_VALUE means: no anchor, no block can be deeper
40 |     for (final TextBlock block : doc.getTextBlocks()) {
41 |       if (block.isContent() && block.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
42 |         anchorLevel = block.getTagLevel();
43 |         continue;
44 |       }
45 |       final boolean listItemBelowAnchor = block.getTagLevel() > anchorLevel
46 |           && block.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
47 |           && block.hasLabel(DefaultLabels.LI) && block.getLinkDensity() == 0;
48 |       if (listItemBelowAnchor) {
49 |         block.setIsContent(true);
50 |         modified = true;
51 |       } else {
52 |         anchorLevel = Integer.MAX_VALUE;
53 |       }
54 |     }
55 |     return modified;
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.heuristics;
19 | 
20 | import java.util.Iterator;
21 | import java.util.List;
22 | 
23 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
24 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
25 | import com.kohlschutter.boilerpipe.document.TextBlock;
26 | import com.kohlschutter.boilerpipe.document.TextDocument;
27 | 
28 | /**
29 |  * Fuses a block into its predecessor whenever both report exactly the same text density.
30 |  */
31 | public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
32 |   public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();
33 | 
34 |   /**
35 |    * Returns the singleton instance for SimpleBlockFusionProcessor.
36 |    */
37 |   public static SimpleBlockFusionProcessor getInstance() {
38 |     return INSTANCE;
39 |   }
40 | 
41 |   /**
42 |    * Walks the block list once; a block whose text density equals that of the current survivor
43 |    * is merged into it via {@code mergeNext} and removed from the list.
44 |    *
45 |    * @return {@code true} if at least one merge happened
46 |    */
47 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
48 |     final List<TextBlock> blocks = doc.getTextBlocks();
49 |     if (blocks.size() < 2) {
50 |       return false; // fewer than two blocks: nothing to compare
51 |     }
52 |     boolean merged = false;
53 |     TextBlock survivor = blocks.get(0);
54 |     final Iterator<TextBlock> rest = blocks.listIterator(1);
55 |     while (rest.hasNext()) {
56 |       final TextBlock candidate = rest.next();
57 |       if (survivor.getTextDensity() != candidate.getTextDensity()) {
58 |         survivor = candidate;
59 |         continue;
60 |       }
61 |       survivor.mergeNext(candidate);
62 |       rest.remove();
63 |       merged = true;
64 |     }
65 |     return merged;
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/heuristics/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * These BoilerpipeFilters are pure heuristics.
3 |  */
4 | package com.kohlschutter.boilerpipe.filters.heuristics;
5 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/InvertedFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import java.util.List;
21 | 
22 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
23 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
24 | import com.kohlschutter.boilerpipe.document.TextBlock;
25 | import com.kohlschutter.boilerpipe.document.TextDocument;
26 | 
27 | /**
28 |  * Flips the "isContent" flag of every {@link TextBlock} in a document.
29 |  */
30 | public final class InvertedFilter implements BoilerpipeFilter {
31 |   public static final InvertedFilter INSTANCE = new InvertedFilter();
32 | 
33 |   private InvertedFilter() {
34 |   }
35 | 
36 |   /**
37 |    * Negates the content flag of each block.
38 |    *
39 |    * @return {@code false} if the document has no text blocks, {@code true} otherwise
40 |    */
41 |   public boolean process(TextDocument doc) throws BoilerpipeProcessingException {
42 |     final List<TextBlock> blocks = doc.getTextBlocks();
43 |     for (final TextBlock block : blocks) {
44 |       final boolean wasContent = block.isContent();
45 |       block.setIsContent(!wasContent);
46 |     }
47 |     return !blocks.isEmpty(); // with at least one block, every flag has been flipped
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToBoilerplateFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | import com.kohlschutter.boilerpipe.labels.DefaultLabels;
25 | 
26 | /**
27 |  * Marks all content blocks that carry at least one of the given labels as "boilerplate".
28 |  */
29 | public final class LabelToBoilerplateFilter implements BoilerpipeFilter {
30 |   public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT =
31 |       new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT);
32 | 
33 |   private final String[] labels;
34 | 
35 |   /** Keeps a private copy of the given labels; a {@code null} array means "no labels". */
36 |   public LabelToBoilerplateFilter(final String... label) {
37 |     this.labels = (label == null) ? new String[0] : label.clone();
38 |   }
39 | 
40 |   /** Clears the content flag of labelled content blocks; returns whether any block changed. */
41 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
42 |     boolean changes = false;
43 |     for (TextBlock tb : doc.getTextBlocks()) {
44 |       if (tb.isContent()) {
45 |         for (String label : labels) {
46 |           if (tb.hasLabel(label)) {
47 |             tb.setIsContent(false);
48 |             changes = true;
49 |             break; // one matching label is enough for this block
50 |           }
51 |         }
52 |       }
53 |     }
54 | 
55 |     return changes;
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/LabelToContentFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | 
25 | /**
26 |  * Marks all non-content blocks that carry at least one of the given labels as "content".
27 |  */
28 | public final class LabelToContentFilter implements BoilerpipeFilter {
29 |   private final String[] labels;
30 | 
31 |   /** Keeps a private copy of the given labels; a {@code null} array means "no labels". */
32 |   public LabelToContentFilter(final String... label) {
33 |     this.labels = (label == null) ? new String[0] : label.clone();
34 |   }
35 | 
36 |   /** Sets the content flag of labelled non-content blocks; returns whether any block changed. */
37 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
38 |     boolean changes = false;
39 |     for (TextBlock tb : doc.getTextBlocks()) {
40 |       if (!tb.isContent()) {
41 |         for (String label : labels) {
42 |           if (tb.hasLabel(label)) {
43 |             tb.setIsContent(true);
44 |             changes = true;
45 |             break; // one matching label is enough for this block
46 |           }
47 |         }
48 |       }
49 |     }
50 | 
51 |     return changes;
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingBoilerplateFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | 
25 | /**
26 |  * Turns every content block into boilerplate.
27 |  */
28 | public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter {
29 |   public static final MarkEverythingBoilerplateFilter INSTANCE =
30 |       new MarkEverythingBoilerplateFilter();
31 | 
32 |   private MarkEverythingBoilerplateFilter() {
33 |   }
34 | 
35 |   /**
36 |    * Clears the content flag wherever it is set; returns whether any block was changed.
37 |    */
38 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
39 |     boolean modified = false;
40 |     for (final TextBlock block : doc.getTextBlocks()) {
41 |       if (!block.isContent()) {
42 |         continue; // already boilerplate
43 |       }
44 |       block.setIsContent(false);
45 |       modified = true;
46 |     }
47 |     return modified;
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MarkEverythingContentFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | 
25 | /**
26 |  * Turns every boilerplate block into content.
27 |  */
28 | public final class MarkEverythingContentFilter implements BoilerpipeFilter {
29 |   public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter();
30 | 
31 |   private MarkEverythingContentFilter() {
32 |   }
33 | 
34 |   /**
35 |    * Sets the content flag wherever it is missing; returns whether any block was changed.
36 |    */
37 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
38 |     boolean modified = false;
39 |     for (final TextBlock block : doc.getTextBlocks()) {
40 |       if (block.isContent()) {
41 |         continue; // already content
42 |       }
43 |       block.setIsContent(true);
44 |       modified = true;
45 |     }
46 |     return modified;
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/MinWordsFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.filters.simple;
19 | 
20 | import com.kohlschutter.boilerpipe.BoilerpipeFilter;
21 | import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | import com.kohlschutter.boilerpipe.document.TextDocument;
24 | 
25 | /**
26 |  * Turns content blocks with fewer than <em>k</em> words into boilerplate.
27 |  */
28 | public final class MinWordsFilter implements BoilerpipeFilter {
29 |   private final int minWords;
30 | 
31 |   /** Creates a filter that requires at least {@code minWords} words per content block. */
32 |   public MinWordsFilter(final int minWords) {
33 |     this.minWords = minWords;
34 |   }
35 | 
36 |   /**
37 |    * Clears the content flag of every content block whose word count is below the minimum.
38 |    *
39 |    * @return {@code true} if at least one block was changed
40 |    */
41 |   public boolean process(final TextDocument doc) throws BoilerpipeProcessingException {
42 |     boolean modified = false;
43 |     for (final TextBlock block : doc.getTextBlocks()) {
44 |       final boolean tooShort = block.isContent() && block.getNumWords() < minWords;
45 |       if (tooShort) {
46 |         block.setIsContent(false);
47 |         modified = true;
48 |       }
49 |     }
50 | 
51 |     return modified;
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/filters/simple/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * These BoilerpipeFilters are straightforward and probably not really specific to English.
3 |  */
4 | package com.kohlschutter.boilerpipe.filters.simple;
5 | 
6 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/ConditionalLabelAction.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.labels;
19 | 
20 | import com.kohlschutter.boilerpipe.conditions.TextBlockCondition;
21 | import com.kohlschutter.boilerpipe.document.TextBlock;
22 | 
23 | /**
24 |  * Applies its labels to a {@link TextBlock} only if the block meets the given condition.
25 |  */
26 | public final class ConditionalLabelAction extends LabelAction {
27 |   private final TextBlockCondition condition;
28 | 
29 |   public ConditionalLabelAction(TextBlockCondition condition, String... labels) {
30 |     super(labels);
31 |     this.condition = condition;
32 |   }
33 | 
34 |   public void addTo(final TextBlock tb) {
35 |     if (!condition.meetsCondition(tb)) {
36 |       return; // condition not met: the block gets no labels
37 |     }
38 |     addLabelsTo(tb);
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/DefaultLabels.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.labels;
19 | 
20 | import com.kohlschutter.boilerpipe.document.TextBlock;
21 | 
22 | /**
23 |  * Pre-defined label names for use with {@link TextBlock#addLabel(String)} and
24 |  * {@link TextBlock#hasLabel(String)}; all except MARKUP_PREFIX start with "de.l3s.boilerpipe/".
25 |  */
26 | public final class DefaultLabels {
27 |   public static final String TITLE = "de.l3s.boilerpipe/TITLE";
28 |   public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
29 |   public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
30 |   public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
31 |   public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
32 |   public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
33 |   public static final String HR = "de.l3s.boilerpipe/HR";
34 |   public static final String LI = "de.l3s.boilerpipe/LI";
35 |   // Heading labels: a generic one plus one per level H1-H3.
36 |   public static final String HEADING = "de.l3s.boilerpipe/HEADING";
37 |   public static final String H1 = "de.l3s.boilerpipe/H1";
38 |   public static final String H2 = "de.l3s.boilerpipe/H2";
39 |   public static final String H3 = "de.l3s.boilerpipe/H3";
40 |   // NOTE(review): presumably the prefix of labels derived from markup; confirm at the usages.
41 |   public static final String MARKUP_PREFIX = "<";
42 | 
43 |   private DefaultLabels() {
44 |     // not to be instantiated
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/labels/LabelAction.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.labels;
19 | 
20 | import java.util.Arrays;
21 | 
22 | import com.kohlschutter.boilerpipe.document.TextBlock;
23 | 
24 | /**
25 |  * Attaches a fixed set of labels to {@link TextBlock}s.
26 |  *
27 |  * @see ConditionalLabelAction
28 |  */
29 | public class LabelAction {
30 |   protected final String[] labels;
31 | 
32 |   public LabelAction(String... labels) {
33 |     this.labels = labels;
34 |   }
35 | 
36 |   public void addTo(final TextBlock tb) {
37 |     addLabelsTo(tb); // unconditional; ConditionalLabelAction overrides addTo to check first
38 |   }
39 | 
40 |   protected final void addLabelsTo(final TextBlock tb) {
41 |     tb.addLabels(this.labels);
42 |   }
43 | 
44 |   public String toString() {
45 |     return super.toString() + "{" + Arrays.asList(this.labels) + "}";
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * The Boilerpipe top-level package.
3 |  */
4 | package com.kohlschutter.boilerpipe;
5 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/HTMLDocument.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.sax;
19 | 
20 | import java.io.ByteArrayInputStream;
21 | import java.nio.charset.Charset;
22 | 
23 | import org.xml.sax.InputSource;
24 | 
25 | /**
26 |  * An {@link InputSourceable} for {@link HTMLFetcher}, backed by HTML bytes and their charset.
27 |  */
28 | public class HTMLDocument implements InputSourceable {
29 |   private final byte[] data;
30 |   private final Charset charset;
31 | 
32 |   public HTMLDocument(final byte[] data, final Charset charset) {
33 |     this.charset = charset;
34 |     this.data = data;
35 |   }
36 | 
37 |   /** Stores the given HTML as UTF-8 encoded bytes. */
38 |   public HTMLDocument(final String data) {
39 |     this.charset = Charset.forName("utf-8");
40 |     this.data = data.getBytes(this.charset);
41 |   }
42 | 
43 |   public Charset getCharset() {
44 |     return this.charset;
45 |   }
46 | 
47 |   public byte[] getData() {
48 |     return this.data;
49 |   }
50 | 
51 |   public InputSource toInputSource() {
52 |     final InputSource source = new InputSource(new ByteArrayInputStream(this.data));
53 |     source.setEncoding(this.charset.name()); // declare the encoding of the byte stream
54 |     return source;
55 |   }
56 | }
57 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/InputSourceable.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.sax;
19 | 
20 | import org.xml.sax.InputSource;
21 | 
22 | /**
23 |  * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given
24 |  * document.
25 |  */
26 | public interface InputSourceable {
27 |   /**
28 |    * Returns an {@link InputSource} for this document.
29 |    *
30 |    * @return The {@link InputSource}
31 |    */
32 |   InputSource toInputSource();
33 | }
34 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagAction.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.sax;
19 | 
20 | import org.xml.sax.Attributes;
21 | import org.xml.sax.SAXException;
22 | 
23 | /**
24 |  * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing.
25 |  */
26 | public interface TagAction {
27 | 
28 |   boolean start(final BoilerpipeHTMLContentHandler instance, final String localName,
29 |       final String qName, final Attributes atts) throws SAXException;
30 | 
31 |   boolean end(final BoilerpipeHTMLContentHandler instance, final String localName,
32 |       final String qName) throws SAXException;
33 | 
34 |   boolean changesTagLevel();
35 | }


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/TagActionMap.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.sax;
19 | 
20 | import java.util.HashMap;
21 | 
22 | /**
23 |  * Base class for defining a set of {@link TagAction}s that are to be used for the HTML parsing
24 |  * process.
25 |  * 
26 |  * @see DefaultTagActionMap
27 |  */
28 | public abstract class TagActionMap extends HashMap<String, TagAction> {
29 |   private static final long serialVersionUID = 1L;
30 | 
31 |   /**
32 |    * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag will
33 |    * be removed and overwritten.
34 |    * <p>
35 |    * The case variants are computed with {@link java.util.Locale#ROOT}, so the stored keys do not
36 |    * depend on the JVM's default locale (with a Turkish default locale, for example, the
37 |    * locale-sensitive conversions map "i" to a dotted capital I and "I" to a dotless small i,
38 |    * which would never match the tag names delivered by the parser).
39 |    * 
40 |    * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
41 |    * @param action The {@link TagAction}
42 |    */
43 |   protected void setTagAction(final String tag, final TagAction action) {
44 |     put(tag.toUpperCase(java.util.Locale.ROOT), action);
45 |     put(tag.toLowerCase(java.util.Locale.ROOT), action);
46 |     put(tag, action);
47 |   }
48 | 
49 |   /**
50 |    * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that
51 |    * tag, a chained action, consisting of the previous and the new {@link TagAction} is created.
52 |    * <p>
53 |    * The existing action is looked up under the given spelling first and then under the
54 |    * lower-case spelling, so that an action registered as e.g. "foo" is still found (and chained
55 |    * rather than overwritten) when a later call uses a mixed-case spelling such as "Foo".
56 |    * 
57 |    * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
58 |    * @param action The {@link TagAction}
59 |    */
60 |   protected void addTagAction(final String tag, final TagAction action) {
61 |     TagAction previousAction = get(tag);
62 |     if (previousAction == null) {
63 |       // setTagAction always stores the lower-case key, so it serves as the canonical fallback.
64 |       previousAction = get(tag.toLowerCase(java.util.Locale.ROOT));
65 |     }
66 |     if (previousAction == null) {
67 |       setTagAction(tag, action);
68 |     } else {
69 |       setTagAction(tag, new CommonTagActions.Chained(previousAction, action));
70 |     }
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/sax/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Classes related to parsing and producing HTML from/to Boilerpipe TextDocuments.
3 |  */
4 | package com.kohlschutter.boilerpipe.sax;
5 | 
6 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/UnicodeTokenizer.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * boilerpipe
 3 |  *
 4 |  * Copyright (c) 2009, 2014 Christian Kohlschütter
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.kohlschutter.boilerpipe.util;
19 | 
20 | import java.util.regex.Pattern;
21 | 
22 | /**
23 |  * Tokenizes text according to Unicode word boundaries and strips off non-word characters.
24 |  */
25 | public class UnicodeTokenizer {
26 |   /** Matches every (zero-width) word boundary. */
27 |   private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b");
28 |   /** Matches one of the listed punctuation characters plus any boundary markers around it. */
29 |   private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern
30 |       .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*");
31 |   /** Matches a run of spaces and/or boundary markers. Precompiled: used on every call. */
32 |   private static final Pattern PAT_SEPARATOR_RUN = Pattern.compile("[ \u2063]+");
33 |   /** Matches a run of spaces. Precompiled: used on every call. */
34 |   private static final Pattern PAT_SPACE_RUN = Pattern.compile("[ ]+");
35 | 
36 |   /**
37 |    * Tokenizes the text and returns an array of tokens.
38 |    * <p>
39 |    * Word boundaries are first marked with the invisible separator character (U+2063); markers
40 |    * adjacent to the punctuation characters listed in the pattern above are then removed again,
41 |    * so that such punctuation stays attached to the neighbouring text. Finally, runs of markers
42 |    * and spaces are collapsed to a single space, the result is trimmed, and split at spaces.
43 |    * For empty input the result is an array containing one empty string.
44 |    * 
45 |    * @param text The text
46 |    * @return The tokens
47 |    */
48 |   public static String[] tokenize(final CharSequence text) {
49 |     final String marked = PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063");
50 |     final String rejoined = PAT_NOT_WORD_BOUNDARY.matcher(marked).replaceAll("$1");
51 |     final String normalized = PAT_SEPARATOR_RUN.matcher(rejoined).replaceAll(" ").trim();
52 |     return PAT_SPACE_RUN.split(normalized);
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/boilerpipe/boilerpipe-common/src/main/java/com/kohlschutter/boilerpipe/util/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Some helper classes.
3 |  */
4 | package com.kohlschutter.boilerpipe.util;
5 | 
6 | 


--------------------------------------------------------------------------------
/boilerpipe/nekohtml/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 3 |   <parent>
 4 |     <artifactId>cc-dbp-parent-pom</artifactId>
 5 |     <groupId>com.ibm.research.ai.ki</groupId>
 6 |     <version>1.0.0-SNAPSHOT</version>
 7 |     <relativePath>../../pom.xml</relativePath>
 8 |   </parent>
 9 |   <modelVersion>4.0.0</modelVersion>
10 |   <artifactId>nekohtml</artifactId>
11 |   <version>1.9.13-SNAPSHOT</version>
12 |   <build>
13 |     <plugins>
14 |       <plugin>
15 |         <artifactId>maven-shade-plugin</artifactId>
16 |         <version>2.3</version>
17 |         <executions>
18 |           <execution>
19 |             <phase>package</phase>
20 |             <goals>
21 |               <goal>shade</goal>
22 |             </goals>
23 |             <configuration>
24 |               <artifactSet>
25 |                 <includes>
26 |                   <include>net.sourceforge.nekohtml:nekohtml</include>
27 |                 </includes>
28 |               </artifactSet>
29 |               <promoteTransitiveDependencies>true</promoteTransitiveDependencies>
30 |             </configuration>
31 |           </execution>
32 |         </executions>
33 |       </plugin>
34 |     </plugins>
35 |   </build>
36 |   <dependencies>
37 |     <dependency>
38 |       <groupId>xerces</groupId>
39 |       <artifactId>xercesImpl</artifactId>
40 |       <version>2.9.1</version>
41 |       <scope>compile</scope>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>xml-apis</groupId>
45 |       <artifactId>xml-apis</artifactId>
46 |       <version>1.3.04</version>
47 |       <scope>compile</scope>
48 |     </dependency>
49 |   </dependencies>
50 | </project>
51 | 
52 | 


--------------------------------------------------------------------------------
/boilerpipe/nekohtml/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 4 | 	<modelVersion>4.0.0</modelVersion>
 5 | 	
 6 | 	<parent>
 7 | 		<groupId>com.ibm.research.ai.ki</groupId>
 8 | 		<artifactId>cc-dbp-parent-pom</artifactId>
 9 | 		<version>1.0.0-SNAPSHOT</version>
10 | 		<relativePath>../..</relativePath>
11 | 	</parent>
12 | 
13 | 	<artifactId>nekohtml</artifactId>
14 | 	<version>1.9.13-SNAPSHOT</version>
15 | 
16 | 	<build>
17 | 		<plugins>
18 | 			<plugin>
19 | 				<artifactId>maven-shade-plugin</artifactId>
20 | 				<version>2.3</version>
21 | 				<executions>
22 | 					<execution>
23 | 						<phase>package</phase>
24 | 						<goals>
25 | 							<goal>shade</goal>
26 | 						</goals>
27 | 						<configuration>
28 | 							<artifactSet>
29 | 								<includes>
30 | 									<include>net.sourceforge.nekohtml:nekohtml</include>
31 | 								</includes>
32 | 							</artifactSet>
33 | 							<promoteTransitiveDependencies>true</promoteTransitiveDependencies>
34 | 						</configuration>
35 | 					</execution>
36 | 				</executions>
37 | 			</plugin>
38 | 		</plugins>
39 | 	</build>
40 | 
41 | 	<dependencies>
42 | 		<dependency>
43 | 			<groupId>net.sourceforge.nekohtml</groupId>
44 | 			<artifactId>nekohtml</artifactId>
45 | 			<version>1.9.13</version>
46 | 			<!-- Apache License, Version 2.0 -->
47 | 		</dependency>
48 | 	</dependencies>
49 | </project>
50 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CharsetDetect.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.corpora.crawl;
19 | 
20 | import java.io.*;
21 | import java.nio.charset.*;
22 | 
23 | import org.mozilla.universalchardet.*;
24 | 
25 | /**
26 |  * Detects the character encoding of raw bytes using juniversalchardet's UniversalDetector.
27 |  */
28 | public class CharsetDetect {
29 |     /** Name returned when no charset name is available or the name is not a legal charset name. */
30 |     private static final String FALLBACK_CHARSET = "UTF-8";
31 | 
32 |     /**
33 |      * Maps a detected charset name to the name that should be used for decoding.
34 |      * <ul>
35 |      * <li>{@code null} or a syntactically illegal name yields "UTF-8";</li>
36 |      * <li>a name supported by this JVM is returned unchanged;</li>
37 |      * <li>an unsupported name containing "iso8859-1" or "iso-8859-1" (ignoring case) yields "cp1252";</li>
38 |      * <li>any other unsupported name is returned unchanged, so callers must be prepared for a
39 |      * name the JVM cannot resolve.</li>
40 |      * </ul>
41 |      *
42 |      * @param charsetName the detected name, may be null
43 |      * @return the charset name to use, never null
44 |      */
45 |     static String mapCharset(String charsetName) {
46 |         if (charsetName == null) {
47 |             return FALLBACK_CHARSET;
48 |         }
49 |         try {
50 |             if (Charset.isSupported(charsetName))
51 |                 return charsetName;
52 |         } catch (IllegalArgumentException e) {
53 |             // IllegalCharsetNameException (a subclass) signals a malformed name. Only this is
54 |             // caught, so that Errors such as OutOfMemoryError are no longer swallowed.
55 |             return FALLBACK_CHARSET;
56 |         }
57 |         // Locale.ROOT keeps the comparison independent of the default locale (under a Turkish
58 |         // locale "ISO" would otherwise lower-case to a string that does not contain "iso").
59 |         String lc = charsetName.toLowerCase(java.util.Locale.ROOT);
60 |         if (lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
61 |             return "cp1252";
62 |         }
63 |         return charsetName;
64 |     }
65 |     
66 |     /**
67 |      * Runs the detector over the whole buffer and returns the mapped charset name.
68 |      *
69 |      * @param buffer the bytes to inspect
70 |      * @return the charset name as mapped by {@link #mapCharset(String)}; "UTF-8" when the
71 |      *         detector reports no charset
72 |      * @throws IOException declared for callers; nothing in this method is visibly throwing it
73 |      */
74 |     public static String getCharsetFromBytes(byte buffer[]) throws IOException {
75 |         UniversalDetector detector = new UniversalDetector(null);
76 |         detector.handleData(buffer, 0, buffer.length);
77 |         detector.dataEnd();
78 |         String charsetName = detector.getDetectedCharset();
79 |         detector.reset();
80 |         return mapCharset(charsetName);
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/java/com/ibm/research/ai/ki/corpora/crawl/CommonCrawlConfig.java:
--------------------------------------------------------------------------------
 1 | package com.ibm.research.ai.ki.corpora.crawl;
 2 | 
 3 | import com.ibm.research.ai.ki.util.*;
 4 | 
 5 | /**
 6 |  * Settings for downloading and filtering Common Crawl. The public fields are the configurable
 7 |  * properties; values for them are also listed in cc-dbp.properties.
 8 |  */
 9 | public class CommonCrawlConfig extends PropertyStruct {
10 |     private static final long serialVersionUID = 1L;
11 | 
12 |     /**
13 |      * See https://github.com/optimaize/language-detector for language options
14 |      */
15 |     public String language = "en";
16 |     /**
17 |      * The language detector is typically very confident, most values are close to one or zero
18 |      */
19 |     public double minLanguageConfidence = 0.8;
20 |     /**
21 |      * Possible options are LinkAnnotation, SectionHeader, Paragraph and TextFormating.
22 |      * LinkAnnotation retains the anchor tag information (which spans of text are links and where they link to).
23 |      */
24 |     public String[] annotationTypes = new String[] {"LinkAnnotation"};
25 |     /**
26 |      * Number of threads downloading parts of Common Crawl, also the number of part files that will be created.
27 |      */
28 |     public int numThreads = 8;
29 |     /**
30 |      * URL prefix to add to the WARC file list
31 |      */
32 |     public String urlPrefix = "https://commoncrawl.s3.amazonaws.com/";
33 |     
34 |     /**
35 |      * To download only a portion of common crawl, limited to this many files.
36 |      * Has no initializer, so it is 0 unless set by configuration.
37 |      * NOTE(review): how a value of 0 is interpreted is decided by code not visible here -- confirm.
38 |      */
39 |     public int warcFileLimit;
40 | }
41 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/cc-dbp/cc-dbp.properties:
--------------------------------------------------------------------------------
 1 | #CommonCrawlConfig
 2 | 
 3 | language=en
 4 | minLanguageConfidence=0.8
 5 | numThreads=8
 6 | annotationTypes = [LinkAnnotation]
 7 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
 8 | 
 9 | 
10 | 
11 | #limit the download to this many WARC files (to fetch only a portion of Common Crawl)
12 | warcFileLimit=10


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=WARN, stdout
3 |  
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.corpus/src/main/resources/simplelogger.properties:
--------------------------------------------------------------------------------
 1 | # SLF4J's SimpleLogger configuration file
 2 | # Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err.
 3 | 
 4 | # Default logging detail level for all instances of SimpleLogger.
 5 | # Must be one of ("trace", "debug", "info", "warn", or "error").
 6 | # If not specified, defaults to "info".
 7 | org.slf4j.simpleLogger.defaultLogLevel=warn
 8 | 
 9 | # Logging detail level for a SimpleLogger instance named "xxxxx".
10 | # Must be one of ("trace", "debug", "info", "warn", or "error").
11 | # If not specified, the default logging detail level is used.
12 | #org.slf4j.simpleLogger.log.xxxxx=
13 | 
14 | # Set to true if you want the current date and time to be included in output messages.
15 | # Default is false, and will output the number of milliseconds elapsed since startup.
16 | #org.slf4j.simpleLogger.showDateTime=false
17 | 
18 | # The date and time format to be used in the output messages.
19 | # The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat.
20 | # If the format is not specified or is invalid, the default format is used.
21 | # The default format is yyyy-MM-dd HH:mm:ss:SSS Z.
22 | #org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z
23 | 
24 | # Set to true if you want to output the current thread name.
25 | # Defaults to true.
26 | #org.slf4j.simpleLogger.showThreadName=true
27 | 
28 | # Set to true if you want the Logger instance name to be included in output messages.
29 | # Defaults to true.
30 | #org.slf4j.simpleLogger.showLogName=true
31 | 
32 | # Set to true if you want the last component of the name to be included in output messages.
33 | # Defaults to false.
34 | #org.slf4j.simpleLogger.showShortLogName=false


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/README.md:
--------------------------------------------------------------------------------
1 | 1) ConvertDBpedia 
2 | deal with the whole 'M' suffix thing
3 | 
4 | Optional: get idCounts.tsv (only have spark version for this right now) this requires running BuildGazetteer on the unfiltered
5 | 
6 | 2) BuildGroundTruth
7 | 3) BuildGazetteer
8 | 4) TypePairFilter
9 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 | 	<modelVersion>4.0.0</modelVersion>
 3 | 
 4 | 	<parent>
 5 | 		<groupId>com.ibm.research.ai.ki</groupId>
 6 | 		<artifactId>cc-dbp-parent-pom</artifactId>
 7 | 		<version>1.0.0-SNAPSHOT</version>
 8 | 	</parent>
 9 | 
10 | 	<artifactId>kb</artifactId>
11 | 	<version>1.0.0-SNAPSHOT</version>
12 | 
13 | 	<dependencies>
14 | 		<dependency>
15 | 			<groupId>com.ibm.research.ai.ki</groupId>
16 | 			<artifactId>util</artifactId>
17 | 			<version>1.0.0-SNAPSHOT</version>
18 | 		</dependency>
19 | 		
20 | 		<dependency>
21 | 			<groupId>com.ibm.research.ai.ki</groupId>
22 | 			<artifactId>nlp</artifactId>
23 | 			<version>1.0.0-SNAPSHOT</version>
24 | 		</dependency>
25 | 		
26 | 		<dependency>
27 | 			<groupId>com.ibm.research.ai.ki</groupId>
28 | 			<artifactId>kbp</artifactId>
29 | 			<version>1.0.0-SNAPSHOT</version>
30 | 		</dependency>
31 | 	</dependencies>
32 | 
33 | </project>


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/ConfigureMinMaxEntityFreq.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kb;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | /**
26 |  * Shows examples of entities that occur at different frequency ranges, so that a sensible maximum occurrence frequency can be selected, 
27 |  * and possibly a minimum occurrence frequency.
28 |  * @author mrglass
29 |  *
30 |  */
31 | public class ConfigureMinMaxEntityFreq {
32 |     /**
33 |      * Reads the id-count file from the KB directory, samples entries into buckets indexed by the
34 |      * truncated natural logarithm of their count, and prints each bucket to standard output.
35 |      *
36 |      * @param args args[0] is the KB directory containing the id-count file
37 |      */
38 |     public static void main(String[] args) {
39 |         File countsFile = new File(args[0], KBFiles.idCountsTsv);
40 | 
41 |         RandomUtil.Sample<String>[] buckets = new RandomUtil.Sample[20];
42 |         for (int b = 0; b < buckets.length; ++b) {
43 |             buckets[b] = new RandomUtil.Sample<>(20);
44 |         }
45 | 
46 |         Map<String,MutableDouble> counts = SparseVectors.fromString(FileUtil.readFileAsString(countsFile));
47 |         for (Map.Entry<String, MutableDouble> entry : counts.entrySet()) {
48 |             MutableDouble count = entry.getValue();
49 |             // bucket index = truncated ln(count), clamped to the valid index range
50 |             int idx = Math.min(Math.max((int)Math.log(count.value), 0), buckets.length-1);
51 |             String line = Lang.LPAD(""+((int)count.value), 10)+" "+entry.getKey();
52 |             buckets[idx].maybeSave(line);
53 |         }
54 | 
55 |         for (RandomUtil.Sample<String> bucket : buckets) {
56 |             System.out.println("=======================================");
57 |             System.out.println(Lang.stringList(bucket, "\n"));
58 |         }
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/GroundTruthConfig.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kb;
19 | 
20 | import com.ibm.research.ai.ki.util.*;
21 | 
22 | /**
23 |  * Configuration values for building the ground truth, exposed as public fields.
24 |  * NOTE(review): the fields resemble the similarly named settings in KBConfig
25 |  * (minNodeCorpusCount, maxNodeCorpusCount, minUnaryCount, useRelationTaxonomy) -- confirm
26 |  * whether both classes are still needed.
27 |  */
28 | public class GroundTruthConfig extends PropertyStruct {
29 |     private static final long serialVersionUID = 1L;
30 | 
31 |     
32 |     // presumably the minimum / maximum corpus occurrence count for a term to be used -- TODO confirm
33 |     public int minCorpusCount = 1;
34 |     public int maxCorpusCount = 300000;
35 |     // NOTE(review): semantics not visible in this file -- confirm against the code that reads it
36 |     public int minUnaryCount = 100;
37 |     // presumably whether super-relations are taken into account -- TODO confirm
38 |     public boolean useRelationTaxonomy = true;
39 |     
40 |     //CONSIDER: also type selection config
41 | }
42 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBConfig.java:
--------------------------------------------------------------------------------
 1 | package com.ibm.research.ai.ki.kb;
 2 | 
 3 | import java.io.*;
 4 | 
 5 | import com.ibm.research.ai.ki.util.*;
 6 | 
 7 | /**
 8 |  * Configuration for building the knowledge base, exposed as public fields.
 9 |  */
10 | public class KBConfig extends PropertyStruct {
11 |     private static final long serialVersionUID = 1L;
12 | 
13 |     /**
14 |      * Path of the KB directory; see {@link #kbDir()}. Has no default.
15 |      */
16 |     public String kbDir;
17 |     
18 |     /**
19 |      * To avoid generic terms, we ignore terms that occur more than this many times.
20 |      * NOTE(review): the default here is 3000000, while dbpediaConfig.properties sets 300000 --
21 |      * confirm which value is intended.
22 |      */
23 |     public int maxNodeCorpusCount = 3000000;
24 |     /**
25 |      * We can ignore rare terms if desired.
26 |      */
27 |     public int minNodeCorpusCount = 1;
28 |     /**
29 |      * NOTE(review): undocumented in the original; semantics are not visible in this file --
30 |      * confirm against the code that reads it.
31 |      */
32 |     public int minUnaryCount = 100;
33 |     /**
34 |      * Whether to consider super-relations in the labels for context sets.
35 |      */
36 |     public boolean useRelationTaxonomy = true;
37 |     
38 |     //for the coarse-grained type system
39 |     /**
40 |      * A type must have this many instances for which it is the most specific type
41 |      */
42 |     public int minTypeSize = 3000;
43 |     /**
44 |      * We will have no more than this many types in the coarse grained type system
45 |      */
46 |     public int maxNumberOfTypes = 100;
47 |     
48 |     //for the type filter
49 |     /**
50 |      * If an unordered type-pair does not have at least this many triples, it will not have any contexts generated.
51 |      * So if number-number relations never occur, we will never generate contexts for a number-number node-pair.
52 |      */
53 |     public int minTypePairFreq = 1;
54 | 
55 |     // presumably the unary counterpart of minTypePairFreq -- TODO confirm
56 |     public int minTypeFreqForUnary = 1;
57 |     
58 |     
59 |     /**
60 |      * Returns {@link #kbDir} as a {@link File}.
61 |      *
62 |      * @throws NullPointerException if {@link #kbDir} has not been set (File's constructor rejects null)
63 |      */
64 |     public File kbDir() {
65 |         return new File(kbDir);
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/KBFiles.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kb;
19 | 
20 | /**
21 |  * Files that can be present in a kb directory
22 |  * @author mrglass
23 |  *
24 |  */
25 | public class KBFiles {
26 |     // File names only; callers resolve them against a KB directory,
27 |     // e.g. new File(kbDir, KBFiles.idCountsTsv).
28 |     public static final String triplesTsv = "triples.tsv";
29 |     public static final String labelsTsv = "labels.tsv";
30 |     public static final String relationTaxonomyTsv = "relationTaxonomy.tsv";
31 |     public static final String typesTsv = "types.tsv";
32 |     public static final String popularityTsv = "popularity.tsv";
33 |     //from DocEntityStats in ie.spark
34 |     public static final String idCountsTsv = "idCounts.tsv";
35 | }
36 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/conversion/DBpediaKBConfig.java:
--------------------------------------------------------------------------------
 1 | package com.ibm.research.ai.ki.kb.conversion;
 2 | 
 3 | import java.io.*;
 4 | 
 5 | import com.ibm.research.ai.ki.kb.*;
 6 | 
 7 | /**
 8 |  * {@link KBConfig} extended with the download URLs of the DBpedia files (values are listed in
 9 |  * dbpediaConfig.properties) and accessors for the local files they map to.
10 |  */
11 | public class DBpediaKBConfig extends KBConfig {
12 |     private static final long serialVersionUID = 1L;
13 |     
14 |     // URL of the DBpedia ontology (.owl) file
15 |     public String dbpediaOwlUrl;
16 |     
17 |     // URL of the mapping-based objects dump
18 |     public String objectsUrl;
19 |     
20 |     // URL of the mapping-based literals dump
21 |     public String literalsUrl;
22 |     
23 |     // URL of the labels dump
24 |     public String labelsUrl;
25 |     
26 |     // URL of the instance types dump
27 |     public String typesUrl;
28 |     
29 |     /**
30 |      * We can construct the KB without using idCounts.tsv if desired. Since getting idCounts.tsv requires running a gazetteer over the corpus and is potentially slow.
31 |      */
32 |     public boolean noNodeCorpusCounts;
33 |     
34 |     
35 |     /**
36 |      * Maps a URL to a file in kbDir named after the part of the URL following its last '/'
37 |      * (the whole string when it contains no '/').
38 |      *
39 |      * @param url the URL; must not be null
40 |      * @return the file inside kbDir
41 |      */
42 |     protected File file(String url) {
43 |         return new File(kbDir, url.substring(url.lastIndexOf('/')+1));
44 |     }
45 |     
46 |     /** @return the file in kbDir corresponding to {@link #dbpediaOwlUrl} */
47 |     public File dbpediaOwlFile() {
48 |         return file(dbpediaOwlUrl);
49 |     }
50 |     
51 |     /** @return the file in kbDir corresponding to {@link #objectsUrl} */
52 |     public File objectsFile() {
53 |         return file(objectsUrl);
54 |     }
55 |     
56 |     /** @return the file in kbDir corresponding to {@link #literalsUrl} */
57 |     public File literalsFile() {
58 |         return file(literalsUrl);
59 |     }
60 |     
61 |     /** @return the file in kbDir corresponding to {@link #labelsUrl} */
62 |     public File labelsFile() {
63 |         return file(labelsUrl);
64 |     }
65 |     
66 |     /** @return the file in kbDir corresponding to {@link #typesUrl} */
67 |     public File typesFile() {
68 |         return file(typesUrl);
69 |     }
70 | }
71 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/java/com/ibm/research/ai/ki/kb/explore/FilterByCorpusCount.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kb.explore;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.ibm.research.ai.ki.formats.*;
24 | import com.ibm.research.ai.ki.util.*;
25 | 
26 | 
27 | public class FilterByCorpusCount {
28 |     
29 |     public static void main(String[] args) {
30 |         String kbDir = args[0];
31 |         String kbDirFiltered = args[1];
32 |         int minCount = 1;
33 |         if (args.length > 2)
34 |             minCount = Integer.parseInt(args[2]);
35 |         Map<String,MutableDouble> idCounts = SparseVectors.fromString(FileUtil.readFileAsString(new File(kbDir, "idCounts.tsv")));
36 |         try (PrintStream out = FileUtil.getFilePrintStream(new File(kbDirFiltered, "labels.tsv").getAbsolutePath())) {
37 |             for (String[] lbl : new SimpleTsvIterable(new File(kbDir, "labels.tsv"))) {
38 |                 if (SparseVectors.getDefaultZero(idCounts, lbl[0]) >= minCount) {
39 |                     out.println(Lang.stringList(lbl, "\t"));
40 |                 }
41 |             }
42 |         }
43 |         try (PrintStream out = FileUtil.getFilePrintStream(new File(kbDirFiltered, "triples.tsv").getAbsolutePath())) {
44 |             for (String[] trip : new SimpleTsvIterable(new File(kbDir, "triples.tsv"))) {
45 |                 if (SparseVectors.getDefaultZero(idCounts, trip[0]) >= minCount && SparseVectors.getDefaultZero(idCounts, trip[2]) >= minCount) {
46 |                     out.println(Lang.stringList(trip, "\t"));
47 |                 }
48 |             }
49 |         }
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kb/src/main/resources/dbpediaConfig.properties:
--------------------------------------------------------------------------------
 1 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
 2 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
 3 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
 4 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
 5 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
 6 | 
 7 | #for ground truth
 8 | maxNodeCorpusCount = 300000
 9 | minNodeCorpusCount = 1
10 | useRelationTaxonomy = True
11 |     
12 | #for the coarse-grained type system
13 | minTypeSize = 3000
14 | maxNumberOfTypes = 100
15 |     
16 | #for the type filter
17 | minTypePairFreq = 1
18 | 
19 | noNodeCorpusCounts = False
20 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 | 	<modelVersion>4.0.0</modelVersion>
 3 | 
 4 | 	<parent>
 5 | 		<groupId>com.ibm.research.ai.ki</groupId>
 6 | 		<artifactId>cc-dbp-parent-pom</artifactId>
 7 | 		<version>1.0.0-SNAPSHOT</version>
 8 | 	</parent>
 9 | 
10 | 	<artifactId>kbp</artifactId>
11 | 	<version>1.0.0-SNAPSHOT</version>
12 | 
13 | 	<dependencies>
14 | 		<dependency>
15 | 			<groupId>com.ibm.research.ai.ki</groupId>
16 | 			<artifactId>util</artifactId>
17 | 			<version>1.0.0-SNAPSHOT</version>
18 | 		</dependency>
19 | 		
20 | 		<dependency>
21 | 			<groupId>com.ibm.research.ai.ki</groupId>
22 | 			<artifactId>nlp</artifactId>
23 | 			<version>1.0.0-SNAPSHOT</version>
24 | 		</dependency>
25 | 	
26 | 	
27 | 		<dependency>
28 | 			<groupId>org.apache.wink</groupId>
29 | 			<artifactId>wink-json4j</artifactId>
30 | 			<version>${wink-json4j.version}</version>
31 | 			<!-- Apache License, Version 2.0 -->
32 | 		</dependency>
33 | 		<dependency>
34 | 			<groupId>com.google.guava</groupId>
35 | 			<artifactId>guava</artifactId>
36 | 			<version>${guava.version}</version>
37 | 			<!-- Apache License, Version 2.0 -->
38 | 		</dependency>
39 | 		<dependency>
40 | 			<groupId>org.apache.commons</groupId>
41 | 			<artifactId>commons-lang3</artifactId>
42 | 			<version>${commons-lang3.version}</version>
43 | 			<!-- Apache License, Version 2.0 -->
44 | 		</dependency>
45 | 		<dependency>
46 | 			<groupId>commons-cli</groupId>
47 | 			<artifactId>commons-cli</artifactId>
48 | 			<version>${commons-cli.version}</version>
49 | 			<!-- Apache License, Version 2.0 -->
50 | 		</dependency>
51 | 		<dependency>
52 | 			<groupId>it.unimi.dsi</groupId>
53 | 			<artifactId>fastutil</artifactId>
54 | 			<version>7.1.0</version>
55 | 			<!-- Apache License, Version 2.0 -->
56 | 		</dependency>
57 | 	</dependencies>
58 | 
59 | </project>


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/CoveredTextEntityId.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.util.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 | 
25 | /**
26 |  * For those entities without an id, we simply give them an id equal to the covered text:
27 |  * lower-cased, trimmed, and with every run of whitespace collapsed to a single space.
28 |  * So it is a text-equals entity linker.
29 |  * @author mrglass
30 |  *
31 |  */
32 | public class CoveredTextEntityId implements IPostprocessEntityRecognition {
33 |     private static final long serialVersionUID = 1L;
34 | 
35 |     /** Nothing is read from the properties. */
36 |     @Override
37 |     public void initialize(Properties config) {}
38 | 
39 |     /**
40 |      * Sets the id of every EntityWithId annotation whose id is null; ids that are already set are left alone.
41 |      */
42 |     @Override
43 |     public void process(Document doc) {
44 |         for (EntityWithId e : doc.getAnnotations(EntityWithId.class)) {
45 |             if (e.id == null) {
46 |                 //Locale.ROOT keeps the generated ids the same whatever the default locale of the machine is
47 |                 e.id = e.coveredText(doc).toLowerCase(Locale.ROOT).trim().replaceAll("\\s+", " ");
48 |             }
49 |         }
50 |     }
51 | 
52 |     /** Neither the ground truth nor the config is used. */
53 |     @Override
54 |     public void initialize(IGroundTruth gt, RelexConfig config) {}
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/DocumentFeatureString.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 | /** A DocumentStructure that carries a single feature string. */
22 | public class DocumentFeatureString implements DocumentStructure {
23 |     private static final long serialVersionUID = 1L;
24 |     /** The wrapped feature string; the field is public and not final, so it can be reassigned. */
25 |     public String featureString;
26 |     /** Stores the given string in the featureString field, without copying or checking it. */
27 |     public DocumentFeatureString(String featureString) {
28 |         this.featureString = featureString;
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/FilterEntsByGroundTruth.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.util.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 | 
25 | public class FilterEntsByGroundTruth implements IPostprocessEntityRecognition {
26 | 	private static final long serialVersionUID = 1L;
27 | 
28 | 	protected Set<String> relevantUrls;
29 | 	
30 | 	@Override
31 | 	public void initialize(Properties config) {}
32 | 
33 | 	@Override
34 | 	public void process(Document doc) {
35 | 		doc.removeAnnotations(EntityWithId.class, e -> !relevantUrls.contains(e.id));
36 | 	}
37 | 
38 | 	@Override
39 | 	public void initialize(IGroundTruth gt, RelexConfig config) {
40 | 		this.relevantUrls = gt.getRelevantIds();
41 | 	}
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IEntityPairFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | 
22 | /**
23 |  * Decides which entity-pairs are good candidates.
24 |  * A class implementing this will be specified in the RelexConfig if some filtering of entity-pairs is desired.
25 |  * Otherwise the tsv dataset will contain all pairs of EntityWithId that occur in the same sentence.
26 |  * @author mrglass
27 |  *
28 |  */
29 | public interface IEntityPairFilter extends Serializable {
30 | 
31 |     /**
32 |      * Prepares the filter. In Spark, initialize is called in the Spark head.
33 |      * @param gt the ground truth
34 |      * @param config the configuration
35 |      */
36 |     void initialize(GroundTruth gt, RelexConfig config);
37 | 
38 |     /**
39 |      * Return true if the entity-pair is a good candidate
40 |      * @param id1 the id of the first entity
41 |      * @param type1 the type of the first entity
42 |      * @param id2 the id of the second entity
43 |      * @param type2 the type of the second entity
44 |      * @return true if the pair is a good candidate
45 |      */
46 |     boolean test(String id1, String type1, String id2, String type2);
47 | }
48 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IGroundTruth.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | /**
24 |  * A generic ground truth interface: just the methods that preprocessing needs.
25 |  * @author mrglass
26 |  *
27 |  */
28 | public interface IGroundTruth extends Serializable {
29 | 
30 |     /**
31 |      * @param id an entity id
32 |      * @return the type for that id
33 |      *         (NOTE(review): what is returned for an id that is not known is not visible here - confirm)
34 |      */
35 |     String getType(String id);
36 | 
37 |     /**
38 |      * @return the set of ids that are relevant; FilterEntsByGroundTruth keeps only the entities whose id is in it
39 |      */
40 |     Set<String> getRelevantIds();
41 | 
42 |     /**
43 |      * @return a map with String keys and String[] values
44 |      *         (NOTE(review): presumably from an entity set id to its relations, going by the name - confirm)
45 |      */
46 |     Map<String,String[]> buildEntitySetId2Relations();
47 | }
48 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IPostprocessEntityRecognition.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 | 
22 | /**
23 |  * An Annotator that runs after entity recognition and linking to postprocess their output.
24 |  * Typical uses: removing entities that are not of interest, filling in the type based on the id,
25 |  * or filling in an id for NIL entity linking.
26 |  * @author mrglass
27 |  *
28 |  */
29 | public interface IPostprocessEntityRecognition extends Annotator {
30 | 
31 |     /**
32 |      * Initialization with the ground truth and the configuration, declared in addition to what Annotator declares.
33 |      * @param gt the ground truth
34 |      * @param config the configuration
35 |      */
36 |     void initialize(IGroundTruth gt, RelexConfig config);
37 | }
38 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexDatasetManager.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | 
22 | import com.ibm.research.ai.ki.kbp.*;
23 | 
24 | /**
25 |  * Provides the classes for representing and creating a dataset for training/evaluation/mass-apply of
26 |  * a relational knowledge induction system.
27 |  *
28 |  * @author mrglass
29 |  *
30 |  * @param <M> the mention class of the dataset
31 |  */
32 | public interface IRelexDatasetManager<M extends IRelexMention> extends Serializable {
33 | 
34 |     /**
35 |      * Before this method is called, only getMentionClass is supposed to be called.
36 |      * @param config the configuration
37 |      */
38 |     void initialize(RelexConfig config);
39 | 
40 |     /** The only method that is supposed to be called before initialize. */
41 |     Class<M> getMentionClass();
42 | 
43 |     IRelexTsv<M> getTsvMaker();
44 | 
45 |     IRelexTensors<M> getTensorMaker();
46 | 
47 |     IGroundTruth getGroundTruth();
48 | }


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexMention.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | /**
25 |  * The common interface of relation mentions, so we can unify the code for binary and unary relation mention.
26 |  * @author mrglass
27 |  *
28 |  */
29 | public interface IRelexMention extends Serializable {
30 | 
31 |     /** For reduce by key. */
32 |     String groupId();
33 | 
34 |     /** For reduce by key. */
35 |     int groupSplit(int splitCount);
36 | 
37 |     /**
38 |      * The canonically ordered list of ids, separated by '\t'; if group ids are enabled the group id is given here too.
39 |      */
40 |     String entitySetId();
41 | 
42 |     /** For downsampling and splitting train/validate/test. */
43 |     double getNegativeDownsamplePriority();
44 | 
45 |     /** For downsampling and splitting train/validate/test. */
46 |     double getDatasetSplitPosition();
47 | 
48 |     /** For negative downsampling. */
49 |     boolean isNegative();
50 | 
51 |     /**
52 |      * Where the document the mention comes from appears in the x-axis of the document learning curve (0-1).
53 |      */
54 |     double getDocumentLearningCurvePosition();
55 | 
56 |     /** For vocab construction. */
57 |     String[] getTypes();
58 | 
59 |     /** For vocab construction. */
60 |     String[] getRelations();
61 | 
62 |     /** For vocab construction. */
63 |     String[] getTokens(Annotator tokenizer);
64 | 
65 |     /** Loading from tsv; toString is the saving counterpart. */
66 |     void fromString(String tsvLine);
67 | 
68 |     /** Saving to tsv; fromString is the loading counterpart. */
69 |     String toString();
70 | 
71 |     /**
72 |      * To avoid duplicates in a mentionset: if non-null, two IRelexMentions that share a uniquenessString are duplicates.
73 |      */
74 |     String uniquenessString();
75 | 
76 |     /**
77 |      * A human readable format for showing the support for an extracted relation.
78 |      * @return the support string
79 |      */
80 |     String toSupportString();
81 | 
82 |     void convertToPlaceholders();
83 | }
84 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexTensors.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.ibm.reseach.ai.ki.nlp.*;
24 | 
25 | /**
26 |  * Creates the deep learning input tensors from a set of RelexMentions
27 |  * @author mrglass
28 |  *
29 |  * @param <M> the mention class the tensors are made from
30 |  */
31 | public interface IRelexTensors<M extends IRelexMention> extends Serializable {
32 | 
33 |     String[] getTypes();
34 | 
35 |     String[] getRelations();
36 | 
37 |     /**
38 |      * Makes the instances for a mention set.
39 |      * The first object is assumed to be the String groupId.
40 |      * @param tokenizer the tokenizer to use
41 |      * @param fullMentionSet the mentions to make the instances from
42 |      * @return the instances, each one an Object[]
43 |      */
44 |     List<Object[]> makeInstances(Annotator tokenizer, Collection<M> fullMentionSet);
45 | }
46 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/IRelexTsv.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.ibm.reseach.ai.ki.nlp.*;
24 | 
25 | /**
26 |  * Pulls the IRelexMentions out of a Document that has EntityWithId, Token and Sentence annotations.
27 |  * @author mrglass
28 |  *
29 |  * @param <M> the mention class that is produced
30 |  */
31 | public interface IRelexTsv<M extends IRelexMention> extends Serializable {
32 | 
33 |     /**
34 |      * @param doc a Document that has EntityWithId, Token and Sentence annotations
35 |      * @return the mentions pulled out of the document
36 |      */
37 |     List<M> getMentions(Document doc);
38 | }
39 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/NounPhraseEntityWithId.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.util.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 | 
25 | /**
26 |  * Adds an EntityWithId annotation for each Chunk tagged "NP", unless the chunk starts with one of
27 |  * the ignored first tokens or contains no word character. The lower-cased covered text
28 |  * of the chunk is used as the id and GroundTruth.unknownType as the type.
29 |  */
30 | public class NounPhraseEntityWithId implements Annotator {
31 |     private static final long serialVersionUID = 1L;
32 | 
33 |     public static final String SOURCE = NounPhraseEntityWithId.class.getSimpleName();
34 | 
35 |     /** Nothing is read from the properties. */
36 |     @Override
37 |     public void initialize(Properties config) {}
38 | 
39 |     //NPs with these as their first tokens are not entity terms
40 |     protected Set<String> ignoreFirstTokens = new HashSet<>(Arrays.asList(
41 |             "the",
42 |             "that", "these", "those", "this",
43 |             "a", "an",
44 |             "who", "which", "it",
45 |             "its", "your", "our", "my", "their",
46 |             "you", "me"));
47 | 
48 | 
49 |     /**
50 |      * Goes over the Chunk annotations of the document and adds the EntityWithId annotations.
51 |      * Lower-casing uses Locale.ROOT, so the stop-word check and the ids do not depend on
52 |      * the default locale of the machine.
53 |      */
54 |     @Override
55 |     public void process(Document doc) {
56 |         for (Chunk c : doc.getAnnotations(Chunk.class)) {
57 |             if (!"NP".equals(c.tag))
58 |                 continue;
59 | 
60 |             Token firstToken = doc.getAnnotations(Token.class, c).get(0);
61 |             if (ignoreFirstTokens.contains(firstToken.coveredText(doc).toLowerCase(Locale.ROOT)))
62 |                 continue;
63 | 
64 |             //the covered text is needed twice below, so it is fetched once
65 |             String text = c.coveredText(doc);
66 |             //chunks made up of non-word characters only are skipped
67 |             if (text.replaceAll("\\W+", "").isEmpty())
68 |                 continue;
69 | 
70 |             doc.addAnnotation(new EntityWithId(SOURCE,
71 |                     c.start, c.end,
72 |                     GroundTruth.unknownType, text.toLowerCase(Locale.ROOT)));
73 |         }
74 |         //we could drop chunk annotations now
75 |     }
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/RelexDatasetFiles.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | /**
21 |  * The files saved in the tsv dataset to tensor dataset conversion.
22 |  * Also used when training and applying the model.
23 |  * This class only declares String constants; it has no methods.
24 |  * @author mrglass
25 |  *
26 |  */
27 | public class RelexDatasetFiles {
28 |     //in the convert dir
29 | 	public static final String wordVectors = "wordVectors.ef";
30 | 	public static final String groupSplits = "groupSplits.ser.gz";
31 | 	public static final String tokenizerPipeline = "tokenizer.ser.gz"; //loaded by Tokenizer.getTokenizer if it exists in the convert dir
32 | 	public static final String typePairFilterFile = "typePairs.tsv"; //read by TypePairEntityPairFilter.initialize
33 | 	public static final String typeFilterFile = "typeUnary.tsv";
34 | 	/**
35 | 	 * Created by DocEntityStats
36 | 	 */
37 | 	public static final String idCountsFile = "idCounts.tsv"; 
38 | 	//NOTE(review): presumably appended to a name to form a directory name - confirm against the callers
39 | 	public static final String dataDirSuffix = "Dir";
40 | 	
41 | 	//in the hdfsOutputDir
42 | 	public static final String hdfsMentions = "relexMentions.tsv";
43 | 	public static final String hdfsTensors = "tensors.b64";
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/ShowExamples.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.google.common.collect.*;
24 | 
25 | import com.ibm.research.ai.ki.util.*;
26 | import com.ibm.research.ai.ki.util.RandomUtil.*;
27 | 
28 | /**
29 |  * Reads the simple tsv format of RelexMention.Reader/Writer. And shows examples of 'interesting' entity-pair mention sets.
30 |  * @author mrglass
31 |  *
32 |  */
33 | public class ShowExamples {
34 |     /** The value passed to the RandomUtil.Sample constructor. */
35 |     private static final int SAMPLE_SIZE = 20;
36 | 
37 |     /**
38 |      * The samples that are interesting: more than one mention, and the first mention is not negative.
39 |      * @param m the mentions of one mention set
40 |      * @return true if the mention set should be considered for display
41 |      */
42 |     static boolean isInteresting(List<RelexMention> m) {
43 |         return m.size() > 1 && !m.get(0).isNegative();
44 |     }
45 | 
46 |     /**
47 |      * Example args:
48 |      * simpleFormat/train.tsv
49 |      *
50 |      * @param args the tsv file to read
51 |      * @throws IllegalArgumentException if no file is given
52 |      */
53 |     public static void main(String[] args) {
54 |         if (args.length < 1) {
55 |             throw new IllegalArgumentException("Usage: ShowExamples <tsv file, e.g. simpleFormat/train.tsv>");
56 |         }
57 | 
58 |         RandomUtil.Sample<String> sample = new RandomUtil.Sample<String>(SAMPLE_SIZE);
59 |         for (List<RelexMention> m : RelexMentionReader.getSetReader(new File(args[0]), RelexMention.class)) {
60 |             if (isInteresting(m) && sample.shouldSave()) {
61 |                 RelexMention m1 = m.get(0);
62 |                 //first line: the two spans of the first mention and its relation types; then one sentence per mention
63 |                 sample.save(
64 |                         m1.span1.substring(m1.sentence)+"\t"+
65 |                         m1.span2.substring(m1.sentence)+"\t"+
66 |                         Lang.stringList(m1.relTypes, ",")+"\n  "+
67 |                         Lang.stringList(Iterables.transform(m, mi -> mi.sentence), "\n  "));
68 |             }
69 |         }
70 | 
71 |         System.out.println(Lang.stringList(sample, "\n\n=======================\n"));
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/Tokenizer.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.Properties;
22 | 
23 | import com.ibm.reseach.ai.ki.nlp.*;
24 | import com.ibm.research.ai.ki.kbp.*;
25 | import com.ibm.research.ai.ki.nlp.parse.*;
26 | import com.ibm.research.ai.ki.util.*;
27 | 
28 | public abstract class Tokenizer {
29 | 	private static Annotator tokenizer = null;
30 | 	public static Annotator getTokenizer(RelexConfig config) {
31 | 		synchronized (Tokenizer.class) {
32 | 			if (tokenizer == null) {
33 | 			    if (config.tokenizerPipelineFile != null) {
34 | 			        tokenizer = FileUtil.loadObjectFromFile(config.tokenizerPipelineFile);
35 | 			    } else if (new File(config.convertDir, RelexDatasetFiles.tokenizerPipeline).exists()) {
36 | 			        tokenizer = FileUtil.loadObjectFromFile(new File(config.convertDir, RelexDatasetFiles.tokenizerPipeline));
37 | 			    } else {
38 |     			    tokenizer = new Pipeline(
39 |     				        new ClearNLPTokenize()
40 |     				    //, new DigitSequenceTokenize() //add some special tokenization for digit groups
41 |     				);
42 | 			    }
43 | 				tokenizer.initialize(new Properties());
44 | 			}
45 | 			return tokenizer;
46 | 		}
47 | 	}
48 | }
49 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/TypePairEntityPairFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.ibm.research.ai.ki.formats.*;
24 | 
25 | public class TypePairEntityPairFilter implements IEntityPairFilter {
26 |     private static final long serialVersionUID = 1L;
27 | 
28 |     protected Set<String> typePairs = new HashSet<>();
29 |     
30 |     @Override
31 |     public void initialize(GroundTruth gt, RelexConfig config) {
32 |         if (!new File(config.convertDir, "typePairs.tsv").exists())
33 |             throw new IllegalArgumentException("No typePairs.tsv file in convertDir");
34 |         for (String[] parts : new SimpleTsvIterable(new File(config.convertDir, RelexDatasetFiles.typePairFilterFile))) {
35 |             String t1 = parts[0];
36 |             String t2 = parts[1];
37 |             typePairs.add(t1+'\t'+t2);
38 |         }
39 |     }
40 | 
41 |     @Override
42 |     public boolean test(String id1, String type1, String id2, String type2) {
43 |         String tp = null;
44 |         if (type1.compareTo(type2) <= 0) {
45 |             tp = type1+'\t'+type2;
46 |         } else {
47 |             tp = type2+'\t'+type1;
48 |         }
49 |         
50 |         return typePairs.contains(tp);
51 |     }
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/unary/IEntityFilter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp.unary;
19 | 
20 | import java.io.*;
21 | 
22 | import com.ibm.research.ai.ki.kbp.*;
23 | 
24 | 
25 | public interface IEntityFilter extends Serializable {
26 |     public void initialize(UnaryGroundTruth gt, RelexConfig config);
27 |     public boolean test(String docId, String id, String type);
28 | }
29 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/java/com/ibm/research/ai/ki/kbp/unary/RelexDatasetManagerUnary.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.kbp.unary;
19 | 
20 | import java.io.*;
21 | 
22 | import com.ibm.research.ai.ki.kbp.*;
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | public class RelexDatasetManagerUnary implements IRelexDatasetManager<UnaryRelexMention> {
26 |     private static final long serialVersionUID = 1L;
27 | 
28 |     RelexConfig config;
29 |     UnaryGroundTruth gt;
30 |     
31 |     @Override
32 |     public IRelexTsv<UnaryRelexMention> getTsvMaker() {
33 |         if (gt == null && new File(config.groundTruthFile).exists())
34 |             this.gt = FileUtil.loadObjectFromFile(config.groundTruthFile);
35 |         return new UnaryRelexTsvDataset(gt, config);
36 |     }
37 | 
38 |     @Override
39 |     public IGroundTruth getGroundTruth() {
40 |         if (gt == null && new File(config.groundTruthFile).exists())
41 |             this.gt = FileUtil.loadObjectFromFile(config.groundTruthFile);
42 |         return gt;
43 |     }
44 | 
45 |     @Override
46 |     public Class<UnaryRelexMention> getMentionClass() {
47 |         return UnaryRelexMention.class;
48 |     }
49 | 
50 |     @Override
51 |     public IRelexTensors<UnaryRelexMention> getTensorMaker() {
52 |         return new UnaryRelexTensors(config);
53 |     }
54 | 
55 |     @Override
56 |     public void initialize(RelexConfig config) {
57 |         this.config = config;
58 |         
59 |     }
60 | 
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.kbp/src/main/resources/relexConfigNonSpark.properties:
--------------------------------------------------------------------------------
 1 | documentSampleFraction = 1.0
 2 | negativeExampleSampleFraction = 0.05
 3 | targetNegativeToPositveRatio = -1
 4 | directionStyle = bothWays
 5 | titleContext = False
 6 | sectionContext = False
 7 | limitEntitiesToGroundTruth = False
 8 | gtTypes = False
 9 | vocabLimit = 2000000
10 | vocabMinCount = 2
11 | initialEmbeddingsFile = TODO/wordvectorFileInEmbeddingFormat.ef
12 | minMentionSet = 1
13 | maxMentionSet = 100
14 | maxMentionGroups = 5
15 | maxPositionEmbeddings = 80
16 | typeStyle = single
17 | groundTruthFile = TODO/gt.ser.gz
18 | convertDir = TODO/
19 | datasetSplitNames = [train, validate, test]
20 | datasetSpitFractions = [0.8, 0.1, 0.1]
21 | #typePairs.tsv expected to exist in convertDir
22 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/Annotator.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | /**
24 |  * NOTE: Annotators are expected to be threadsafe.
25 |  * @author mrglass
26 |  *
27 |  */
28 | public interface Annotator extends Serializable {
29 | 	public void initialize(Properties config);
30 | 	public void process(Document doc);
31 | }
32 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/DocumentStructure.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp;
19 | 
20 | import java.io.*;
21 | 
/**
 * Marker interface for document information that resembles an annotation but has
 * no meaningful Span. Examples:
 * <ul>
 *   <li>coref chains</li>
 *   <li>relations or frames inferred from multiple sentences</li>
 *   <li>date of the document</li>
 *   <li>annotation worker that produced an annotated document</li>
 *   <li>document category</li>
 * </ul>
 *
 * @author mrglass
 */
public interface DocumentStructure extends Serializable {
	//CONSIDER: public String getSource();
}
36 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/DocumentWriter2.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp;
19 | 
20 | import com.ibm.research.ai.ki.util.*;
21 | import com.ibm.research.ai.ki.util.io.*;
22 | 
23 | import java.io.*;
24 | import java.nio.file.*;
25 | import java.util.zip.*;
26 | 
27 | /**
28 |  * Version of DocumentWriter based on the new abstract class MultiFileWriter.
29 |  * Needs testing before DocumentWriter is replaced.
30 |  * @author mrglass
31 |  *
32 |  */
33 | public class DocumentWriter2 extends MultiFileWriter<ObjectOutputStream, Document> {
34 | 	public DocumentWriter2(File rootDir) {
35 | 		super(rootDir);
36 | 	}
37 | 
38 | 	public DocumentWriter2(File rootDir, int itemsPerFile, boolean overwrite) {
39 | 		super(rootDir, itemsPerFile, overwrite);
40 | 	}
41 | 
42 | 	@Override
43 | 	protected String getExt() {
44 | 		return ".ser.gz";
45 | 	}
46 | 
47 | 	@Override
48 | 	protected void write(ObjectOutputStream stream, Document obj) throws IOException {
49 | 		stream.writeObject(obj);
50 | 	}
51 | 
52 | 	@Override
53 | 	protected ObjectOutputStream getStream(File f) throws IOException {
54 | 		return new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(f), 2 << 16));
55 | 	}
56 | 
57 | 
58 | 	public synchronized void write(Document doc) {	
59 | 		try {
60 | 			super.write(doc);
61 | 		} catch (Exception e) {
62 | 			Lang.error(e);
63 | 		}
64 | 	}
65 | 	
66 | 	@Override
67 | 	protected void deepenDirectories() {
68 | 		super.deepenDirectories();
69 | 	}
70 | 	
71 | 	@Override
72 | 	public synchronized void close() {
73 | 		super.close();
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Author.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Author implements DocumentStructure {
25 | 	private static final long serialVersionUID = 1L;
26 | 	public String id;
27 | 	@JsonCreator
28 | 	public Author(@JsonProperty("id") String id) {
29 | 		this.id = id;
30 | 	}
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Categories.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import java.util.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | /**
26 |  * The categories that a Document belongs to (like Wikipedia categories)
27 |  * @author mrglass
28 |  *
29 |  */
30 | public class Categories extends HashSet<String> implements DocumentStructure {
31 | 	private static final long serialVersionUID = 1L;
32 | 	/**
33 | 	 * add a category to a Document
34 | 	 * @param doc
35 | 	 * @param category
36 | 	 */
37 | 	public static void addCategory(Document doc, String category) {
38 | 		Categories cats = doc.getDocumentStructure(Categories.class);
39 | 		if (cats == null) {
40 | 			cats = new Categories();
41 | 			doc.setDocumentStructure(cats);
42 | 		}
43 | 		cats.add(category);
44 | 	}
45 | 	/**
46 | 	 * unmodifiable set of categories
47 | 	 * @param doc
48 | 	 * @return
49 | 	 */
50 | 	public static Set<String> getCategories(Document doc) {
51 | 		return Collections.unmodifiableSet(Lang.NVL(doc.getDocumentStructure(Categories.class), (Set<String>)Collections.EMPTY_SET));
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Chunk.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Chunk extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	public String tag; //NP, VP, PP
28 | 	
29 | 	@JsonCreator
30 | 	public Chunk(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("tag") String tag) {
31 | 		super(source, start, end);
32 | 		this.tag = tag;
33 | 	}
34 | 	
35 | 	@Override
36 | 	public String highlightLabel() {
37 | 		return tag;
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocDate.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import java.util.*;
21 | 
22 | import com.fasterxml.jackson.annotation.*;
23 | 
24 | import com.ibm.reseach.ai.ki.nlp.*;
25 | 
26 | /**
27 |  * Date the document was written or created
28 |  * @author mrglass
29 |  *
30 |  */
31 | public class DocDate implements DocumentStructure {
32 | 	private static final long serialVersionUID = 1L;
33 | 	public Date date;
34 | 	@JsonCreator
35 | 	public DocDate(@JsonProperty("date") Date date) {
36 | 		this.date = date;
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocumentContentType.java:
--------------------------------------------------------------------------------
 1 | package com.ibm.reseach.ai.ki.nlp.types;
 2 | 
 3 | import com.ibm.reseach.ai.ki.nlp.*;
 4 | 
 5 | import com.fasterxml.jackson.annotation.*;
 6 | 
 7 | public class DocumentContentType implements DocumentStructure {
 8 | 
 9 |     private static final long serialVersionUID = 1L;
10 |     public String contentType;
11 |     @JsonCreator
12 |     public DocumentContentType(@JsonProperty("contentType") String contentType) {
13 |         this.contentType = contentType;
14 |     }
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/DocumentSource.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 | 
22 | import com.fasterxml.jackson.annotation.*;
23 | 
24 | public class DocumentSource implements DocumentStructure {
25 |     private static final long serialVersionUID = 1L;
26 |     public String source;
27 |     @JsonCreator
28 |     public DocumentSource(@JsonProperty("source") String source) {
29 |         this.source = source;
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Entity.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Entity extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	public String type;
28 | 	
29 | 	@JsonCreator
30 | 	public Entity(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("type") String type) {
31 | 		super(source, start, end);
32 | 		this.type = type;
33 | 	}
34 | 
35 | 	public Entity(Entity e) {
36 | 		this(e.source, e.start, e.end, e.type);
37 | 	}
38 | 	
39 | 	@Override
40 | 	public String highlightLabel() {
41 | 		return type;
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/EntityWithId.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.ibm.research.ai.ki.util.*;
21 | 
22 | import com.fasterxml.jackson.annotation.*;
23 | 
24 | /**
25 |  * Like a LinkedEntity but with just a single string for the id.
26 |  * @author mrglass
27 |  *
28 |  */
29 | public class EntityWithId extends Entity {
30 | 	private static final long serialVersionUID = 1L;
31 | 	
32 | 	public String id;
33 | 	
34 | 	public EntityWithId(Entity e) {
35 | 	    this(e.source, e.start, e.end, e.type, 
36 | 	            (e instanceof EntityWithId) ? ((EntityWithId)e).id : null);
37 | 	}
38 | 	
39 | 	public EntityWithId(
40 | 			@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, 
41 | 			@JsonProperty("type") String type, @JsonProperty("id") String id) 
42 | 	{
43 | 		super(source, start, end, type);
44 | 		this.id = id;
45 | 	}
46 | 
47 | 	@Override
48 | 	public String highlightLabel() {
49 | 		return Lang.NVL(type,"unk")+":"+id;
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Event.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import java.util.*;
21 | 
22 | import com.fasterxml.jackson.annotation.*;
23 | 
24 | import com.ibm.reseach.ai.ki.nlp.*;
25 | import com.ibm.research.ai.ki.util.*;
26 | 
27 | /**
28 |  * The span of the Event is something like the event extent and is usually not very meaningful.
29 |  * Most of the semantics come from the eventType and the argument mentions and roles.
30 |  * @author mrglass
31 |  *
32 |  */
33 | public class Event extends Annotation {
34 | 	private static final long serialVersionUID = 1L;
35 | 	
36 | 	public Event(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("type") String type) {
37 | 		super(source, start, end);
38 | 		this.type = type;
39 | 	}
40 | 
41 | 	public void addArgument(Document doc, String role, Entity entity) {
42 | 		arguments.add(Pair.of(role, doc.getAnnoRef(entity)));
43 | 	}
44 | 	
45 | 	//the type of the event
46 | 	public String type;
47 | 	
48 | 	//CONSIDER: maybe allow arguments to be Annotation in general rather than requiring Entity
49 | 	//Pair is role name and entity, role name may be null
50 | 	public List<Pair<String,AnnoRef<Entity>>> arguments = new ArrayList<>();
51 | 	
52 | 	//CONSIDER: pull the trigger into an EventTrigger annotation class?
53 | 	//the span of the trigger, may be null
54 | 	public Span trigger;
55 | 
56 | 	@Override
57 | 	public String highlightLabel() {
58 | 		return type;
59 | 	}
60 | }
61 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/LinkAnnotation.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | /**
25 |  * Annotation for a link (like wikilink or web link) in text
26 |  * @author mrglass
27 |  *
28 |  */
29 | public class LinkAnnotation extends Annotation {
30 | 	private static final long serialVersionUID = 1L;
31 | 
32 | 	public String target;
33 | 	
34 | 	@JsonCreator
35 | 	public LinkAnnotation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end, @JsonProperty("target") String target) {
36 | 		super(source, start, end);
37 | 		this.target = target;
38 | 	}
39 | 	
40 | 	   @Override
41 | 	    public String highlightLabel() {
42 | 	        return "A:"+target;
43 | 	    }
44 | }
45 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/ListAnnotation.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import java.util.*;
21 | import java.util.stream.*;
22 | 
23 | import com.fasterxml.jackson.annotation.*;
24 | import com.google.common.collect.*;
25 | 
26 | import com.ibm.reseach.ai.ki.nlp.*;
27 | 
28 | /**
29 |  * Annotation describing a list in text
30 |  * @author mrglass
31 |  *
32 |  */
33 | public class ListAnnotation extends Annotation {
34 | 	private static final long serialVersionUID = 1L;
35 | 	
36 | 	@JsonCreator
37 | 	public ListAnnotation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
38 | 		super(source, start, end);
39 | 	}
40 | 
41 | 	public List<AnnoRef<ListItem>> items = new ArrayList<>();
42 | 	
43 | 	public void addListItem(Document doc, ListItem item) {
44 | 		doc.addAnnotation(item);
45 | 		items.add(doc.getAnnoRef(item));
46 | 	}
47 | 	
48 | }
49 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/ListItem.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class ListItem extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	@JsonCreator
28 | 	public ListItem(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
29 | 		super(source, start, end);
30 | 	}
31 | 	public ListItem(Document doc, ListAnnotation list, String source, int first, int last) {
32 | 		super(source, first, last);
33 | 		list.items.add(doc.getAnnoRef(this));
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Paragraph.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Paragraph extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	@JsonCreator
28 | 	public Paragraph(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
29 | 		super(source, start, end);
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Relation.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Relation extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	public String relationType;
28 | 	protected AnnoRef<Annotation> arg1;
29 | 	protected AnnoRef<Annotation> arg2;
30 | 	//CONSIDER: type parameter rather than general 'Annotation'
31 | 	public Annotation getArg1() {
32 | 		return arg1.get();
33 | 	}
34 | 	public Annotation getArg2() {
35 | 		return arg2.get();
36 | 	}
37 | 	
38 | 	//to support subclasses of Relation that name their arguments
39 | 	public String getArg1Name() {
40 | 		return "arg1";
41 | 	}
42 | 	public String getArg2Name() {
43 | 		return "arg2";
44 | 	}
45 | 	
46 | 	@JsonCreator
47 | 	public Relation(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
48 | 		super(source, start, end);
49 | 	}
50 | 
51 | 	
52 | 	public Relation(String source, Document doc, Annotation arg1, Annotation arg2, String relationType) {
53 | 		super(source, Math.min(arg1.start, arg2.start), Math.max(arg1.end, arg2.end));
54 | 		this.arg1 = doc.getAnnoRef(arg1);
55 | 		this.arg2 = doc.getAnnoRef(arg2);
56 | 		this.relationType = relationType;
57 | 	}
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Section.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Section extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	@JsonCreator
28 | 	public Section(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
29 | 		super(source, start, end);
30 | 	}
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/SectionHeader.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | public class SectionHeader extends Annotation {
26 | 	private static final long serialVersionUID = 1L;
27 | 	
28 | 	@JsonCreator
29 | 	public SectionHeader(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
30 | 		super(source, start, end);
31 | 	}
32 | 
33 | 	public AnnoRef<Section> sectionBody;
34 | 	public AnnoRef<SectionHeader> superSection;
35 | 	
36 | }
37 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Sentence.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Sentence extends Annotation {
25 | 	
26 | 	private static final long serialVersionUID = 1L;
27 | 	
28 | 	@JsonCreator
29 | 	public Sentence(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
30 | 		super(source, start, end);
31 | 	}
32 | }
33 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/TextFormatting.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class TextFormatting extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	public enum Format {bold, italic};
28 | 	
29 | 	public final Format format;
30 | 	
31 | 	@JsonCreator
32 | 	public TextFormatting(@JsonProperty("source") String source, 
33 | 			@JsonProperty("start") int start, @JsonProperty("end") int end, 
34 | 			@JsonProperty("format") Format format) 
35 | 	{
36 | 		super(source, start, end);
37 | 		this.format = format;
38 | 	}
39 | 	
40 | 	@Override
41 |     public String highlightLabel() {
42 |         return format == Format.italic ? "i" : "b";
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/reseach/ai/ki/nlp/types/Title.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.reseach.ai.ki.nlp.types;
19 | 
20 | import com.fasterxml.jackson.annotation.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | 
24 | public class Title extends Annotation {
25 | 	private static final long serialVersionUID = 1L;
26 | 	
27 | 	@JsonCreator
28 | 	public Title(@JsonProperty("source") String source, @JsonProperty("start") int start, @JsonProperty("end") int end) {
29 | 		super(source, start, end);
30 | 	}
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/DigitSequenceTokenize.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp.parse;
19 | 
20 | import java.util.ArrayList;
21 | import java.util.List;
22 | import java.util.Properties;
23 | import java.util.regex.Matcher;
24 | import java.util.regex.Pattern;
25 | 
26 | import com.ibm.reseach.ai.ki.nlp.*;
27 | import com.ibm.reseach.ai.ki.nlp.types.*;
28 | import com.ibm.research.ai.ki.util.*;
29 | 
30 | /**
31 |  * Adds tokens for each digit sequence (\b[0-9]+\b)
32 |  * @author mrglass
33 |  *
34 |  */
35 | public class DigitSequenceTokenize implements Annotator {
36 | 	private static final long serialVersionUID = 1L;
37 | 
38 | 	public static final String SOURCE = "DST";
39 | 	
40 | 	protected Pattern digitSeq = Pattern.compile("\\b[0-9]+\\b");
41 | 	
42 | 	@Override
43 | 	public void initialize(Properties config) {}
44 | 
45 | 	@Override
46 | 	public void process(Document doc) {
47 | 		Matcher m = digitSeq.matcher(doc.text);
48 | 		NonOverlappingSpans nos = new NonOverlappingSpans();
49 | 		List<Token> toAdd = new ArrayList<>();
50 | 		while (m.find()) {
51 | 			Token t = new Token(SOURCE, m.start(), m.end());
52 | 			t.lemma = t.coveredText(doc);
53 | 			t.pos = "CD";
54 | 			if (!nos.addSpan(t)) {
55 | 				throw new Error("Span overlaps?? "+doc.toSimpleInlineMarkup());
56 | 			}
57 | 			toAdd.add(t);
58 | 		}
59 | 		//remove tokens that overlap with our new ones
60 | 		doc.removeAnnotations(Token.class, t -> nos.overlaps(t));
61 | 		for (Token t : toAdd)
62 | 			doc.addAnnotation(t);
63 | 	}
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/NormalizeTextTransform.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp.parse;
19 | 
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 | 
22 | /**
23 |  * Like the text normalization in Google's w2v but with flag for [0-9] -> ' '
24 |  * @author mrglass
25 |  *
26 |  */
27 | public class NormalizeTextTransform extends TransformString {
28 | 	private static final long serialVersionUID = 1L;
29 | 	
30 | 	protected boolean removeDigits;
31 | 	
32 | 	public NormalizeTextTransform(boolean removeDigits) {
33 | 		super("com/ibm/research/ai/ki/nlp/parse/normalizeText-replace.tsv");
34 | 		this.removeDigits = removeDigits;
35 | 	}
36 | 	
37 | 	@Override
38 | 	public String transform(String text, OffsetCorrection trans2orig, OffsetCorrection orig2trans) {
39 | 		String result = super.transform(text,trans2orig,orig2trans).toLowerCase();
40 | 		if (removeDigits)
41 | 			result = result.replaceAll("[0-9]", " ");
42 | 		return result;
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/java/com/ibm/research/ai/ki/nlp/parse/RegexTokenize.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp.parse;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | import java.util.regex.*;
23 | 
24 | import com.ibm.reseach.ai.ki.nlp.*;
25 | import com.ibm.reseach.ai.ki.nlp.types.*;
26 | import com.ibm.research.ai.ki.util.*;
27 | 
28 | public class RegexTokenize implements Annotator {
29 | 	private static final long serialVersionUID = 1L;
30 | 
31 | 	public static final String REGEX_KEY = "tokenRegex";
32 | 	//whitespace or punctuation delimits tokens, and are themselves not tokens
33 | 	public static final String DEFAULT = "[\\p{Punct}\\s]+";
34 | 	
35 | 	public static final String WHITESPACE = Lang.pWhite_Space+"+";
36 | 	
37 | 	public static final String SOURCE = RegexTokenize.class.getSimpleName();
38 | 	
39 | 	protected Pattern tokenRegex;
40 | 	
41 | 	@Override
42 | 	public void initialize(Properties config) {
43 | 		tokenRegex =  Pattern.compile(Lang.NVL(config.getProperty(REGEX_KEY), DEFAULT));
44 | 	}
45 | 
46 | 	@Override
47 | 	public void process(Document doc) {
48 | 		for (Annotation seg : doc.getSegmentation(Sentence.class, Paragraph.class)) {
49 | 			Matcher m = tokenRegex.matcher(doc.coveredText(seg));
50 | 			int prevStart = 0;
51 | 			while (m.find()) {
52 | 				if (m.start() > prevStart) {
53 | 					doc.addAnnotation(new Token(SOURCE, seg.start + prevStart, seg.start + m.start()));
54 | 				}
55 | 				prevStart = m.end();
56 | 			}
57 | 			if (prevStart != seg.length()) //may end with whitespace
58 | 				doc.addAnnotation(new Token(SOURCE, seg.start + prevStart, seg.end));
59 | 		}
60 | 	}
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/resources/com/ibm/research/ai/ki/nlp/parse/clearNLP-replace.tsv:
--------------------------------------------------------------------------------
  1 | \u02BA	"
  2 | \u2036	"
  3 | \u2033	"
  4 | \u201C	"
  5 | \u201D	"
  6 | \u201E	"
  7 | \u201F	"
  8 | \u275D	"
  9 | \u275E	"
 10 | \u3003	"
 11 | \u301F	"
 12 | \u301D	"
 13 | \u301E	"
 14 | \uFF02	"
 15 | \u00B4	'
 16 | \u02B9	'
 17 | \u02BC	'
 18 | \u02C8	'
 19 | \u0301	'
 20 | \u2018	'
 21 | \u2019	'
 22 | \u201B	'
 23 | \u2032	'
 24 | \u275B	'
 25 | \u275C	'
 26 | \u02CB	`
 27 | \u0300	`
 28 | \u2035	`
 29 | \u2037	'''
 30 | \u2010	-
 31 | \u2011	-
 32 | \u2012	-
 33 | \u2013	-
 34 | \u2014	-
 35 | \u2015	-
 36 | \u0335	-
 37 | \u0336	-
 38 | \u2016	||
 39 | \u2017	_
 40 | \u02CD	_
 41 | \u0331	_
 42 | \u0332	_
 43 | \u0333	_
 44 | \u02DC	~
 45 | \u0303	~
 46 | \u0330	~
 47 | \u2053	~
 48 | \u223C	~
 49 | \u301C	~
 50 | \u0334	~
 51 | \u02C2	<
 52 | \u02C3	>
 53 | \u27EA	<
 54 | \u27EB	>
 55 | \u2039	<
 56 | \u203A	>
 57 | \u27E8	<
 58 | \u27E9	>
 59 | \u3008	<
 60 | \u3009	>
 61 | \u27E6	[
 62 | \u27E7	]
 63 | \u3014	[
 64 | \u3015	]
 65 | \u3016	[
 66 | \u3017	]
 67 | \u3018	[
 68 | \u3019	]
 69 | \u301A	[
 70 | \u301B	]
 71 | \u2983	{
 72 | \u2984	}
 73 | \u02C4	^
 74 | \u02C6	^
 75 | \u0302	^
 76 | \u2038	^
 77 | \u2303	^
 78 | \u01C0	|
 79 | \u05C0	|
 80 | \u2223	|
 81 | \u2758	|
 82 | \u00F7	/
 83 | \u2044	/
 84 | \u2215	/
 85 | \u0337	/
 86 | \u0338	/
 87 | \u20E5	\
 88 | \u2216	\
 89 | \u066D	*
 90 | \u204E	*
 91 | \u2217	*
 92 | \u2731	*
 93 | \u0589	:
 94 | \u05C3	:
 95 | \u2236	:
 96 | \u2264	<=
 97 | \u2265	>=
 98 | \u2266	<=
 99 | \u2267	>=
100 | \u066A	%
101 | \u2052	%
102 | \u01C3	!
103 | \u2762	!
104 | \u266F	#
105 | \u201A	,
106 | \u203D	?
107 | \u2025	..
108 | \u2026	...
109 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/resources/com/ibm/research/ai/ki/nlp/parse/normalizeText-replace.tsv:
--------------------------------------------------------------------------------
 1 | ’	 ' 
 2 | ′	 ' 
 3 | ''	 
 4 | '	 ' 
 5 | “	 " 
 6 | ”	 " 
 7 | "	 " 
 8 | .	 . 
 9 | <br />	 
10 | , 	 , 
11 | (	 ( 
12 | )	 ) 
13 | !	 ! 
14 | ?	 ? 
15 | ;	 
16 | :	 
17 | -	 - 
18 | =	 
19 | =	 
20 | *	 
21 | |	 
22 | «	 
23 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/resources/downloadOpenNLPModels.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | models=( "en-token.bin" "en-sent.bin" "en-pos-maxent.bin" "en-chunker.bin" "en-parser-chunking.bin" )
 4 | nermodels=( "en-ner-date.bin" "en-ner-location.bin" "en-ner-money.bin" "en-ner-organization.bin" "en-ner-percentage.bin" "en-ner-person.bin" "en-ner-time.bin" )
 5 | 
 6 | allmodels=("${models[@]}" "${nermodels[@]}")
 7 | 
 8 | #download the models from http://opennlp.sourceforge.net/models-1.5/
 9 | for file in "${allmodels[@]}"
10 | do
11 | 	if [ ! -f $file ]; then
12 | 		wget http://opennlp.sourceforge.net/models-1.5/$file
13 | 	fi
14 | done
15 | 
16 | 
17 | #CONSIDER: instead use download-maven-plugin (https://stackoverflow.com/questions/2741806/maven-downloading-files-from-url) in the nlp project
18 | # run in the validate phase (first phase)
19 | # to download each OpenNLP model file to ${project.basedir}/src/main/resources
20 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/resources/en-sent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/cc-dbp/92f6079dd2a56d33827f944913bb369ebf33f027/com.ibm.research.ai.ki.nlp/src/main/resources/en-sent.bin


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=WARN, stdout
3 |  
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/OverlappingSpansTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp;
19 | 
20 | import java.util.*;
21 | 
22 | import com.google.common.collect.*;
23 | 
24 | import com.ibm.reseach.ai.ki.nlp.*;
25 | import com.ibm.research.ai.ki.util.*;
26 | 
27 | public class OverlappingSpansTest {
28 | 	public void validate(Document doc) {
29 | 		OverlappingSpans ospans = new OverlappingSpans(doc.getAnnotations(Annotation.class));
30 | 		List<Annotation> sample = RandomUtil.getSample(doc.getAnnotations(Annotation.class), 100);
31 | 		for (Annotation a : sample) {
32 | 			validate(ospans, doc, a);
33 | 		}
34 | 	}
35 | 	protected void validate(OverlappingSpans ospans, Document doc, Annotation a) {
36 | 		Set<Span> ores = ospans.getSpansOverlapping(a);
37 | 		Set<Span> linearRes = new HashSet<>();
38 | 		for (Annotation oa : doc.getAnnotations(Annotation.class)) {
39 | 			if (oa.overlaps(a)) 
40 | 				linearRes.add(oa);
41 | 		}
42 | 		int matchSize = Sets.intersection(ores, linearRes).size();
43 | 		if (matchSize != ores.size())
44 | 			throw new Error("fail");
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/TransformStringTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp;
19 | 
20 | import com.ibm.reseach.ai.ki.nlp.*;
21 | 
22 | public class TransformStringTest {
23 | 	public void validate(Document doc) {
24 | 		//TODO: transform the doc and transform it back?
25 | 		//offset correction testing:
26 | 		//string normalize gazetteer and text, indexOf, denormalize, then check that all span matches are still normalized matches
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/parse/TestClearNLP.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp.parse;
19 | 
20 | import java.io.File;
21 | import java.util.Properties;
22 | 
23 | import com.ibm.reseach.ai.ki.nlp.*;
24 | import com.ibm.research.ai.ki.nlp.parse.*;
25 | import com.ibm.research.ai.ki.util.*;
26 | 
27 | public class TestClearNLP {
28 | 	public static void main(String[] args) {
29 | 	    String testDir = args[0];
30 | 	    
31 | 		//ClearNLPTransform transform = new ClearNLPTransform();
32 | 		//System.out.println(transform.transform("The man ran (   ;  ) and so ( '  ).", null, null));
33 | 		//System.out.println(transform.transform("Alhazen\n\n(;   ), also known by the Lat", null, null));
34 | 		Pipeline p = new Pipeline(new ClearNLPSentence(), new ClearNLPPOS(), new ClearNLPParse());
35 | 		p.initialize(new Properties());
36 | 		p.enableProfiling();
37 | 		PeriodicChecker report = new PeriodicChecker(100);
38 | 		int docNum = 0;
39 | 		for (Document doc : new PipelinedDocuments(p, new DocumentReader(new File(testDir)))) {
40 | 			++docNum;
41 | 			if (report.isTime()) {
42 | 				System.out.println("On document "+docNum);
43 | 				System.out.println(p.stringProfiling());
44 | 			}
45 | 		}
46 | 		System.out.println(p.stringProfiling());
47 | 	}
48 | }
49 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.nlp/src/test/java/com/ibm/research/ai/ki/nlp/parse/TestNER.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.nlp.parse;
19 | 
20 | import java.io.*;
21 | 
22 | import com.ibm.reseach.ai.ki.nlp.*;
23 | import com.ibm.reseach.ai.ki.nlp.types.*;
24 | import com.ibm.research.ai.ki.nlp.parse.*;
25 | import com.ibm.research.ai.ki.util.*;
26 | import com.ibm.research.ai.ki.util.RandomUtil.*;
27 | 
28 | import java.util.*;
29 | 
30 | 
31 | public class TestNER {
32 | 	public static void main(String[] args) {
33 | 		File docDir = new File(args[0]);
34 | 		Pipeline p = new Pipeline(
35 | 				new ResettingAnnotator(), new OpenNLPSentence(), 
36 | 				new ClearNLPTokenize(), new ClearNLPPOS(), 
37 | 				new ClearNLPNER(), new OpenNLPNER());
38 | 		p.initialize(new Properties());
39 | 		p.enableProfiling();
40 | 		Iterable<Document> docs = new PipelinedDocuments(p, new DocumentReader(docDir));
41 | 		Map<String,MutableDouble> typeCounts = new HashMap<>();
42 | 		RandomUtil.Sample<String> sampled = new RandomUtil.Sample<>(50);
43 | 		for (Document doc : docs) {
44 | 			for (Entity e : doc.getAnnotations(Entity.class)) {
45 | 				SparseVectors.increase(typeCounts, e.type+"-"+e.source, 1.0);
46 | 				if (sampled.shouldSave())
47 | 					sampled.save(e.coveredText(doc)+" :: "+e.type+"-"+e.source);
48 | 			}
49 | 		}
50 | 		System.out.println(SparseVectors.toString(typeCounts));
51 | 		System.out.println(Lang.stringList(sampled, "\n"));
52 | 		System.out.println(p.annotatorListing());
53 | 	}
54 | }
55 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.spark/src/main/resources/relexConfig.properties:
--------------------------------------------------------------------------------
 1 | documentSampleFraction = 1.0
 2 | negativeExampleSampleFraction = 0.05
 3 | targetNegativeToPositveRatio = -1
 4 | directionStyle = bothWays
 5 | titleContext = False
 6 | sectionContext = False
 7 | limitEntitiesToGroundTruth = False
 8 | gtTypes = False
 9 | vocabLimit = 2000000
10 | vocabMinCount = 2
11 | initialEmbeddingsFile = TODO/wordvectorFileInEmbeddingFormat.ef
12 | minMentionSet = 1
13 | maxMentionSet = 100
14 | maxMentionGroups = 5
15 | maxPositionEmbeddings = 80
16 | typeStyle = single
17 | groundTruthFile = TODO/gt.ser.gz
18 | convertDir = TODO/
19 | datasetSplitNames = [train, validate, test]
20 | datasetSpitFractions = [0.8, 0.1, 0.1]
21 | #typePairs.tsv expected to exist in convertDir
22 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 | 	<modelVersion>4.0.0</modelVersion>
 3 | 
 4 | 	<parent>
 5 | 		<groupId>com.ibm.research.ai.ki</groupId>
 6 | 		<artifactId>cc-dbp-parent-pom</artifactId>
 7 | 		<version>1.0.0-SNAPSHOT</version>
 8 | 	</parent>
 9 | 
10 | 	<artifactId>util</artifactId>
11 | 	<version>1.0.0-SNAPSHOT</version>
12 | 
13 | 	<dependencies>
14 | 		<dependency>
15 | 			<groupId>com.google.guava</groupId>
16 | 			<artifactId>guava</artifactId>
17 | 			<version>${guava.version}</version>
18 | 			<!-- Apache License, Version 2.0 -->
19 | 		</dependency>
20 | 		<dependency>
21 | 			<groupId>org.apache.commons</groupId>
22 | 			<artifactId>commons-math3</artifactId>
23 | 			<version>${commons-math3.version}</version>
24 | 			<!-- Apache License, Version 2.0 -->
25 | 		</dependency>
26 | 		<dependency>
27 | 			<groupId>org.apache.commons</groupId>
28 | 			<artifactId>commons-lang3</artifactId>
29 | 			<version>${commons-lang3.version}</version>
30 | 			<!-- Apache License, Version 2.0 -->
31 | 		</dependency>
32 | 		<dependency>
33 | 			<groupId>org.apache.commons</groupId>
34 | 			<artifactId>commons-compress</artifactId>
35 | 			<version>${commons-compress.version}</version>
36 | 			<!-- Apache License, Version 2.0 -->
37 | 		</dependency>
38 | 	</dependencies>
39 | 
40 | </project>


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/formats/SimpleTsvIterable.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.formats;
19 | 
20 | import java.io.*;
21 | import java.util.*;
22 | 
23 | import com.google.common.collect.*;
24 | 
25 | import com.ibm.research.ai.ki.util.*;
26 | 
27 | public class SimpleTsvIterable implements Iterable<String[]> {
28 | 	protected String filename;
29 | 	protected boolean skipHeader;
30 | 	
31 | 	public SimpleTsvIterable(File file) {
32 | 		this(file.getAbsolutePath(),false);
33 | 	}
34 | 	public SimpleTsvIterable(File file, boolean skipHeader) {
35 | 		this(file.getAbsolutePath(),skipHeader);
36 | 	}
37 | 	
38 | 	public SimpleTsvIterable(String filename) {
39 | 		this(filename,false);
40 | 	}
41 | 	public SimpleTsvIterable(String filename, boolean skipHeader) {
42 | 		this.filename = filename;
43 | 		this.skipHeader = skipHeader;
44 | 	}	
45 | 	
46 | 	@Override
47 | 	public Iterator<String[]> iterator() {
48 | 		Iterator<String> lineIter = Iterators.filter(
49 | 				FileUtil.getRawLines(filename).iterator(), 
50 | 				s -> !s.isEmpty());
51 | 		if (skipHeader && lineIter.hasNext())
52 | 			lineIter.next();
53 | 		return new NextOnlyIterator<String[]>() {
54 | 			@Override
55 | 			protected String[] getNext() {
56 | 				if (!lineIter.hasNext())
57 | 					return null;
58 | 				return lineIter.next().split("\t");
59 | 			}
60 | 			
61 | 		};
62 | 	}
63 | 
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/CombinedSpans.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import com.google.common.collect.*;
21 | 
22 | public class CombinedSpans {
23 | 	public CombinedSpans() {
24 | 		spans = TreeRangeSet.create();
25 | 	}
26 | 	protected CombinedSpans(RangeSet<Integer> spans) {
27 | 		this.spans = spans;
28 | 	}
29 | 	protected RangeSet<Integer> spans;
30 | 	
31 | 	public void add(Span s) {
32 | 		spans.add(Range.closedOpen(s.start, s.end));
33 | 	}
34 | 	
35 | 	public boolean contains(Span s) {
36 | 		return spans.encloses(Range.closedOpen(s.start, s.end));
37 | 	}
38 | 	
39 | 	public boolean contains(int position) {
40 | 		return spans.contains(position);
41 | 	}
42 | 	
43 | 	public CombinedSpans complement() {
44 | 		return new CombinedSpans(spans.complement());
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/FirstPairComparator.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import java.io.*;
21 | import java.util.Comparator;
22 | 
23 | /**
24 |  * orders Pairs based on their first element
25 |  * @author partha
26 |  *
27 |  */
28 | public class FirstPairComparator implements Comparator<Pair>, Serializable {
29 | 
30 |   private Comparator comparator;
31 |   boolean reverse;
32 |   
33 |   /**
34 |    * default constructor that assumes the first element is Comparable
35 |    */
36 |   public FirstPairComparator() {
37 |    
38 |   }
39 |   /**
40 |    * constructor that takes a Comparator where null means treat the elements as Comparable
41 |    * @param comp
42 |    */
43 |   public FirstPairComparator(Comparator comparator) {
44 |     this.comparator = comparator;
45 |   }
46 |   
47 |   /**
48 |    * reverses the order
49 |    */
50 |   public void setReverseOrdering() {
51 |     this.reverse = true;
52 |   }
53 |   
54 |   @Override
55 |   public int compare(Pair p1, Pair p2) {
56 | 	  if (p1 == null) throw new IllegalArgumentException();
57 | 	  if (p2 == null) throw new IllegalArgumentException();
58 |     if (comparator != null) {
59 |       return reverse?comparator.compare(p2.first, p1.first):comparator.compare(p1.first, p2.first);
60 |     } else {
61 |       return reverse?((Comparable) p2.first).compareTo(p1.first): ((Comparable) p1.first).compareTo(p2.first);  //null means treat the elements as Comparable
62 |     }
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/LogLinear.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
/**
 * Small collection of helpers for working with probabilities and log-odds.
 */
public class LogLinear {
	/**
	 * Log-odds of a probability; the inverse of {@link #logistic(double)}.
	 * @param x a probability
	 * @return log(x / (1 - x))
	 */
	public static double logit(double x) {
		final double odds = x / (1-x);
		return Math.log(odds);
	}

	/**
	 * The sigmoid function.
	 * @param x any real value
	 * @return 1 / (1 + e^-x)
	 */
	public static double logistic(double x) {
		final double denominator = 1.0+Math.exp(-x);
		return 1.0/denominator;
	}

	/**
	 * Linearly squeezes x away from zero and one: an input in [0,1] is mapped
	 * into [smoothby, 1-smoothby].
	 * @param x the value to smooth
	 * @param smoothby the margin kept from each end of the unit interval
	 * @return (1 - 2*smoothby) * x + smoothby
	 */
	public static double smooth(double x, double smoothby) {
		final double scale = 1-2*smoothby;
		return scale*x+smoothby;
	}
}
48 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/MutableDouble.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import java.io.Serializable;
21 | import java.util.*;
22 | 
23 | 
/**
 * A mutable holder for a double, ordered by its numeric value.
 */
public class MutableDouble implements Cloneable, Comparable<MutableDouble>, Serializable {

  private static final long serialVersionUID = 1L;

  public double value;

  /** Creates a holder with value 0. */
  public MutableDouble() {
    value = 0;
  }

  /** Creates a holder with the given value. */
  public MutableDouble(double value) {
    this.value = value;
  }

  /**
   * Numeric ordering of the held values. {@code -0.0} and {@code 0.0} compare as equal.
   * NaN is ordered after every other value and equal to itself, so the ordering stays
   * antisymmetric and transitive (a plain {@code <}/{@code ==} chain reports NaN as
   * "greater" in both directions, which can make sorts fail).
   * @return -1, 0 or 1
   */
  @Override
  public int compareTo(MutableDouble that) {
    if (this.value < that.value) return -1;
    if (this.value > that.value) return 1;
    if (this.value == that.value) return 0;
    // Only reachable when at least one side is NaN: the NaN side is the larger one.
    return Boolean.compare(Double.isNaN(this.value), Double.isNaN(that.value));
  }

  @Override
  public String toString() {
    return String.valueOf(this.value);
  }

	/**
	 * Orders MutableDoubles by the magnitude of their value.
	 * A null argument on either side compares as equal (returns 0).
	 */
	public static class AbsValueComparator implements Comparator<MutableDouble> {
		@Override
		public int compare(MutableDouble o1, MutableDouble o2) {
			if (o1 == null || o2 == null) return 0;
			// Double.compare gives NaN a fixed position (last) instead of "equal to everything".
			return Double.compare(Math.abs(o1.value), Math.abs(o2.value));
		}

	}
}
56 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/MutableInteger.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import java.io.Serializable;
21 | 
/**
 * A mutable holder for an int, ordered by its numeric value.
 */
public class MutableInteger implements Cloneable, Comparable<MutableInteger>, Serializable {

  private static final long serialVersionUID = 1L;

  public int value;

  /** Creates a holder with value 0. */
  public MutableInteger() {
    this(0);
  }

  /** Creates a holder with the given value. */
  public MutableInteger(int value) {
    this.value = value;
  }

  /** @return -1, 0 or 1 according to the numeric order of the held values */
  @Override
  public int compareTo(MutableInteger that) {
    if (this.value < that.value) {
      return -1;
    }
    return this.value == that.value ? 0 : 1;
  }

  /** @return the decimal representation of the held value */
  @Override
  public String toString() {
    return Integer.toString(value);
  }

}
46 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/NextOnlyIterator.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import java.util.*;
21 | 
/**
 * Skeleton iterator for sources that can only answer "give me the next item".
 * Subclasses implement {@link #getNext()} and return null to signal the end;
 * consequently null can never be an element. {@link #close()} is invoked once
 * the end is detected.
 */
public abstract class NextOnlyIterator<T> implements Iterator<T>, AutoCloseable {
	/** Element fetched by hasNext() but not yet handed out; null when nothing is buffered. */
	private T next;
	/** Set once getNext() has returned null. */
	private boolean done = false;

	/**
	 * @return the next element, or null when there are no more
	 */
	abstract protected T getNext();

	/** Removal is not supported. */
	@Override
	public void remove() {
		throw new UnsupportedOperationException();
	}

	/**
	 * Fetches and buffers one element if none is buffered yet.
	 * Calls close() the first time the end of the data is seen.
	 */
	@Override
	public boolean hasNext() {
		if (done) {
			return false;
		}
		if (next == null) {
			next = getNext();
		}
		if (next != null) {
			return true;
		}
		done = true;
		close();
		return false;
	}

	/**
	 * Returns the buffered element if there is one, otherwise fetches a new one.
	 * @return the next element, or null (not an exception) once the data is exhausted
	 */
	@Override
	public T next() {
		if (done) {
			return null;
		}
		T result = next;
		if (result != null) {
			next = null;
			return result;
		}
		result = getNext();
		if (result == null) {
			done = true;
			close();
		}
		return result;
	}

	/** Hook for releasing resources; does nothing by default. */
	public void close() {}

	/** Closes the iterator on garbage collection if it was never run to the end. */
	@Override
	protected void finalize() throws Throwable {
	    super.finalize();
	    if (done) {
	    	return;
	    }
	    close();
	}
}
71 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/eval/MultiPrecisionRecall.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.eval;
19 | 
20 | import java.util.*;
21 | 
22 | import com.google.common.collect.*;
23 | 
24 | import com.ibm.research.ai.ki.util.eval.PrecisionRecall.*;
25 | 
26 | public class MultiPrecisionRecall {
27 | 	public static final String ALL = "ALL";
28 | 	
29 | 	public Map<String, PrecisionRecall> prs;
30 | 	
31 | 	public MultiPrecisionRecall() {
32 | 		prs = new HashMap<>();
33 | 		prs.put(ALL, new PrecisionRecall());
34 | 	}
35 | 	
36 | 	public void addAnswered(String id, double score, boolean relevant, double weight, String... tags) {
37 | 		Instance inst = new Instance(id, score, relevant, weight);
38 | 		for (String t : tags) {
39 | 			if (t == null)
40 | 				continue;
41 | 			if (t.equals(ALL))
42 | 				throw new IllegalArgumentException("The tag '"+ALL+"' is reserved");
43 | 			//CONSIDER: check for duplicate tags?
44 | 			prs.computeIfAbsent(t, s -> new PrecisionRecall()).addAnswered(inst);
45 | 		}
46 | 		prs.get(ALL).addAnswered(inst);
47 | 	}
48 | 	
49 | 	public void addOutOfRecall(int outOfRecallCount, String... tags) {
50 | 		for (String t : tags) {
51 | 			if (t.equals(ALL))
52 | 				throw new IllegalArgumentException("The tag '"+ALL+"' is reserved");
53 | 			//CONSIDER: check for duplicate tags?
54 | 			prs.computeIfAbsent(t, s -> new PrecisionRecall()).addOutOfRecall(outOfRecallCount);
55 | 		}
56 | 		prs.get(ALL).addOutOfRecall(outOfRecallCount);
57 | 	}
58 | 	
59 | 	public Map<String,PrecisionRecall.SummaryScores> computeSummaryScores() {
60 | 		return Maps.transformValues(prs, pr -> pr.computeSummaryScores());
61 | 	}
62 | 	
63 | }
64 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/graphs/GraphAlgorithms.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.graphs;
19 | 
20 | import java.util.*;
21 | import java.util.function.*;
22 | 
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | public class GraphAlgorithms {
26 |     /**
27 |      * Construct the transitive closure of things related to nodes.
28 |      * @param nodes
29 |      * @param getRelated get the nodes related to a given node
30 |      * @return mapping from the members of nodes, to nodes that they are related to, and nodes those are transitively related to
31 |      */
32 |     public static <Node, IN extends Iterable<Node>> Map<Node,Set<Node>> transitiveClosure(Iterable<Node> nodes, Function<Node,IN> getRelated) {
33 |         Map<Node,Set<Node>> tc = new HashMap<>();
34 |         for (Node ni : nodes) {
35 |             Set<Node> rel = tc.computeIfAbsent(ni, k -> new HashSet<>());
36 |             IN rn = getRelated.apply(ni);
37 |             if (rn != null) {
38 |                 for (Node ri : rn) {
39 |                     rel.add(ri);
40 |                 }
41 |             }
42 |         }
43 |         
44 |         boolean changed;
45 |         Set<Node> toAdd = new HashSet<>();
46 |         do {
47 |             changed = false;
48 |             for (Map.Entry<Node, Set<Node>> e : tc.entrySet()) {
49 |                 toAdd.clear();
50 |                 for (Node r : e.getValue()) {
51 |                     toAdd.addAll(Lang.NVL(tc.get(r),Collections.EMPTY_SET));
52 |                 }
53 |                 if (e.getValue().addAll(toAdd))
54 |                     changed = true;
55 |             }
56 |         } while (changed);
57 |         
58 |         return tc;
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/io/OldVersionOf.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.io;
19 | 
/**
 * Used by RefactoringObjectInputStream.
 * To break serialization compatibility for a class Foo:
 * <ol>
 * <li>copy the old version to a new class named FooV1</li>
 * <li>have FooV1 implement OldVersionOf&lt;Foo&gt;</li>
 * <li>change Foo as desired and write the convert function in FooV1</li>
 * <li>update the serialVersionUID in Foo</li>
 * <li>create a mapping in serializedMappings.properties: com.ibm.Foo:oldSerialVersionId -> com.ibm.FooV1</li>
 * </ol>
 * @author mrglass
 *
 * @param <T> the class that it is an old version of
 */
public interface OldVersionOf<T> {
	/**
	 * @return this old-format object expressed as an instance of the current class
	 */
	T convert();
}
35 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/parallel/ISimpleExecutor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.parallel;
19 | 
/**
 * Minimal contract for something that runs submitted tasks and can be waited on.
 */
public interface ISimpleExecutor {
	/**
	 * @return the number of threads that this executor uses
	 */
	int getNumProcessors();

	/**
	 * Blocks until every submitted task has finished, checking every milliPoll milliseconds.
	 * @param milliPoll polling interval in milliseconds
	 */
	void awaitFinishing(long milliPoll);

	/**
	 * Blocks until every submitted task has finished.
	 */
	void awaitFinishing();

	/**
	 * Submits a task to be run in parallel with the other submitted tasks.
	 * @param task the work to run
	 */
	void execute(Runnable task);

	/**
	 * Waits for the tasks submitted so far and stops accepting new ones.
	 * The executor must not be used once this has been called.
	 */
	void shutdown();

	/**
	 * @return true if all submitted tasks have finished
	 */
	boolean isFinished();
}
53 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/parallel/SingleThreadedExecutor.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.parallel;
19 | 
20 | public class SingleThreadedExecutor implements ISimpleExecutor {
21 | 
22 | 	@Override
23 | 	public int getNumProcessors() {
24 | 		return 1;
25 | 	}
26 | 
27 | 	@Override
28 | 	public void awaitFinishing(long milliPoll) {	
29 | 	}
30 | 
31 | 	@Override
32 | 	public void awaitFinishing() {
33 | 	}
34 | 
35 | 	@Override
36 | 	public void execute(Runnable task) {
37 | 		task.run();
38 | 	}
39 | 
40 | 	@Override
41 | 	public void shutdown() {
42 | 	}
43 | 
44 | 	public boolean isFinished() {
45 | 		return true;
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/java/com/ibm/research/ai/ki/util/parallel/StreamEater.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util.parallel;
19 | 
20 | import java.io.*;
21 | import java.util.function.*;
22 | 
23 | import com.ibm.research.ai.ki.util.*;
24 | 
/**
 * Daemon thread that drains a reader line by line, optionally passing each line to a handler.
 */
public class StreamEater extends Thread {
	/**
	 * Starts a daemon thread that reads {@code in} until end of stream and then closes it.
	 * @param in the reader to drain; it is closed by the thread
	 * @param lineHandler receives each line (without its terminator); may be null to discard the lines
	 * @return the already-started thread, so the caller can join() it
	 */
	public static StreamEater eatStream(BufferedReader in, Consumer<String> lineHandler) {
		StreamEater e = new StreamEater(in, lineHandler);
		e.setDaemon(true);
		e.start();
		return e;
	}

	private StreamEater(BufferedReader in, Consumer<String> lineHandler) {
		this.in = in;
		this.lineHandler = lineHandler;
	}
	private BufferedReader in;
	private Consumer<String> lineHandler;

	/**
	 * Reads lines until end of stream. Any exception from reading, from the handler or
	 * from closing is rethrown wrapped in an Error.
	 */
	@Override
	public void run() {
		// try-with-resources: the reader is closed even when readLine or the handler throws
		try (BufferedReader reader = in) {
			String line;
			while ((line = reader.readLine()) != null) {
				if (lineHandler != null)
					lineHandler.accept(line);
			}
		} catch (Exception e) {
			throw new Error(e);
		}
	}
}


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/main/resources/com/ibm/research/ai/ki/util/serializedMappings.properties:
--------------------------------------------------------------------------------
1 | #old class name = new class name
2 | 
3 | #example:
4 | #com.ibm.research.ai.ki.Anchor=com.ibm.research.ai.ki.kbp.Anchor


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/BjUtilTestCounter.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
/**
 * Tiny program that prints the numbers 0 to 1999, one per line, pausing 10 ms
 * before each. ExecuteJavaProc.main launches it as a child process.
 */
public class BjUtilTestCounter {
  public static void main(String[] args) throws InterruptedException {
    int count = 0;
    while (count < 2000) {
      Thread.sleep(10);
      System.out.println(count);
      count++;
    }
  }
}
28 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/ExecuteJavaProc.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import java.io.File;
21 | import java.io.IOException;
22 | 
23 | import com.ibm.research.ai.ki.util.*;
24 | 
25 | public class ExecuteJavaProc {
26 |   private ExecuteJavaProc() {
27 |   }
28 | 
29 |   public static int exec(Class klass) throws IOException, InterruptedException {
30 |     String javaHome = System.getProperty("java.home");
31 |     String javaBin = javaHome + File.separator + "bin" + File.separator + "java";
32 |     String classpath = System.getProperty("java.class.path");
33 |     String className = klass.getCanonicalName();
34 |     ProcessBuilder builder = new ProcessBuilder(javaBin, "-cp", classpath, className);
35 |     Process process = builder.start();
36 |     FileUtil.readProcessAsString(process);
37 |     process.waitFor();
38 |     return process.exitValue();
39 |   }
40 |   
41 |   public static void main(String[] args) throws IOException, InterruptedException {
42 |     exec(BjUtilTestCounter.class);
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/NBestTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import static org.junit.Assert.assertEquals;
21 | 
22 | import java.util.List;
23 | 
24 | import com.ibm.research.ai.ki.util.*;
25 | 
26 | import org.junit.Test;
27 | 
28 | public class NBestTest {
29 | 
30 |   /** Adds 20 randomly aged people to a 5-slot NBest, then checks what empty() returns. */
31 |   @Test
32 |   public void testAddTAndEmpty() {
33 |     final int capacity = 5;
34 |     NBest<Person> best = new NBest<Person>(capacity);
35 |     int oldest = 0;
36 |     for (int idx = 0; idx < 20; idx++) {
37 |       // age drawn from [1, 100]
38 |       int age = 1 + (int) (Math.random() * 100);
39 |       best.add(new Person("Name_" + idx, age));
40 |       oldest = Math.max(oldest, age);
41 |     }
42 |     List<Person> drained = best.empty();
43 |     assertEquals(capacity, drained.size());
44 |     // the first drained element must carry the largest age that was added
45 |     assertEquals(oldest, drained.get(0).age);
46 |     // a second empty() right after the first must yield nothing
47 |     assertEquals(0, best.empty().size());
48 |   }
49 | 
50 |   /** Test fixture compared by age alone; the name only shows up in toString(). */
51 |   class Person implements Comparable<Person>{
52 |     private String name;
53 |     private int age;
54 | 
55 |     public Person(String name, int age) {
56 |       this.name = name;
57 |       this.age = age;
58 |     }
59 | 
60 |     @Override
61 |     public int compareTo(Person other) {
62 |       return Integer.compare(this.age, other.age);
63 |     }
64 | 
65 |     @Override
66 |     public String toString() {
67 |       return name+" : "+age;
68 |     }
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/NonOverlappingTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import static org.junit.Assert.*;
21 | 
22 | import java.util.*;
23 | 
24 | import com.ibm.research.ai.ki.util.*;
25 | 
26 | import org.junit.*;
27 | 
28 | public class NonOverlappingTest {
29 | 	/** Checks that NonOverlappingSpans.addSpan agrees with a brute-force overlap scan on random spans. */
30 | 	@Test
31 | 	public void vsLinearScan() {
32 | 		Random rng = new Random(123);
33 | 		for (int scale : new int[] {1, 10, 100}) {
34 | 			// nanoseconds spent per strategy; only the commented-out report below reads them
35 | 			long indexedNanos = 0, scanNanos = 0;
36 | 			for (int round = 0; round < 100; ++round) {
37 | 				List<Span> spans = SpanTest.randomSpans(rng, scale);
38 | 				if (rng.nextBoolean())
39 | 					Collections.sort(spans, new Span.LengthComparator().reversed());
40 | 
41 | 				List<Span> accepted = new ArrayList<>();
42 | 				NonOverlappingSpans index = new NonOverlappingSpans();
43 | 				for (Span candidate : spans) {
44 | 					// reference answer: keep the candidate only if no already accepted span overlaps it
45 | 					long t0 = System.nanoTime();
46 | 					boolean scanOk = true;
47 | 					for (int i = 0; scanOk && i < accepted.size(); ++i) {
48 | 						scanOk = !accepted.get(i).overlaps(candidate);
49 | 					}
50 | 					if (scanOk) {
51 | 						accepted.add(candidate);
52 | 					}
53 | 					scanNanos += System.nanoTime() - t0;
54 | 
55 | 					t0 = System.nanoTime();
56 | 					boolean indexedOk = index.addSpan(candidate);
57 | 					indexedNanos += System.nanoTime() - t0;
58 | 
59 | 					assertEquals(scanOk, indexedOk);
60 | 				}
61 | 			}
62 | 			//System.out.println("Speedup = "+(double)scanNanos/(double)indexedNanos);
63 | 		}
64 | 	}
65 | }
66 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/PropertyLoaderTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import static org.junit.Assert.assertEquals;
21 | 
22 | import com.ibm.research.ai.ki.util.*;
23 | 
24 | import org.junit.Test;
25 | 
26 | public class PropertyLoaderTest {
27 |   /** Loads the properties under a dotted and a slash-separated name; each must map "name" to "value". */
28 |   @Test
29 |   public void testLoadProperties() {
30 |     for (String resource : new String[] {"com.ibm.research.ai.ki.util.1", "/com/ibm/research/ai/ki/util/1"}) {
31 |       assertEquals("value", PropertyLoader.loadProperties(resource).get("name"));
32 |     }
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/java/com/ibm/research/ai/ki/util/RandomUtilTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * cc-dbp-dataset
 3 |  *
 4 |  * Copyright (c) 2017 IBM
 5 |  *
 6 |  * The author licenses this file to You under the Apache License, Version 2.0
 7 |  * (the "License"); you may not use this file except in compliance with
 8 |  * the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.ibm.research.ai.ki.util;
19 | 
20 | import static org.junit.Assert.*;
21 | 
22 | import java.util.HashMap;
23 | import java.util.HashSet;
24 | import java.util.Map;
25 | import java.util.Set;
26 | 
27 | import com.ibm.research.ai.ki.util.*;
28 | 
29 | import org.junit.Test;
30 | 
31 | public class RandomUtilTest {
32 | 
33 |   @Test
34 |   public void testRandomInt() {
35 |     // every draw with bounds (0, 1) is expected to be 0
36 |     for (int draw = 100; draw > 0; draw--) {
37 |       assertEquals(0, RandomUtil.randomInt(0, 1));
38 |     }
39 |   }
40 | 
41 |   @Test
42 |   public void testRandomMemberAndRemove() {
43 |     Set<Integer> pool = new HashSet<Integer>();
44 |     for (int value = 0; value < 10000; value++) pool.add(value);
45 |     // a sampled member must belong to the set it was sampled from
46 |     for (int round = 0; round < 10000; round++) {
47 |       assertTrue(pool.contains(RandomUtil.randomMember(pool)));
48 |     }
49 |     // NOTE(review): 100000 removals vs 10000 elements -- presumably the set is empty for most calls; confirm intended
50 |     for (int round = 0; round < 100000; round++) {
51 |       assertFalse(pool.contains(RandomUtil.removeRandom(pool)));
52 |     }
53 |   }
54 | 
55 |   @Test
56 |   public void testRandomEntry() {
57 |     Map<Integer, Integer> identity = new HashMap<Integer, Integer>();
58 |     for (int key = 0; key < 10000; key++) identity.put(key, key);
59 |     // the key of a sampled entry must be a key of the map
60 |     for (int round = 0; round < 10000; round++) {
61 |       assertTrue(identity.containsKey(RandomUtil.randomEntry(identity).getKey()));
62 |     }
63 |   }
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/com.ibm.research.ai.ki.util/src/test/resources/com/ibm/research/ai/ki/util/1.properties:
--------------------------------------------------------------------------------
1 | name=value
2 | 


--------------------------------------------------------------------------------
/config.properties:
--------------------------------------------------------------------------------
 1 | #CommonCrawlConfig
 2 | 
 3 | language=en
 4 | minLanguageConfidence=0.8
 5 | numThreads=8
 6 | #save these types as offset annotations in the corpus
 7 | annotationTypes = [LinkAnnotation]
 8 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
 9 | 
10 | #support downloading only a portion of Common Crawl
11 | #warcFileLimit=1000
12 | 
13 | 
14 | #DBpediaKBConfig
15 | 
16 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
17 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
18 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
19 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
20 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
21 | 
22 | #for ground truth
23 | maxNodeCorpusCount = 500000
24 | minNodeCorpusCount = 1
25 | useRelationTaxonomy = True
26 |     
27 | #for the coarse-grained type system
28 | minTypeSize = 3000
29 | maxNumberOfTypes = 100
30 |     
31 | #for the type filter
32 | minTypePairFreq = 1
33 | 
34 | noNodeCorpusCounts = False
35 | 
36 | 
37 | #RelexConfig
38 | 
39 | documentSampleFraction = 1.0
40 | negativeExampleSampleFraction = 0.05
41 | targetNegativeToPositveRatio = -1
42 | directionStyle = bothWays
43 | titleContext = False
44 | sectionContext = False
45 | limitEntitiesToGroundTruth = False
46 | gtTypes = False
47 | vocabLimit = 2000000
48 | vocabMinCount = 2
49 | minMentionSet = 1
50 | maxMentionSet = 100
51 | maxMentionGroups = 5
52 | maxPositionEmbeddings = 80
53 | typeStyle = single
54 | datasetSplitNames = [train, validate, test]
55 | datasetSpitFractions = [0.8, 0.1, 0.1]
56 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter


--------------------------------------------------------------------------------
/configSmall-de.properties:
--------------------------------------------------------------------------------
 1 | #CommonCrawlConfig
 2 | 
 3 | #German language (changed en->de)
 4 | language=de
 5 | 
 6 | minLanguageConfidence=0.8
 7 | numThreads=8
 8 | #save these types as offset annotations in the corpus
 9 | annotationTypes = [LinkAnnotation]
10 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
11 | 
12 | #support downloading only a portion of Common Crawl
13 | warcFileLimit=1000
14 | 
15 | 
16 | #DBpediaKBConfig
17 | 
18 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
19 | 
20 | #German language (changed urls en->de)
21 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/mappingbased_objects_de.ttl.bz2
22 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/mappingbased_literals_de.ttl.bz2
23 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/labels_de.ttl.bz2
24 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/de/instance_types_transitive_de.ttl.bz2
25 | 
26 | #for ground truth
27 | #German language (50000 -> 5000) because there is less German text
28 | maxNodeCorpusCount = 5000
29 | minNodeCorpusCount = 1
30 | useRelationTaxonomy = True
31 |     
32 | #for the coarse-grained type system
33 | #German language (3000 -> 1000) because the German KB is smaller
34 | minTypeSize = 1000
35 | maxNumberOfTypes = 100
36 |     
37 | #for the type filter
38 | minTypePairFreq = 1
39 | 
40 | noNodeCorpusCounts = False
41 | 
42 | 
43 | #RelexConfig
44 | 
45 | documentSampleFraction = 1.0
46 | negativeExampleSampleFraction = 0.05
47 | targetNegativeToPositveRatio = -1
48 | directionStyle = bothWays
49 | titleContext = False
50 | sectionContext = False
51 | limitEntitiesToGroundTruth = False
52 | gtTypes = False
53 | vocabLimit = 2000000
54 | vocabMinCount = 2
55 | minMentionSet = 1
56 | maxMentionSet = 100
57 | maxMentionGroups = 5
58 | maxPositionEmbeddings = 80
59 | typeStyle = single
60 | datasetSplitNames = [train, validate, test]
61 | datasetSpitFractions = [0.8, 0.1, 0.1]
62 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter
63 | 


--------------------------------------------------------------------------------
/configSmall.properties:
--------------------------------------------------------------------------------
 1 | #CommonCrawlConfig
 2 | 
 3 | language=en
 4 | minLanguageConfidence=0.8
 5 | numThreads=8
 6 | #save these types as offset annotations in the corpus
 7 | annotationTypes = [LinkAnnotation]
 8 | urlPrefix = https://commoncrawl.s3.amazonaws.com/
 9 | 
10 | #support downloading only a portion of Common Crawl
11 | warcFileLimit=1000
12 | 
13 | 
14 | #DBpediaKBConfig
15 | 
16 | dbpediaOwlUrl=http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl
17 | objectsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2
18 | literalsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_literals_en.ttl.bz2
19 | labelsUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/labels_en.ttl.bz2
20 | typesUrl=http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
21 | 
22 | #for ground truth
23 | maxNodeCorpusCount = 50000
24 | minNodeCorpusCount = 1
25 | useRelationTaxonomy = True
26 |     
27 | #for the coarse-grained type system
28 | minTypeSize = 3000
29 | maxNumberOfTypes = 100
30 |     
31 | #for the type filter
32 | minTypePairFreq = 1
33 | 
34 | noNodeCorpusCounts = False
35 | 
36 | 
37 | #RelexConfig
38 | 
39 | documentSampleFraction = 1.0
40 | negativeExampleSampleFraction = 0.05
41 | targetNegativeToPositveRatio = -1
42 | directionStyle = bothWays
43 | titleContext = False
44 | sectionContext = False
45 | limitEntitiesToGroundTruth = False
46 | gtTypes = False
47 | vocabLimit = 2000000
48 | vocabMinCount = 2
49 | minMentionSet = 1
50 | maxMentionSet = 100
51 | maxMentionGroups = 5
52 | maxPositionEmbeddings = 80
53 | typeStyle = single
54 | datasetSplitNames = [train, validate, test]
55 | datasetSpitFractions = [0.8, 0.1, 0.1]
56 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.TypePairEntityPairFilter


--------------------------------------------------------------------------------
/createSmall-de.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Runs ./create.sh with configSmall-de.properties.
 4 | # Usage: ./createSmall-de.sh <directory to save the dataset>
 5 | # create.sh is looked up relative to the current working directory.
 6 | 
 7 | #stop at first error, unset variables are errors
 8 | set -o nounset
 9 | set -o errexit
10 | 
11 | if [ "$#" -ne 1 ]; then
12 |     echo "Please supply a single argument, the directory to save the dataset"
13 |     exit 1
14 | fi
15 | 
16 | # quoted so that a directory containing spaces or glob characters stays a single argument
17 | ./create.sh "$1" configSmall-de.properties
18 | 


--------------------------------------------------------------------------------
/createSmall.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Runs ./create.sh with configSmall.properties.
 4 | # Usage: ./createSmall.sh <directory to save the dataset>
 5 | # create.sh is looked up relative to the current working directory.
 6 | 
 7 | #stop at first error, unset variables are errors
 8 | set -o nounset
 9 | set -o errexit
10 | 
11 | if [ "$#" -ne 1 ]; then
12 |     echo "Please supply a single argument, the directory to save the dataset"
13 |     exit 1
14 | fi
15 | 
16 | # quoted so that a directory containing spaces or glob characters stays a single argument
17 | ./create.sh "$1" configSmall.properties


--------------------------------------------------------------------------------
/unaryConfig.properties:
--------------------------------------------------------------------------------
 1 | #RelexConfig
 2 | 
 3 | documentSampleFraction = 1.0
 4 | negativeExampleSampleFraction = 1.0
 5 | targetNegativeToPositveRatio = -1
 6 | vocabLimit = 2000000
 7 | vocabMinCount = 2
 8 | minMentionSet = 1
 9 | maxMentionSet = 100
10 | maxMentionGroups = 5
11 | maxPositionEmbeddings = 80
12 | typeStyle = single
13 | datasetSplitNames = [train, validate, test]
14 | datasetSpitFractions = [0.8, 0.1, 0.1]
15 | entityPairFilterClass = com.ibm.research.ai.ki.kbp.unary.DownsampleEntityFilter
16 | relexManagerClass = com.ibm.research.ai.ki.kbp.unary.RelexDatasetManagerUnary


--------------------------------------------------------------------------------
/unaryCreate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #stop at first error, unset variables are errors
 4 | set -o nounset
 5 | set -o errexit
 6 | 
 7 | scriptDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 8 | 
 9 | # Base directory to save the cc-dbp dataset in
10 | baseDir=$1
11 | # Configuration file to use
12 | config=${2:-unaryConfig.properties}
13 | 
14 | # baseline context set construction
15 | java -Xmx8G -cp com.ibm.research.ai.ki.kbp/target/kbp-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
16 | com.ibm.research.ai.ki.kbp.KBPBuildDataset -unaryConfig $config -in $baseDir/docs-gaz.json.gz.b64 -out $baseDir/dataset -kb $baseDir/kb
17 | 
18 | # show sample of positive context sets
19 | awk  -F $'\t' '$6!=""' $baseDir/dataset/unaryContextSets/contexts-part0.tsv | head
20 | 


--------------------------------------------------------------------------------