├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt └── src ├── main ├── resources │ └── reference.conf └── scala │ └── io │ └── mem0r1es │ └── trank │ ├── TRanker.scala │ ├── pipeline │ ├── EntityLinking.scala │ ├── NER.scala │ ├── PreProcessor.scala │ ├── TypeRanking.scala │ └── TypeRetrieval.scala │ ├── ranking │ ├── ANCESTORS.scala │ ├── ANC_DEPTH.scala │ ├── DEPTH.scala │ ├── HierInfo.scala │ └── RankingAlgo.scala │ └── util │ ├── IndexUtils.scala │ └── TRankIndexType.scala └── test ├── resources ├── exascale.info.html └── exascale.info.txt └── scala └── io └── mem0r1es └── trank ├── pipeline ├── NERSpec.scala └── PreProcessorSpec.scala └── ranking ├── ANCESTORSSpec.scala ├── ANC_DEPTHSpec.scala └── DEPTHSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # sbt specific 2 | dist/* 3 | target/ 4 | lib_managed/ 5 | src_managed/ 6 | project/boot/ 7 | project/plugins/project/ 8 | 9 | # IntelliJ 10 | .idea/ 11 | 12 | # Mac OS X 13 | .DS_Store 14 | 15 | # TRank specific 16 | trank-indexes/ 17 | *.log 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 4 | - 2.10.5 5 | jdk: 6 | - oraclejdk8 7 | - oraclejdk7 8 | - openjdk7 9 | 10 | script: sbt ++$TRAVIS_SCALA_VERSION test 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 
36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [2015] [eXascale Infolab] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 
192 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | TRank [![Build Status](https://travis-ci.org/XI-lab/TRank.png?branch=master)](https://travis-ci.org/XI-lab/TRank) 2 | ===== 3 | TRank implements a Scala pipeline for: 4 | * boilerplate removal on markup content 5 | * Named Entity Recognition 6 | * Entity linking to *DBpedia* URIs 7 | * Entity typing using a novel type hierarchy that combines *DBpedia*, *Yago*, and *schema.org* classes 8 | * Type ranking based on algorithms that underwent thorough evaluation via crowdsourcing 9 | 10 |
11 | For example, a document containing the label *University of Fribourg* will return (scores omitted for brevity): 12 | ```scala 13 | http://dbpedia.org/resource/University_of_Fribourg -> 14 | 15 | Seq(http://dbpedia.org/class/yago/UniversitiesInSwitzerland, 16 | http://dbpedia.org/class/yago/PublicUniversities, 17 | http://schema.org/CollegeOrUniversity, 18 | http://dbpedia.org/ontology/University, 19 | http://dbpedia.org/ontology/EducationalInstitution, 20 | http://schema.org/EducationalOrganization, 21 | http://dbpedia.org/ontology/Organisation, 22 | http://schema.org/Organization, 23 | http://dbpedia.org/ontology/Agent) 24 | ``` 25 |
26 | How To Use TRank 27 | ---------------- 28 | ### API 29 | To use TRank, simply create a TRanker object from any textual content: 30 | ```scala 31 | class TRanker(content: String) 32 | ``` 33 | optionally specifying an alternative ranking algorithm instead of the default ANCESTORS: 34 | ```scala 35 | class TRanker(content: String, rankingAlgo: RankingAlgo) 36 | 37 | trait RankingAlgo { def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] } 38 | ``` 39 | 40 | The results of the whole pipeline are accessible through: 41 | ```scala 42 | TRanker.entityToTRankedTypes: Map[URI, Seq[(URI, Double)]] 43 | ``` 44 | for the final step, and through similar data structures for all the intermediate steps. 45 |
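A minimal usage sketch (the snippet below is illustrative and assumes the Lucene indexes described in the next section are already in place):
```scala
import io.mem0r1es.trank.TRanker

// Run the full pipeline over a plain-text snippet.
val tranker = new TRanker("The University of Fribourg is located in Switzerland.")

// For each linked entity, its types ranked by score (highest first).
for ((entity, rankedTypes) <- tranker.entityToTRankedTypes) {
  println(s"Entity: $entity")
  rankedTypes.foreach { case (typeURI, score) => println(s"  $typeURI ($score)") }
}
```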
46 | ### Indexes 47 | TRank requires 3 Lucene indexes that are available for 48 | [download here](http://trank.exascale.info/downloads/trank-indexes.tgz). 49 | The .tgz can be extracted into the classpath of the library, and TRank will seamlessly use the 3 indexes. 50 | 51 | **IMPORTANT:** do not change the directory structure of `trank-indexes/`. 52 | 53 |
54 | Alternatively, TRank uses the [Typesafe Configuration](https://github.com/typesafehub/config) library to manage user 55 | settings. To override the default path to the indexes, define the `TRank.index_basepath` property. 56 | 57 |
58 | Background 59 | ---------- 60 | Much of Web search and browsing activity is today centered around entities. For this reason, Search Engine Result 61 | Pages (SERPs) increasingly contain information about the searched entities, such as pictures, short summaries, 62 | related entities, and factual information. A key facet that is often displayed on the SERPs and that is instrumental 63 | for many applications is the entity type. However, an entity is usually not associated with a single generic type 64 | in the background knowledge bases but rather with a set of more specific types, which may or may not be relevant given the 65 | document context. For example, one can find on the Linked Open Data cloud the fact that Tom Hanks is a person, an actor, 66 | and a person from Concord, California. All these types are correct, but some may be too general to be interesting (e.g., 67 | person), while others may be interesting but already known to the user (e.g., actor), or may be irrelevant given the 68 | current browsing context (e.g., person from Concord, California). In the associated ISWC 2013 paper, we define the new task of ranking entity 69 | types given an entity and its context. We propose and evaluate new methods to find the most relevant entity type based on 70 | collection statistics and on the graph structure interconnecting entities and types. An extensive experimental evaluation 71 | over several document collections at different levels of granularity (e.g., sentences, paragraphs, etc.) and different 72 | type hierarchies (including DBpedia, Freebase, and schema.org) shows that hierarchy-based approaches provide more accurate 73 | results when picking entity types to be displayed to the end-user. 74 | 75 |
76 | For more information, check the [ISWC2013 paper](https://exascale.info/assets/pdf/entityTypes.pdf). 77 | 78 |
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "io.mem0r1es" 2 | 3 | name := "TRank" 4 | 5 | version := "1.0" 6 | 7 | scalaVersion := "2.11.6" 8 | 9 | scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature") 10 | 11 | 12 | // -------------------- 13 | // --- Dependencies --- 14 | // -------------------- 15 | 16 | resolvers += "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/" 17 | 18 | // CoreNLP + resources 19 | libraryDependencies ++= Seq( 20 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1", 21 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" classifier "models" 22 | ) 23 | 24 | // Lucene deps 25 | libraryDependencies ++= Seq( 26 | "org.apache.lucene" % "lucene-core" % "4.10.4", 27 | "org.apache.lucene" % "lucene-analyzers-common" % "4.10.4", 28 | "org.apache.lucene" % "lucene-queries" % "4.10.4" 29 | ) 30 | 31 | // Misc 32 | libraryDependencies ++= Seq( 33 | "org.apache.tika" % "tika-core" % "1.7", 34 | "org.apache.tika" % "tika-parsers" % "1.7", 35 | "commons-io" % "commons-io" % "2.4", 36 | "com.typesafe" % "config" % "1.2.1", 37 | "com.typesafe.play" %% "play-json" % "2.3.8", 38 | "org.scalatest" %% "scalatest" % "2.2.1" % "test" 39 | ) 40 | 41 |
-------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/TRank/4f69dc9dfc8f60b53ce402382c654f6d7aabe827/project/plugins.sbt -------------------------------------------------------------------------------- /src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | TRank { 2 | index_basepath = "trank-indexes/" 3 | } -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/TRanker.scala: 
-------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank 2 | 3 | import com.typesafe.config.Config 4 | import com.typesafe.config.ConfigFactory 5 | import io.mem0r1es.trank.pipeline.EntityLinking.linkEntities 6 | import io.mem0r1es.trank.pipeline.NER.runNER 7 | import io.mem0r1es.trank.pipeline.PreProcessor.preProcess 8 | import io.mem0r1es.trank.pipeline.TypeRanking.rankTypes 9 | import io.mem0r1es.trank.pipeline.TypeRetrieval.retrieveTypes 10 | import io.mem0r1es.trank.ranking.ANCESTORS 11 | import io.mem0r1es.trank.ranking.RankingAlgo 12 | import java.io.InputStream 13 | import scala.io.Source 14 | import java.io.ByteArrayInputStream 15 | 16 | class TRanker(content: InputStream, rankingAlgo: RankingAlgo, config: Config) { 17 | 18 | config.checkValid(ConfigFactory.defaultReference(), "TRank") 19 | 20 | /** 21 | * Default to standard config. 22 | */ 23 | def this(content: InputStream, rankingAlgo: RankingAlgo) { 24 | this(content, rankingAlgo, ConfigFactory.load()) 25 | } 26 | 27 | /** 28 | * Default to ANCESTORS ranking algorithm, and standard config. 29 | */ 30 | def this(content: InputStream) { 31 | this(content, new ANCESTORS, ConfigFactory.load()) 32 | } 33 | 34 | /** 35 | * Default to standard config. 36 | */ 37 | def this(contentStr: String, rankingAlgo: RankingAlgo) { 38 | this(new ByteArrayInputStream(contentStr.getBytes()), 39 | rankingAlgo, 40 | ConfigFactory.load()) 41 | } 42 | 43 | /** 44 | * Default to ANCESTORS ranking algorithm, and standard config. 45 | */ 46 | def this(contentStr: String) { 47 | this(new ByteArrayInputStream(contentStr.getBytes()), 48 | new ANCESTORS, 49 | ConfigFactory.load()) 50 | } 51 | 52 | 53 | val contentRaw = content 54 | 55 | // TRank pipeline steps 56 | val contentPreProcessed = preProcess(content) 57 | private val entityLabels = runNER(contentPreProcessed) 58 | val entityToLabel = linkEntities(entityLabels, config) 59 | val entityURIs = entityToLabel.keySet 60 | val entityToTypes = retrieveTypes(entityURIs, config) 61 | val entityToTRankedTypes = rankTypes(entityToTypes, rankingAlgo, config) 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/EntityLinking.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import org.apache.lucene.index.Term 6 | import org.apache.lucene.search.BooleanClause 7 | import org.apache.lucene.search.BooleanClause.Occur 8 | import org.apache.lucene.search.BooleanQuery 9 | import org.apache.lucene.search.IndexSearcher 10 | import org.apache.lucene.search.Query 11 | import org.apache.lucene.search.TermQuery 12 | 13 | import com.typesafe.config.Config 14 | 15 | import io.mem0r1es.trank.util.IndexUtils 16 | import io.mem0r1es.trank.util.TRankIndexType 17 | 18 | object EntityLinking { 19 | 20 | /** 21 | * Links Named Entity labels to DBpedia URIs. 
22 | */ 23 | def linkEntities(entityLabels: Set[String], config: Config): Map[URI, String] = { 24 | var entities = Map[URI, String]() 25 | 26 | entityLabels.foreach { label => 27 | val uri = getURI(label, config) 28 | entities += uri -> label 29 | } 30 | entities.toMap 31 | } 32 | 33 | private def getURI(label: String, config: Config): URI = { 34 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.URI_INDEX, config) 35 | val exact = exactQuery(label, searcher) 36 | val bool = boolQuery(label, searcher) 37 | 38 | exact match { 39 | case Some(x) => return x 40 | case None => 41 | } 42 | bool match { 43 | case Some(x) => return x 44 | case _ => return new URI("") 45 | } 46 | } 47 | 48 | private def exactQuery(label: String, searcher: IndexSearcher): Option[URI] = { 49 | val query = new TermQuery(new Term("labelex", label.toLowerCase())) 50 | top1(query, searcher) 51 | } 52 | 53 | private def boolQuery(label: String, searcher: IndexSearcher): Option[URI] = { 54 | val query = new BooleanQuery 55 | label.toLowerCase().split(" ").foreach { term => 56 | query.add(new BooleanClause(new TermQuery(new Term("label", term)), Occur.SHOULD)) 57 | } 58 | top1(query, searcher) 59 | } 60 | 61 | private def top1(query: Query, searcher: IndexSearcher): Option[URI] = { 62 | val docs = searcher.search(query, 1) 63 | if (docs.scoreDocs.length > 0) { 64 | val d = searcher.doc(docs.scoreDocs(0).doc) 65 | Option(new URI(d.get("uri"))) 66 | } else { 67 | None 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/NER.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import edu.stanford.nlp.ie.crf.CRFClassifier 4 | import java.util.Properties 5 | 6 | object NER { 7 | 8 | private val props = new Properties() 9 | props.put("annotators", "tokenize") 10 | private val classifier = CRFClassifier.getClassifierNoExceptions( 11 | "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz") 12 | 13 | /** 14 | * Runs the Stanford Named Entity Reconizer on the given content, 15 | * returning a Set with all the entity labels. 
16 | */ 17 | def runNER(content: String): Set[String] = { 18 | val annotatedContent = classifier.classifyWithInlineXML(content) 19 | extractEntities(annotatedContent) 20 | } 21 | 22 | private def extractEntities(content: String): Set[String] = { 23 | extractSingleType(content, "<PERSON>", "</PERSON>") ++ 24 | extractSingleType(content, "<LOCATION>", "</LOCATION>") ++ 25 | extractSingleType(content, "<ORGANIZATION>", "</ORGANIZATION>") 26 | } 27 | 28 | private def extractSingleType(content: String, openTag: String, closeTag: String): Set[String] = { 29 | var entities = Set[String]() 30 | 31 | val fragments = content.split(openTag) 32 | fragments.slice(1, fragments.length).foreach { fragment => 33 | val label = fragment.split(closeTag)(0) 34 | entities += label 35 | } 36 | 37 | entities.toSet 38 | } 39 | } 40 |
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/PreProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import org.apache.tika.sax.BodyContentHandler 4 | import org.apache.tika.metadata.Metadata 5 | import org.apache.tika.parser.html.HtmlParser 6 | import java.io.InputStream 7 | import org.apache.tika.parser.ParseContext 8 | 9 | 10 | object PreProcessor { 11 | 12 | /** 13 | * Runs the content pre-processing step (e.g., HTML tags removal) 14 | */ 15 | def preProcess(content: InputStream): String = { 16 | extractTextFromHTML(content) 17 | } 18 | 19 | private def extractTextFromHTML(content: InputStream): String = { 20 | val handler = new BodyContentHandler() 21 | val metadata = new Metadata() 22 | new HtmlParser().parse(content, handler, metadata, new ParseContext()) 23 | 24 | handler.toString 25 | } 26 | } 27 |
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/TypeRanking.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import play.api.libs.json.{JsValue, Json} 6 | 7 | import org.apache.lucene.index.Term 8 | import org.apache.lucene.search.TermQuery 9 | 10 | import com.typesafe.config.Config 11 | 12 | import io.mem0r1es.trank.ranking.HierInfo 13 | import io.mem0r1es.trank.ranking.RankingAlgo 14 | import io.mem0r1es.trank.util.IndexUtils 15 | import io.mem0r1es.trank.util.TRankIndexType 16 | 17 | object TypeRanking { 18 | 19 | def rankTypes(entityTypes: Map[URI, Set[URI]], 20 | rankingAlgo: RankingAlgo, 21 | config: Config): Map[URI, Seq[(URI, Double)]] = { 22 | 23 | var ranked = Map[URI, Seq[(URI, Double)]]() 24 | 25 | entityTypes.foreach { 26 | case (uri, types) => 27 | ranked += uri -> rankingAlgo.rank(types.map { t => t -> queryHier(t, config) }.toMap) 28 | } 29 | ranked 30 | } 31 | 32 | private def queryHier(typeURI: URI, config: Config): HierInfo = { 33 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.PATH_INDEX, config) 34 | 35 | val query = new TermQuery(new Term("uri", typeURI.toString)) 36 | val docs = searcher.search(query, 1) 37 | if (docs.scoreDocs.length > 0) { 38 | val d = searcher.doc(docs.scoreDocs(0).doc) 39 | val level = d.get("level").toInt 40 | val path: JsValue = Json.parse(d.get("path")) 41 | path.asOpt[Array[String]] match { 42 | case Some(l: Array[String]) => new HierInfo(level, l.map(t => new URI(t.toString))) 43 | case _ => new HierInfo(level, Seq[URI]()) 44 | } 45 | } else { 46 | new HierInfo(-1, Seq[URI]()) 47 | } 48 | } 49 | } 50 |
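Because `rankTypes` accepts any `RankingAlgo`, alternative ranking strategies can be plugged into the pipeline by implementing the trait. A minimal sketch, where the PATH_LENGTH strategy is hypothetical and not part of TRank:
```scala
package io.mem0r1es.trank.ranking

import java.net.URI

/**
 * Hypothetical example: rank types by the length of their hierarchy path,
 * i.e., prefer types that have the most ancestors overall.
 */
class PATH_LENGTH extends RankingAlgo {

  override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = {
    entityTypes.toSeq
      .map { case (typeURI, hier) => (typeURI, hier.path.length.toDouble) }
      .sortBy(_._2)
      .reverse
  }
}
```
Such an implementation can then be handed to the pipeline as `new TRanker(content, new PATH_LENGTH)`.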
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/TypeRetrieval.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import scala.Array.canBuildFrom 6 | 7 | import org.apache.lucene.index.Term 8 | import org.apache.lucene.search.TermQuery 9 | 10 | import com.typesafe.config.Config 11 | 12 | import io.mem0r1es.trank.util.IndexUtils 13 | import io.mem0r1es.trank.util.TRankIndexType 14 | 15 | object TypeRetrieval { 16 | 17 | /** 18 | * Given a DBpedia resource URI, retrieve all its RDF types. 19 | */ 20 | def retrieveTypes(entities: Set[URI], config: Config): Map[URI, Set[URI]] = { 21 | var typedEntities = Map[URI, Set[URI]]() 22 | 23 | entities.foreach { entity => 24 | val types = getTypes(entity, config) 25 | typedEntities += entity -> types 26 | } 27 | typedEntities 28 | } 29 | 30 | private def getTypes(entity: URI, config: Config): Set[URI] = { 31 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.TYPE_INDEX, config) 32 | 33 | val query = new TermQuery(new Term("uri", entity.toString)) 34 | val docs = searcher.search(query, 1) 35 | if (docs.scoreDocs.length > 0) { 36 | val d = searcher.doc(docs.scoreDocs(0).doc) 37 | d.getValues("type").map(new URI(_)).toSet 38 | } else { 39 | Set[URI]() 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/ANCESTORS.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class ANCESTORS extends RankingAlgo { 6 | 7 | /** 8 | * Rank types by inverse-sort on the # of ANCESTORS contained in the type set. 9 | */ 10 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 11 | 12 | def score(path: Seq[URI]): Double = { 13 | path.filter (entityTypes.contains(_)) 14 | .length 15 | } 16 | 17 | entityTypes.map { 18 | case (k, v) => (k, score(v.path)) 19 | } 20 | .toSeq 21 | .sortBy(_._2) 22 | .reverse 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/ANC_DEPTH.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class ANC_DEPTH extends RankingAlgo { 6 | 7 | /** 8 | * Rank types by inverse-sort on the # of ANCESTORS contained in the type set, 9 | * weighted by their DEPTHS. 10 | */ 11 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 12 | 13 | def score(hier: HierInfo): Double = { 14 | hier.path.filter (entityTypes.contains(_)) 15 | .map { case uri => entityTypes(uri).level.toDouble } 16 | .sum 17 | } 18 | 19 | entityTypes.map { 20 | case (k, v) => (k, score(v)) 21 | }.toSeq.sortBy(_._2).reverse 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/DEPTH.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import scala.collection.Seq 6 | import scala.collection.immutable.Map 7 | 8 | class DEPTH extends RankingAlgo { 9 | 10 | /** 11 | * Rank types by inverse-sort on the hierarchy level. 
12 | */ 13 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 14 | entityTypes.toSeq.map { case (k,v) => (k, v.level.toDouble)} 15 | .sortBy(_._2) 16 | .reverse 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/HierInfo.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class HierInfo(val level: Int, val path: Seq[URI]) 6 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/RankingAlgo.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | trait RankingAlgo { 6 | 7 | def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] 8 | } -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/util/IndexUtils.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.util 2 | 3 | import java.io.File 4 | 5 | import com.typesafe.config.Config 6 | import org.apache.lucene.index.DirectoryReader 7 | import org.apache.lucene.search.IndexSearcher 8 | import org.apache.lucene.store.NIOFSDirectory 9 | 10 | 11 | object IndexUtils { 12 | 13 | import TRankIndexType._ 14 | private var searcherCache = Map[TRankIndexType, IndexSearcher]() 15 | 16 | def getIndexSearcher(indexType: TRankIndexType, config: Config): IndexSearcher = { 17 | val searcher = searcherCache.get(indexType) 18 | 19 | searcher match { 20 | case Some(value) => value 21 | case None => { 22 | val value = createIndexSearcher(indexType, config) 23 | searcherCache += indexType -> value 24 | value 25 | } 26 | } 27 | } 28 | 29 | private def createIndexSearcher(indexType: TRankIndexType, config: Config): IndexSearcher = { 30 | val indexPath = new File(config.getString("TRank.index_basepath") + "/" + indexType) 31 | val directory = new NIOFSDirectory(indexPath) 32 | val reader = DirectoryReader.open(directory) 33 | new IndexSearcher(reader) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/util/TRankIndexType.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.util 2 | 3 | object TRankIndexType extends Enumeration { 4 | type TRankIndexType = Value 5 | 6 | val URI_INDEX = Value("uriindex") 7 | val TYPE_INDEX = Value("typeindex") 8 | val PATH_INDEX = Value("pathindex") 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/exascale.info.html: -------------------------------------------------------------------------------- 1 |
2 | XI--the eXascale Infolab--is a new research group at the University of Fribourg, Switzerland.
We are designing, building and deploying next-generation infrastructures for Big Data, with a focus on social, scientific, and linked data. 3 |
4 | -------------------------------------------------------------------------------- /src/test/resources/exascale.info.txt: -------------------------------------------------------------------------------- 1 | XI--the eXascale Infolab--is a new research group at the University of Fribourg, Switzerland. 2 | We are designing, building and deploying next-generation infrastructures for Big Data, with a focus on social, scientific, and linked data. 3 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/pipeline/NERSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import io.mem0r1es.trank.pipeline.NER._ 4 | import org.scalatest.FlatSpec 5 | import scala.io.Source 6 | 7 | class NERSpec extends FlatSpec { 8 | 9 | "A NER" should "extract entity labels" in { 10 | val content = Source.fromFile("src/test/resources/exascale.info.txt").mkString 11 | val entities = runNER(content) 12 | assert(entities contains ("Switzerland")) 13 | assert(entities contains ("University of Fribourg")) 14 | } 15 | 16 | it should "not fail with content without Named Entities" in { 17 | val content = "Just some basic text without any named entities." 18 | val entities = runNER(content) 19 | assert(entities.isEmpty) 20 | } 21 | 22 | it should "not fail with empty content" in { 23 | val entities = runNER("") 24 | assert(entities.isEmpty) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/pipeline/PreProcessorSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import io.mem0r1es.trank.pipeline.PreProcessor._ 4 | import org.scalatest.FlatSpec 5 | import scala.io.Source 6 | import java.io.ByteArrayInputStream 7 | import java.io.FileInputStream 8 | 9 | class PreProcessorSpec extends FlatSpec { 10 | 11 | val htmlStr = Source.fromFile("src/test/resources/exascale.info.html").mkString 12 | val txtStr = Source.fromFile("src/test/resources/exascale.info.txt").mkString 13 | 14 | "A PreProcessor" should "remove boilerplate from HTML content" in { 15 | val html = new FileInputStream("src/test/resources/exascale.info.html") 16 | assert(preProcess(html).trim === txtStr.trim) 17 | } 18 | 19 | it should "leave intact textual content" in { 20 | val txt = new FileInputStream("src/test/resources/exascale.info.txt") 21 | assert(preProcess(txt).trim === txtStr.trim) 22 | } 23 | 24 | it should "not fail with empty content" in { 25 | assert(preProcess(new ByteArrayInputStream("".getBytes())) === "") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/ANCESTORSSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class ANCESTORSSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2")))) 11 | val type2 = (new URI("http://type2"), 12 | new HierInfo(5, Seq[URI](new URI("http://type1"), new URI("http://path3")))) 13 | val type3 = (new URI("http://type3"), 14 | new HierInfo(4, Seq[URI](new URI("http://type1"), new URI("http://type2")))) 15 | 16 | "An ANCESTORS ranker" should "rank types properly" in { 17 | val 
ranked = new ANCESTORS().rank(Map(type1, type2, type3)) 18 | assert(ranked(0)._1.toString === "http://type3") 19 | assert(ranked(0)._2 === 2) 20 | } 21 | 22 | it should "not fail when no types are provided" in { 23 | new ANCESTORS().rank(Map[URI, HierInfo]()) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/ANC_DEPTHSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class ANC_DEPTHSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2")))) 11 | val type2 = (new URI("http://type2"), 12 | new HierInfo(5, Seq[URI](new URI("http://type1"), new URI("http://type3")))) 13 | val type3 = (new URI("http://type3"), 14 | new HierInfo(4, Seq[URI](new URI("http://type2"), new URI("http://path3")))) 15 | 16 | "An ANC_DEPTH ranker" should "rank types properly" in { 17 | val ranked = new ANC_DEPTH().rank(Map(type1, type2, type3)) 18 | assert(ranked(0)._1.toString === "http://type2") 19 | assert(ranked(0)._2 === 6) 20 | } 21 | 22 | it should "not fail when no types are provided" in { 23 | new ANC_DEPTH().rank(Map[URI, HierInfo]()) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/DEPTHSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class DEPTHSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2"))) 11 | ) 12 | val type2 = (new URI("http://type2"), 13 | new HierInfo(4, Seq[URI](new URI("http://path2"), new URI("http://path3"))) 14 | ) 15 | 16 | 17 | "A DEPTH ranker" should "rank types by maximum depth" in { 18 | val ranked = new DEPTH().rank(Map(type1, type2)) 19 | assert(ranked(0)._1.toString === "http://type2") 20 | assert(ranked(0)._2 === 4) 21 | } 22 | 23 | it should "not fail when no types are provided" in { 24 | new DEPTH().rank(Map[URI, HierInfo]()) 25 | } 26 | } 27 | --------------------------------------------------------------------------------
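As an end-to-end illustration, a full-pipeline spec could look like the sketch below; it is not part of the test suite and assumes the `trank-indexes/` directory (see the README) is available on the test classpath:
```scala
package io.mem0r1es.trank

import org.scalatest.FlatSpec
import scala.io.Source

class TRankerSpec extends FlatSpec {

  "A TRanker" should "rank types for the entities found in a document" in {
    val content = Source.fromFile("src/test/resources/exascale.info.txt").mkString
    val tranker = new TRanker(content)

    // Each linked entity should come with a score-ordered (descending) type list.
    assert(tranker.entityToTRankedTypes.nonEmpty)
    tranker.entityToTRankedTypes.values.foreach { rankedTypes =>
      assert(rankedTypes.map(_._2) === rankedTypes.map(_._2).sorted.reverse)
    }
  }
}
```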