├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
├── project
├── assembly.sbt
├── build.properties
└── plugins.sbt
└── src
├── main
├── resources
│ └── reference.conf
└── scala
│ └── io
│ └── mem0r1es
│ └── trank
│ ├── TRanker.scala
│ ├── pipeline
│ ├── EntityLinking.scala
│ ├── NER.scala
│ ├── PreProcessor.scala
│ ├── TypeRanking.scala
│ └── TypeRetrieval.scala
│ ├── ranking
│ ├── ANCESTORS.scala
│ ├── ANC_DEPTH.scala
│ ├── DEPTH.scala
│ ├── HierInfo.scala
│ └── RankingAlgo.scala
│ └── util
│ ├── IndexUtils.scala
│ └── TRankIndexType.scala
└── test
├── resources
├── exascale.info.html
└── exascale.info.txt
└── scala
└── io
└── mem0r1es
└── trank
├── pipeline
├── NERSpec.scala
└── PreProcessorSpec.scala
└── ranking
├── ANCESTORSSpec.scala
├── ANC_DEPTHSpec.scala
└── DEPTHSpec.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | # sbt specific
2 | dist/*
3 | target/
4 | lib_managed/
5 | src_managed/
6 | project/boot/
7 | project/plugins/project/
8 |
9 | # IntelliJ
10 | .idea/
11 |
12 | # Mac OS X
13 | .DS_Store
14 |
15 | # TRank specific
16 | trank-indexes/
17 | *.log
18 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.11.7
4 | - 2.10.5
5 | jdk:
6 | - oraclejdk8
7 | - oraclejdk7
8 | - openjdk7
9 |
10 | script: sbt ++$TRAVIS_SCALA_VERSION test
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or
18 | indirect, to cause the direction or management of such entity, whether by
19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
20 | outstanding shares, or (iii) beneficial ownership of such entity.
21 |
22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
23 | permissions granted by this License.
24 |
25 | "Source" form shall mean the preferred form for making modifications, including
26 | but not limited to software source code, documentation source, and configuration
27 | files.
28 |
29 | "Object" form shall mean any form resulting from mechanical transformation or
30 | translation of a Source form, including but not limited to compiled object code,
31 | generated documentation, and conversions to other media types.
32 |
33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
34 | available under the License, as indicated by a copyright notice that is included
35 | in or attached to the work (an example is provided in the Appendix below).
36 |
37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
38 | is based on (or derived from) the Work and for which the editorial revisions,
39 | annotations, elaborations, or other modifications represent, as a whole, an
40 | original work of authorship. For the purposes of this License, Derivative Works
41 | shall not include works that remain separable from, or merely link (or bind by
42 | name) to the interfaces of, the Work and Derivative Works thereof.
43 |
44 | "Contribution" shall mean any work of authorship, including the original version
45 | of the Work and any modifications or additions to that Work or Derivative Works
46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
47 | by the copyright owner or by an individual or Legal Entity authorized to submit
48 | on behalf of the copyright owner. For the purposes of this definition,
49 | "submitted" means any form of electronic, verbal, or written communication sent
50 | to the Licensor or its representatives, including but not limited to
51 | communication on electronic mailing lists, source code control systems, and
52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
53 | the purpose of discussing and improving the Work, but excluding communication
54 | that is conspicuously marked or otherwise designated in writing by the copyright
55 | owner as "Not a Contribution."
56 |
57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
58 | of whom a Contribution has been received by Licensor and subsequently
59 | incorporated within the Work.
60 |
61 | 2. Grant of Copyright License.
62 |
63 | Subject to the terms and conditions of this License, each Contributor hereby
64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
66 | publicly display, publicly perform, sublicense, and distribute the Work and such
67 | Derivative Works in Source or Object form.
68 |
69 | 3. Grant of Patent License.
70 |
71 | Subject to the terms and conditions of this License, each Contributor hereby
72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
73 | irrevocable (except as stated in this section) patent license to make, have
74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
75 | such license applies only to those patent claims licensable by such Contributor
76 | that are necessarily infringed by their Contribution(s) alone or by combination
77 | of their Contribution(s) with the Work to which such Contribution(s) was
78 | submitted. If You institute patent litigation against any entity (including a
79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
80 | Contribution incorporated within the Work constitutes direct or contributory
81 | patent infringement, then any patent licenses granted to You under this License
82 | for that Work shall terminate as of the date such litigation is filed.
83 |
84 | 4. Redistribution.
85 |
86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
87 | in any medium, with or without modifications, and in Source or Object form,
88 | provided that You meet the following conditions:
89 |
90 | You must give any other recipients of the Work or Derivative Works a copy of
91 | this License; and
92 | You must cause any modified files to carry prominent notices stating that You
93 | changed the files; and
94 | You must retain, in the Source form of any Derivative Works that You distribute,
95 | all copyright, patent, trademark, and attribution notices from the Source form
96 | of the Work, excluding those notices that do not pertain to any part of the
97 | Derivative Works; and
98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 |
117 | 5. Submission of Contributions.
118 |
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 |
126 | 6. Trademarks.
127 |
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 |
133 | 7. Disclaimer of Warranty.
134 |
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 |
144 | 8. Limitation of Liability.
145 |
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 |
156 | 9. Accepting Warranty or Additional Liability.
157 |
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 |
167 | END OF TERMS AND CONDITIONS
168 |
169 | APPENDIX: How to apply the Apache License to your work
170 |
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "[]" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 |
179 | Copyright [2015] [eXascale Infolab]
180 |
181 | Licensed under the Apache License, Version 2.0 (the "License");
182 | you may not use this file except in compliance with the License.
183 | You may obtain a copy of the License at
184 |
185 | http://www.apache.org/licenses/LICENSE-2.0
186 |
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TRank [![Build Status](https://travis-ci.org/XI-lab/TRank.svg?branch=master)](https://travis-ci.org/XI-lab/TRank)
2 | =====
3 | TRank implements a Scala pipeline for:
4 | * boilerplate removal on markup content
5 | * Named Entity Recognition
6 | * Entity linkage with *DBpedia* URIs
7 | * Entity typing using a novel type hierarchy that combines *DBpedia*, *Yago*, and *schema.org* classes
8 | * Type ranking based on algorithms that underwent thorough evaluation via crowdsourcing
9 |
10 |
11 | For example, a document containing the label *University of Fribourg* will return:
12 | ```scala
13 | http://dbpedia.org/resource/University_of_Fribourg ->
14 |
15 | Seq(http://dbpedia.org/class/yago/UniversitiesInSwitzerland,
16 | http://dbpedia.org/class/yago/PublicUniversities,
17 | http://schema.org/CollegeOrUniversity,
18 | http://dbpedia.org/ontology/University,
19 | http://dbpedia.org/ontology/EducationalInstitution,
20 | http://schema.org/EducationalOrganization,
21 | http://dbpedia.org/ontology/Organisation,
22 | http://schema.org/Organization,
23 | http://dbpedia.org/ontology/Agent)
24 | ```
25 |
26 | How To Use TRank
27 | ----------------
28 | ### API
29 | To use TRank, it is enough to create a TRanker object with any textual content:
30 | ```scala
31 | class TRanker(content: String)
32 | ```
33 | possibly specifying an alternative ranking algorithm, instead of the default ANCESTORS:
34 | ```scala
35 | class TRanker(content: String, rankingAlgo: RankingAlgo)
36 |
37 | trait RankingAlgo { def rank(???): Seq[URI] }
38 | ```
39 |
40 | The results of the whole pipeline process are accessible through:
41 | ```scala
42 | TRanker.entityToTRankedTypes: Map[URI, Seq[URI]]
43 | ```
44 | for the final step, and through similar data structures for all the intermediate steps.
45 |
46 | ### Indexes
47 | TRank requires 3 Lucene indexes that are available for
48 | [download here](http://trank.exascale.info/downloads/trank-indexes.tgz).
49 | The .tgz can be extracted in the classpath of the library, and TRank will seamlessly start using the 3 indexes.
50 |
51 | **IMPORTANT:** do not change the directory structure of `trank-indexes/`.
52 |
53 |
54 | Alternatively, TRank uses the [Typesafe Configuration](https://github.com/typesafehub/config) library to manage user
55 | settings. To override the default path to the indexes, it is enough to define the `TRank.index_basepath` property.
56 |
57 |
58 | Background
59 | ----------
60 | Much of Web search and browsing activity is today centered around entities. For this reason, Search Engine Result
61 | Pages (SERPs) increasingly contain information about the searched entities such as pictures, short summaries,
62 | related entities, and factual information. A key facet that is often displayed on the SERPs and that is instrumental
63 | for many applications is the entity type. However, an entity is usually not associated with a single generic type
64 | in the background knowledge bases but rather to a set of more specific types, which may be relevant or not given the
65 | document context. For example, one can find on the Linked Open Data cloud the fact that Tom Hanks is a person, an actor,
66 | and a person from Concord, California. All these types are correct but some may be too general to be interesting (e.g.,
67 | person), while others may be interesting but already known to the user (e.g., actor), or may be irrelevant given the
68 | current browsing context (e.g., person from Concord, California). In this paper, we define the new task of ranking entity
69 | types given an entity and its context. We propose and evaluate new methods to find the most relevant entity type based on
70 | collection statistics and on the graph structure interconnecting entities and types. An extensive experimental evaluation
71 | over several document collections at different levels of granularity (e.g., sentences, paragraphs, etc.) and different
72 | type hierarchies (including DBpedia, Freebase, and schema.org) shows that hierarchy-based approaches provide more accurate
73 | results when picking entity types to be displayed to the end-user.
74 |
75 |
76 | For more information, check the [ISWC2013 paper](https://exascale.info/assets/pdf/entityTypes.pdf).
77 |
78 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | organization := "io.mem0r1es"
2 |
3 | name := "TRank"
4 |
5 | version := "1.0"
6 |
7 | scalaVersion := "2.11.6"
8 |
9 | scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")
10 |
11 |
12 | // --------------------
13 | // --- Dependencies ---
14 | // --------------------
15 |
16 | resolvers += "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/"
17 |
18 | // CoreNLP + resources
19 | libraryDependencies ++= Seq(
20 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1",
21 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" classifier "models"
22 | )
23 |
24 | // Lucene deps
25 | libraryDependencies ++= Seq(
26 | "org.apache.lucene" % "lucene-core" % "4.10.4",
27 | "org.apache.lucene" % "lucene-analyzers-common" % "4.10.4",
28 | "org.apache.lucene" % "lucene-queries" % "4.10.4"
29 | )
30 |
31 | // Misc
32 | libraryDependencies ++= Seq(
33 | "org.apache.tika" % "tika-core" % "1.7",
34 | "org.apache.tika" % "tika-parsers" % "1.7",
35 | "commons-io" % "commons-io" % "2.4",
36 | "com.typesafe" % "config" % "1.2.1",
37 | "com.typesafe.play" %% "play-json" % "2.3.8",
38 | "org.scalatest" %% "scalatest" % "2.2.1" % "test"
39 | )
40 |
41 |
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")
2 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.8
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eXascaleInfolab/TRank/4f69dc9dfc8f60b53ce402382c654f6d7aabe827/project/plugins.sbt
--------------------------------------------------------------------------------
/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | TRank {
2 | index_basepath = "trank-indexes/"
3 | }
--------------------------------------------------------------------------------
/src/main/scala/io/mem0r1es/trank/TRanker.scala:
--------------------------------------------------------------------------------
1 | package io.mem0r1es.trank
2 |
3 | import com.typesafe.config.Config
4 | import com.typesafe.config.ConfigFactory
5 | import io.mem0r1es.trank.pipeline.EntityLinking.linkEntities
6 | import io.mem0r1es.trank.pipeline.NER.runNER
7 | import io.mem0r1es.trank.pipeline.PreProcessor.preProcess
8 | import io.mem0r1es.trank.pipeline.TypeRanking.rankTypes
9 | import io.mem0r1es.trank.pipeline.TypeRetrieval.retrieveTypes
10 | import io.mem0r1es.trank.ranking.ANCESTORS
11 | import io.mem0r1es.trank.ranking.RankingAlgo
12 | import java.io.InputStream
13 | import scala.io.Source
14 | import java.io.ByteArrayInputStream
15 |
16 | /**
17 |  * Runs the complete TRank pipeline over `content` while the instance is being
18 |  * constructed, keeping the result of every stage as a member.
19 |  *
20 |  * `config` is checked against the `TRank` section of the reference configuration
21 |  * before any stage runs. The `String`-based constructors turn the text into bytes
22 |  * with `getBytes()`, i.e. using the platform's default charset.
23 |  */
24 | class TRanker(content: InputStream, rankingAlgo: RankingAlgo, config: Config) {
25 |
26 |   config.checkValid(ConfigFactory.defaultReference(), "TRank")
27 |
28 |   /** Stream content, caller-chosen ranking algorithm, `ConfigFactory.load()` config. */
29 |   def this(content: InputStream, rankingAlgo: RankingAlgo) =
30 |     this(content, rankingAlgo, ConfigFactory.load())
31 |
32 |   /** Stream content, ANCESTORS ranking algorithm, `ConfigFactory.load()` config. */
33 |   def this(content: InputStream) =
34 |     this(content, new ANCESTORS, ConfigFactory.load())
35 |
36 |   /** String content, caller-chosen ranking algorithm, `ConfigFactory.load()` config. */
37 |   def this(contentStr: String, rankingAlgo: RankingAlgo) =
38 |     this(
39 |       new ByteArrayInputStream(contentStr.getBytes()),
40 |       rankingAlgo,
41 |       ConfigFactory.load())
42 |
43 |   /** String content, ANCESTORS ranking algorithm, `ConfigFactory.load()` config. */
44 |   def this(contentStr: String) =
45 |     this(
46 |       new ByteArrayInputStream(contentStr.getBytes()),
47 |       new ANCESTORS,
48 |       ConfigFactory.load())
49 |
50 |   // The very stream instance passed in; it is also what preProcess reads below.
51 |   // NOTE(review): presumably already consumed once construction finishes - confirm.
52 |   val contentRaw = content
53 |
54 |   // Pipeline stages. Vals initialise top to bottom, and each stage is fed the
55 |   // output of the one before it.
56 |   val contentPreProcessed = preProcess(content)
57 |   private val entityLabels = runNER(contentPreProcessed)
58 |   val entityToLabel = linkEntities(entityLabels, config)
59 |   val entityURIs = entityToLabel.keySet
60 |   val entityToTypes = retrieveTypes(entityURIs, config)
61 |   val entityToTRankedTypes = rankTypes(entityToTypes, rankingAlgo, config)
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/io/mem0r1es/trank/pipeline/EntityLinking.scala:
--------------------------------------------------------------------------------
1 | package io.mem0r1es.trank.pipeline
2 |
3 | import java.net.URI
4 |
5 | import org.apache.lucene.index.Term
6 | import org.apache.lucene.search.BooleanClause
7 | import org.apache.lucene.search.BooleanClause.Occur
8 | import org.apache.lucene.search.BooleanQuery
9 | import org.apache.lucene.search.IndexSearcher
10 | import org.apache.lucene.search.Query
11 | import org.apache.lucene.search.TermQuery
12 |
13 | import com.typesafe.config.Config
14 |
15 | import io.mem0r1es.trank.util.IndexUtils
16 | import io.mem0r1es.trank.util.TRankIndexType
17 |
18 | object EntityLinking {
19 |
20 |   /**
21 |    * Links Named Entity labels to DBpedia URIs.
22 |    *
23 |    * Labels for which neither query finds a document are left out of the result
24 |    * (they are not filed under a placeholder URI). If several labels resolve to
25 |    * the same URI, only one of them is kept, since the URI is the map key.
26 |    *
27 |    * @param entityLabels entity labels to resolve
28 |    * @param config       passed on to `IndexUtils.getIndexSearcher`
29 |    * @return a map from each resolved URI to a label that produced it
30 |    */
31 |   def linkEntities(entityLabels: Set[String], config: Config): Map[URI, String] =
32 |     entityLabels.iterator
33 |       .flatMap(label => getURI(label, config).map(uri => uri -> label))
34 |       .toMap
35 |
36 |   /**
37 |    * Resolves a single label: an exact match on `labelex` is tried first, and the
38 |    * looser per-word query on `label` only runs when the exact one finds nothing.
39 |    */
40 |   private def getURI(label: String, config: Config): Option[URI] = {
41 |     val searcher = IndexUtils.getIndexSearcher(TRankIndexType.URI_INDEX, config)
42 |     // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
43 |     val lowered = label.toLowerCase(java.util.Locale.ROOT)
44 |     // orElse takes its argument by name, so no second search runs after an exact hit.
45 |     exactQuery(lowered, searcher).orElse(boolQuery(lowered, searcher))
46 |   }
47 |
48 |   /** Best hit for `label` (already lower-cased) as one term of the `labelex` field. */
49 |   private def exactQuery(label: String, searcher: IndexSearcher): Option[URI] =
50 |     top1(new TermQuery(new Term("labelex", label)), searcher)
51 |
52 |   /** Best hit of an OR query over the space-separated words of the lower-cased label. */
53 |   private def boolQuery(label: String, searcher: IndexSearcher): Option[URI] = {
54 |     val query = new BooleanQuery
55 |     label.split(" ").foreach { term =>
56 |       query.add(new BooleanClause(new TermQuery(new Term("label", term)), Occur.SHOULD))
57 |     }
58 |     top1(query, searcher)
59 |   }
60 |
61 |   /**
62 |    * Runs `query` and returns the `uri` field of the top-scoring document, or
63 |    * `None` when nothing matches or that document has no `uri` field.
64 |    */
65 |   private def top1(query: Query, searcher: IndexSearcher): Option[URI] =
66 |     searcher.search(query, 1).scoreDocs.headOption.flatMap { hit =>
67 |       // Document.get returns null for an absent field; Option(...) turns that into None.
68 |       Option(searcher.doc(hit.doc).get("uri")).map(new URI(_))
69 |     }
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/io/mem0r1es/trank/pipeline/NER.scala:
--------------------------------------------------------------------------------
1 | package io.mem0r1es.trank.pipeline
2 |
3 | import edu.stanford.nlp.ie.crf.CRFClassifier
4 | import java.util.Properties
5 |
6 | object NER {
7 |
8 | private val props = new Properties()
9 | props.put("annotators", "tokenize")
10 | private val classifier = CRFClassifier.getClassifierNoExceptions(
11 | "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
12 |
13 | /**
14 | * Runs the Stanford Named Entity Recognizer on the given content,
15 | * returning a Set with all the entity labels.
16 | */
17 | def runNER(content: String): Set[String] = {
18 | val annotatedContent = classifier.classifyWithInlineXML(content)
19 | extractEntities(annotatedContent)
20 | }
21 |
22 | private def extractEntities(content: String): Set[String] = {
23 | extractSingleType(content, "