├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt └── src ├── main ├── resources │ └── reference.conf └── scala │ └── io │ └── mem0r1es │ └── trank │ ├── TRanker.scala │ ├── pipeline │ ├── EntityLinking.scala │ ├── NER.scala │ ├── PreProcessor.scala │ ├── TypeRanking.scala │ └── TypeRetrieval.scala │ ├── ranking │ ├── ANCESTORS.scala │ ├── ANC_DEPTH.scala │ ├── DEPTH.scala │ ├── HierInfo.scala │ └── RankingAlgo.scala │ └── util │ ├── IndexUtils.scala │ └── TRankIndexType.scala └── test ├── resources ├── exascale.info.html └── exascale.info.txt └── scala └── io └── mem0r1es └── trank ├── pipeline ├── NERSpec.scala └── PreProcessorSpec.scala └── ranking ├── ANCESTORSSpec.scala ├── ANC_DEPTHSpec.scala └── DEPTHSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # sbt specific 2 | dist/* 3 | target/ 4 | lib_managed/ 5 | src_managed/ 6 | project/boot/ 7 | project/plugins/project/ 8 | 9 | # IntelliJ 10 | .idea/ 11 | 12 | # Mac OS X 13 | .DS_Store 14 | 15 | # TRank specific 16 | trank-indexes/ 17 | *.log 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 4 | - 2.10.5 5 | jdk: 6 | - oraclejdk8 7 | - oraclejdk7 8 | - openjdk7 9 | 10 | script: sbt ++$TRAVIS_SCALA_VERSION test 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 
36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [2015] [eXascale Infolab] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 
192 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | TRank [![Build Status](https://travis-ci.org/XI-lab/TRank.png?branch=master)](https://travis-ci.org/XI-lab/TRank) 2 | ===== 3 | TRank implements a Scala pipeline for: 4 | * boilerplate removal on markup content 5 | * Named Entity Recognition 6 | * Entity linking to *DBpedia* URIs 7 | * Entity typing using a novel type hierarchy that combines *DBpedia*, *Yago*, and *schema.org* classes 8 | * Type ranking based on algorithms that underwent thorough evaluation via crowdsourcing 9 | 10 |
11 | For example, a document containing the label *University of Fribourg* will return (scores omitted for brevity): 12 | ```scala 13 | http://dbpedia.org/resource/University_of_Fribourg -> 14 | 15 | Seq(http://dbpedia.org/class/yago/UniversitiesInSwitzerland, 16 | http://dbpedia.org/class/yago/PublicUniversities, 17 | http://schema.org/CollegeOrUniversity, 18 | http://dbpedia.org/ontology/University, 19 | http://dbpedia.org/ontology/EducationalInstitution, 20 | http://schema.org/EducationalOrganization, 21 | http://dbpedia.org/ontology/Organisation, 22 | http://schema.org/Organization, 23 | http://dbpedia.org/ontology/Agent) 24 | ``` 25 |
26 | How To Use TRank 27 | ---------------- 28 | ### API 29 | To use TRank, simply create a TRanker object from any textual content: 30 | ```scala 31 | class TRanker(content: String) 32 | ``` 33 | optionally specifying an alternative ranking algorithm instead of the default ANCESTORS: 34 | ```scala 35 | class TRanker(content: String, rankingAlgo: RankingAlgo) 36 | 37 | trait RankingAlgo { def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] } 38 | ``` 39 | 40 | The results of the whole pipeline are accessible through: 41 | ```scala 42 | TRanker.entityToTRankedTypes: Map[URI, Seq[(URI, Double)]] 43 | ``` 44 | for the final step, and through similar data structures for all the intermediate steps. 45 |
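A minimal usage sketch (the snippet below is illustrative and assumes the Lucene indexes described in the next section are already in place):
```scala
import io.mem0r1es.trank.TRanker

// Run the full pipeline over a plain-text snippet.
val tranker = new TRanker("The University of Fribourg is located in Switzerland.")

// For each linked entity, its types ranked by score (highest first).
for ((entity, rankedTypes) <- tranker.entityToTRankedTypes) {
  println(s"Entity: $entity")
  rankedTypes.foreach { case (typeURI, score) => println(s"  $typeURI ($score)") }
}
```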
46 | ### Indexes 47 | TRank requires 3 Lucene indexes that are available for 48 | [download here](http://trank.exascale.info/downloads/trank-indexes.tgz). 49 | The .tgz can be extracted into the classpath of the library, and TRank will seamlessly use the 3 indexes. 50 | 51 | **IMPORTANT:** do not change the directory structure of `trank-indexes/`. 52 | 53 |
54 | Alternatively, TRank uses the [Typesafe Configuration](https://github.com/typesafehub/config) library to manage user 55 | settings. To override the default path to the indexes, define the `TRank.index_basepath` property. 56 | 57 |
58 | Background 59 | ---------- 60 | Much of Web search and browsing activity is today centered around entities. For this reason, Search Engine Result 61 | Pages (SERPs) increasingly contain information about the searched entities, such as pictures, short summaries, 62 | related entities, and factual information. A key facet that is often displayed on the SERPs and that is instrumental 63 | for many applications is the entity type. However, an entity is usually not associated with a single generic type 64 | in the background knowledge bases but rather with a set of more specific types, which may or may not be relevant given the 65 | document context. For example, one can find on the Linked Open Data cloud the fact that Tom Hanks is a person, an actor, 66 | and a person from Concord, California. All these types are correct, but some may be too general to be interesting (e.g., 67 | person), while others may be interesting but already known to the user (e.g., actor), or may be irrelevant given the 68 | current browsing context (e.g., person from Concord, California). In the associated ISWC 2013 paper, we define the new task of ranking entity 69 | types given an entity and its context. We propose and evaluate new methods to find the most relevant entity type based on 70 | collection statistics and on the graph structure interconnecting entities and types. An extensive experimental evaluation 71 | over several document collections at different levels of granularity (e.g., sentences, paragraphs, etc.) and different 72 | type hierarchies (including DBpedia, Freebase, and schema.org) shows that hierarchy-based approaches provide more accurate 73 | results when picking entity types to be displayed to the end-user. 74 | 75 |
76 | For more information, check the [ISWC2013 paper](https://exascale.info/assets/pdf/entityTypes.pdf). 77 | 78 |
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "io.mem0r1es" 2 | 3 | name := "TRank" 4 | 5 | version := "1.0" 6 | 7 | scalaVersion := "2.11.6" 8 | 9 | scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature") 10 | 11 | 12 | // -------------------- 13 | // --- Dependencies --- 14 | // -------------------- 15 | 16 | resolvers += "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/" 17 | 18 | // CoreNLP + resources 19 | libraryDependencies ++= Seq( 20 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1", 21 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" classifier "models" 22 | ) 23 | 24 | // Lucene deps 25 | libraryDependencies ++= Seq( 26 | "org.apache.lucene" % "lucene-core" % "4.10.4", 27 | "org.apache.lucene" % "lucene-analyzers-common" % "4.10.4", 28 | "org.apache.lucene" % "lucene-queries" % "4.10.4" 29 | ) 30 | 31 | // Misc 32 | libraryDependencies ++= Seq( 33 | "org.apache.tika" % "tika-core" % "1.7", 34 | "org.apache.tika" % "tika-parsers" % "1.7", 35 | "commons-io" % "commons-io" % "2.4", 36 | "com.typesafe" % "config" % "1.2.1", 37 | "com.typesafe.play" %% "play-json" % "2.3.8", 38 | "org.scalatest" %% "scalatest" % "2.2.1" % "test" 39 | ) 40 | 41 |
-------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eXascaleInfolab/TRank/4f69dc9dfc8f60b53ce402382c654f6d7aabe827/project/plugins.sbt -------------------------------------------------------------------------------- /src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | TRank { 2 | index_basepath = "trank-indexes/" 3 | } -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/TRanker.scala: 
-------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank 2 | 3 | import com.typesafe.config.Config 4 | import com.typesafe.config.ConfigFactory 5 | import io.mem0r1es.trank.pipeline.EntityLinking.linkEntities 6 | import io.mem0r1es.trank.pipeline.NER.runNER 7 | import io.mem0r1es.trank.pipeline.PreProcessor.preProcess 8 | import io.mem0r1es.trank.pipeline.TypeRanking.rankTypes 9 | import io.mem0r1es.trank.pipeline.TypeRetrieval.retrieveTypes 10 | import io.mem0r1es.trank.ranking.ANCESTORS 11 | import io.mem0r1es.trank.ranking.RankingAlgo 12 | import java.io.InputStream 13 | import scala.io.Source 14 | import java.io.ByteArrayInputStream 15 | 16 | class TRanker(content: InputStream, rankingAlgo: RankingAlgo, config: Config) { 17 | 18 | config.checkValid(ConfigFactory.defaultReference(), "TRank") 19 | 20 | /** 21 | * Default to standard config. 22 | */ 23 | def this(content: InputStream, rankingAlgo: RankingAlgo) { 24 | this(content, rankingAlgo, ConfigFactory.load()) 25 | } 26 | 27 | /** 28 | * Default to ANCESTORS ranking algorithm, and standard config. 29 | */ 30 | def this(content: InputStream) { 31 | this(content, new ANCESTORS, ConfigFactory.load()) 32 | } 33 | 34 | /** 35 | * Default to standard config. 36 | */ 37 | def this(contentStr: String, rankingAlgo: RankingAlgo) { 38 | this(new ByteArrayInputStream(contentStr.getBytes()), 39 | rankingAlgo, 40 | ConfigFactory.load()) 41 | } 42 | 43 | /** 44 | * Default to ANCESTORS ranking algorithm, and standard config. 45 | */ 46 | def this(contentStr: String) { 47 | this(new ByteArrayInputStream(contentStr.getBytes()), 48 | new ANCESTORS, 49 | ConfigFactory.load()) 50 | } 51 | 52 | 53 | val contentRaw = content 54 | 55 | // TRank pipeline steps 56 | val contentPreProcessed = preProcess(content) 57 | private val entityLabels = runNER(contentPreProcessed) 58 | val entityToLabel = linkEntities(entityLabels, config) 59 | val entityURIs = entityToLabel.keySet 60 | val entityToTypes = retrieveTypes(entityURIs, config) 61 | val entityToTRankedTypes = rankTypes(entityToTypes, rankingAlgo, config) 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/EntityLinking.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import org.apache.lucene.index.Term 6 | import org.apache.lucene.search.BooleanClause 7 | import org.apache.lucene.search.BooleanClause.Occur 8 | import org.apache.lucene.search.BooleanQuery 9 | import org.apache.lucene.search.IndexSearcher 10 | import org.apache.lucene.search.Query 11 | import org.apache.lucene.search.TermQuery 12 | 13 | import com.typesafe.config.Config 14 | 15 | import io.mem0r1es.trank.util.IndexUtils 16 | import io.mem0r1es.trank.util.TRankIndexType 17 | 18 | object EntityLinking { 19 | 20 | /** 21 | * Links Named Entity labels to DBpedia URIs. 
22 | */ 23 | def linkEntities(entityLabels: Set[String], config: Config): Map[URI, String] = { 24 | var entities = Map[URI, String]() 25 | 26 | entityLabels.foreach { label => 27 | val uri = getURI(label, config) 28 | entities += uri -> label 29 | } 30 | entities.toMap 31 | } 32 | 33 | private def getURI(label: String, config: Config): URI = { 34 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.URI_INDEX, config) 35 | val exact = exactQuery(label, searcher) 36 | val bool = boolQuery(label, searcher) 37 | 38 | exact match { 39 | case Some(x) => return x 40 | case None => 41 | } 42 | bool match { 43 | case Some(x) => return x 44 | case _ => return new URI("") 45 | } 46 | } 47 | 48 | private def exactQuery(label: String, searcher: IndexSearcher): Option[URI] = { 49 | val query = new TermQuery(new Term("labelex", label.toLowerCase())) 50 | top1(query, searcher) 51 | } 52 | 53 | private def boolQuery(label: String, searcher: IndexSearcher): Option[URI] = { 54 | val query = new BooleanQuery 55 | label.toLowerCase().split(" ").foreach { term => 56 | query.add(new BooleanClause(new TermQuery(new Term("label", term)), Occur.SHOULD)) 57 | } 58 | top1(query, searcher) 59 | } 60 | 61 | private def top1(query: Query, searcher: IndexSearcher): Option[URI] = { 62 | val docs = searcher.search(query, 1) 63 | if (docs.scoreDocs.length > 0) { 64 | val d = searcher.doc(docs.scoreDocs(0).doc) 65 | Option(new URI(d.get("uri"))) 66 | } else { 67 | None 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/NER.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import edu.stanford.nlp.ie.crf.CRFClassifier 4 | import java.util.Properties 5 | 6 | object NER { 7 | 8 | private val props = new Properties() 9 | props.put("annotators", "tokenize") 10 | private val classifier = CRFClassifier.getClassifierNoExceptions( 11 | "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz") 12 | 13 | /** 14 | * Runs the Stanford Named Entity Reconizer on the given content, 15 | * returning a Set with all the entity labels. 
16 | */ 17 | def runNER(content: String): Set[String] = { 18 | val annotatedContent = classifier.classifyWithInlineXML(content) 19 | extractEntities(annotatedContent) 20 | } 21 | 22 | private def extractEntities(content: String): Set[String] = { 23 | extractSingleType(content, "<PERSON>", "</PERSON>") ++ 24 | extractSingleType(content, "<LOCATION>", "</LOCATION>") ++ 25 | extractSingleType(content, "<ORGANIZATION>", "</ORGANIZATION>") 26 | } 27 | 28 | private def extractSingleType(content: String, openTag: String, closeTag: String): Set[String] = { 29 | var entities = Set[String]() 30 | 31 | val fragments = content.split(openTag) 32 | fragments.slice(1, fragments.length).foreach { fragment => 33 | val label = fragment.split(closeTag)(0) 34 | entities += label 35 | } 36 | 37 | entities.toSet 38 | } 39 | } 40 |
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/PreProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import org.apache.tika.sax.BodyContentHandler 4 | import org.apache.tika.metadata.Metadata 5 | import org.apache.tika.parser.html.HtmlParser 6 | import java.io.InputStream 7 | import org.apache.tika.parser.ParseContext 8 | 9 | 10 | object PreProcessor { 11 | 12 | /** 13 | * Runs the content pre-processing step (e.g., HTML tags removal) 14 | */ 15 | def preProcess(content: InputStream): String = { 16 | extractTextFromHTML(content) 17 | } 18 | 19 | private def extractTextFromHTML(content: InputStream): String = { 20 | val handler = new BodyContentHandler() 21 | val metadata = new Metadata() 22 | new HtmlParser().parse(content, handler, metadata, new ParseContext()) 23 | 24 | handler.toString 25 | } 26 | } 27 |
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/TypeRanking.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import play.api.libs.json.{JsValue, Json} 6 | 7 | import org.apache.lucene.index.Term 8 | import org.apache.lucene.search.TermQuery 9 | 10 | import com.typesafe.config.Config 11 | 12 | import io.mem0r1es.trank.ranking.HierInfo 13 | import io.mem0r1es.trank.ranking.RankingAlgo 14 | import io.mem0r1es.trank.util.IndexUtils 15 | import io.mem0r1es.trank.util.TRankIndexType 16 | 17 | object TypeRanking { 18 | 19 | def rankTypes(entityTypes: Map[URI, Set[URI]], 20 | rankingAlgo: RankingAlgo, 21 | config: Config): Map[URI, Seq[(URI, Double)]] = { 22 | 23 | var ranked = Map[URI, Seq[(URI, Double)]]() 24 | 25 | entityTypes.foreach { 26 | case (uri, types) => 27 | ranked += uri -> rankingAlgo.rank(types.map { t => t -> queryHier(t, config) }.toMap) 28 | } 29 | ranked 30 | } 31 | 32 | private def queryHier(typeURI: URI, config: Config): HierInfo = { 33 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.PATH_INDEX, config) 34 | 35 | val query = new TermQuery(new Term("uri", typeURI.toString)) 36 | val docs = searcher.search(query, 1) 37 | if (docs.scoreDocs.length > 0) { 38 | val d = searcher.doc(docs.scoreDocs(0).doc) 39 | val level = d.get("level").toInt 40 | val path: JsValue = Json.parse(d.get("path")) 41 | path.asOpt[Array[String]] match { 42 | case Some(l: Array[String]) => new HierInfo(level, l.map(t => new URI(t.toString))) 43 | case _ => new HierInfo(level, Seq[URI]()) 44 | } 45 | } else { 46 | new HierInfo(-1, Seq[URI]()) 47 | } 48 | } 49 | } 50 |
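Because `rankTypes` accepts any `RankingAlgo`, alternative ranking strategies can be plugged into the pipeline by implementing the trait. A minimal sketch, where the PATH_LENGTH strategy is hypothetical and not part of TRank:
```scala
package io.mem0r1es.trank.ranking

import java.net.URI

/**
 * Hypothetical example: rank types by the length of their hierarchy path,
 * i.e., prefer types that have the most ancestors overall.
 */
class PATH_LENGTH extends RankingAlgo {

  override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = {
    entityTypes.toSeq
      .map { case (typeURI, hier) => (typeURI, hier.path.length.toDouble) }
      .sortBy(_._2)
      .reverse
  }
}
```
Such an implementation can then be handed to the pipeline as `new TRanker(content, new PATH_LENGTH)`.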
-------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/pipeline/TypeRetrieval.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import java.net.URI 4 | 5 | import scala.Array.canBuildFrom 6 | 7 | import org.apache.lucene.index.Term 8 | import org.apache.lucene.search.TermQuery 9 | 10 | import com.typesafe.config.Config 11 | 12 | import io.mem0r1es.trank.util.IndexUtils 13 | import io.mem0r1es.trank.util.TRankIndexType 14 | 15 | object TypeRetrieval { 16 | 17 | /** 18 | * Given a DBpedia resource URI, retrieve all its RDF types. 19 | */ 20 | def retrieveTypes(entities: Set[URI], config: Config): Map[URI, Set[URI]] = { 21 | var typedEntities = Map[URI, Set[URI]]() 22 | 23 | entities.foreach { entity => 24 | val types = getTypes(entity, config) 25 | typedEntities += entity -> types 26 | } 27 | typedEntities 28 | } 29 | 30 | private def getTypes(entity: URI, config: Config): Set[URI] = { 31 | val searcher = IndexUtils.getIndexSearcher(TRankIndexType.TYPE_INDEX, config) 32 | 33 | val query = new TermQuery(new Term("uri", entity.toString)) 34 | val docs = searcher.search(query, 1) 35 | if (docs.scoreDocs.length > 0) { 36 | val d = searcher.doc(docs.scoreDocs(0).doc) 37 | d.getValues("type").map(new URI(_)).toSet 38 | } else { 39 | Set[URI]() 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/ANCESTORS.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class ANCESTORS extends RankingAlgo { 6 | 7 | /** 8 | * Rank types by inverse-sort on the # of ANCESTORS contained in the type set. 9 | */ 10 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 11 | 12 | def score(path: Seq[URI]): Double = { 13 | path.filter (entityTypes.contains(_)) 14 | .length 15 | } 16 | 17 | entityTypes.map { 18 | case (k, v) => (k, score(v.path)) 19 | } 20 | .toSeq 21 | .sortBy(_._2) 22 | .reverse 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/ANC_DEPTH.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class ANC_DEPTH extends RankingAlgo { 6 | 7 | /** 8 | * Rank types by inverse-sort on the # of ANCESTORS contained in the type set, 9 | * weighted by their DEPTHS. 10 | */ 11 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 12 | 13 | def score(hier: HierInfo): Double = { 14 | hier.path.filter (entityTypes.contains(_)) 15 | .map { case uri => entityTypes(uri).level.toDouble } 16 | .sum 17 | } 18 | 19 | entityTypes.map { 20 | case (k, v) => (k, score(v)) 21 | }.toSeq.sortBy(_._2).reverse 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/DEPTH.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import scala.collection.Seq 6 | import scala.collection.immutable.Map 7 | 8 | class DEPTH extends RankingAlgo { 9 | 10 | /** 11 | * Rank types by inverse-sort on the hierarchy level. 
12 | */ 13 | override def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] = { 14 | entityTypes.toSeq.map { case (k,v) => (k, v.level.toDouble)} 15 | .sortBy(_._2) 16 | .reverse 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/HierInfo.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | class HierInfo(val level: Int, val path: Seq[URI]) 6 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/ranking/RankingAlgo.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | trait RankingAlgo { 6 | 7 | def rank(entityTypes: Map[URI, HierInfo]): Seq[(URI, Double)] 8 | } -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/util/IndexUtils.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.util 2 | 3 | import java.io.File 4 | 5 | import com.typesafe.config.Config 6 | import org.apache.lucene.index.DirectoryReader 7 | import org.apache.lucene.search.IndexSearcher 8 | import org.apache.lucene.store.NIOFSDirectory 9 | 10 | 11 | object IndexUtils { 12 | 13 | import TRankIndexType._ 14 | private var searcherCache = Map[TRankIndexType, IndexSearcher]() 15 | 16 | def getIndexSearcher(indexType: TRankIndexType, config: Config): IndexSearcher = { 17 | val searcher = searcherCache.get(indexType) 18 | 19 | searcher match { 20 | case Some(value) => value 21 | case None => { 22 | val value = createIndexSearcher(indexType, config) 23 | searcherCache += indexType -> value 24 | value 25 | } 26 | } 27 | } 28 | 29 | private def createIndexSearcher(indexType: TRankIndexType, config: Config): IndexSearcher = { 30 | val indexPath = new File(config.getString("TRank.index_basepath") + "/" + indexType) 31 | val directory = new NIOFSDirectory(indexPath) 32 | val reader = DirectoryReader.open(directory) 33 | new IndexSearcher(reader) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/io/mem0r1es/trank/util/TRankIndexType.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.util 2 | 3 | object TRankIndexType extends Enumeration { 4 | type TRankIndexType = Value 5 | 6 | val URI_INDEX = Value("uriindex") 7 | val TYPE_INDEX = Value("typeindex") 8 | val PATH_INDEX = Value("pathindex") 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/exascale.info.html: -------------------------------------------------------------------------------- 1 |
2 | XI--the eXascale Infolab--is a new research group at the University of Fribourg, Switzerland.
We are designing, building and deploying next-generation infrastructures for Big Data, with a focus on social, scientific, and linked data. 3 |
4 | -------------------------------------------------------------------------------- /src/test/resources/exascale.info.txt: -------------------------------------------------------------------------------- 1 | XI--the eXascale Infolab--is a new research group at the University of Fribourg, Switzerland. 2 | We are designing, building and deploying next-generation infrastructures for Big Data, with a focus on social, scientific, and linked data. 3 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/pipeline/NERSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import io.mem0r1es.trank.pipeline.NER._ 4 | import org.scalatest.FlatSpec 5 | import scala.io.Source 6 | 7 | class NERSpec extends FlatSpec { 8 | 9 | "A NER" should "extract entity labels" in { 10 | val content = Source.fromFile("src/test/resources/exascale.info.txt").mkString 11 | val entities = runNER(content) 12 | assert(entities contains ("Switzerland")) 13 | assert(entities contains ("University of Fribourg")) 14 | } 15 | 16 | it should "not fail with content without Named Entities" in { 17 | val content = "Just some basic text without any named entities." 18 | val entities = runNER(content) 19 | assert(entities.isEmpty) 20 | } 21 | 22 | it should "not fail with empty content" in { 23 | val entities = runNER("") 24 | assert(entities.isEmpty) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/pipeline/PreProcessorSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.pipeline 2 | 3 | import io.mem0r1es.trank.pipeline.PreProcessor._ 4 | import org.scalatest.FlatSpec 5 | import scala.io.Source 6 | import java.io.ByteArrayInputStream 7 | import java.io.FileInputStream 8 | 9 | class PreProcessorSpec extends FlatSpec { 10 | 11 | val htmlStr = Source.fromFile("src/test/resources/exascale.info.html").mkString 12 | val txtStr = Source.fromFile("src/test/resources/exascale.info.txt").mkString 13 | 14 | "A PreProcessor" should "remove boilerplate from HTML content" in { 15 | val html = new FileInputStream("src/test/resources/exascale.info.html") 16 | assert(preProcess(html).trim === txtStr.trim) 17 | } 18 | 19 | it should "leave intact textual content" in { 20 | val txt = new FileInputStream("src/test/resources/exascale.info.txt") 21 | assert(preProcess(txt).trim === txtStr.trim) 22 | } 23 | 24 | it should "not fail with empty content" in { 25 | assert(preProcess(new ByteArrayInputStream("".getBytes())) === "") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/ANCESTORSSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class ANCESTORSSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2")))) 11 | val type2 = (new URI("http://type2"), 12 | new HierInfo(5, Seq[URI](new URI("http://type1"), new URI("http://path3")))) 13 | val type3 = (new URI("http://type3"), 14 | new HierInfo(4, Seq[URI](new URI("http://type1"), new URI("http://type2")))) 15 | 16 | "An ANCESTORS ranker" should "rank types properly" in { 17 | val 
ranked = new ANCESTORS().rank(Map(type1, type2, type3)) 18 | assert(ranked(0)._1.toString === "http://type3") 19 | assert(ranked(0)._2 === 2) 20 | } 21 | 22 | it should "not fail when no types are provided" in { 23 | new ANCESTORS().rank(Map[URI, HierInfo]()) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/ANC_DEPTHSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class ANC_DEPTHSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2")))) 11 | val type2 = (new URI("http://type2"), 12 | new HierInfo(5, Seq[URI](new URI("http://type1"), new URI("http://type3")))) 13 | val type3 = (new URI("http://type3"), 14 | new HierInfo(4, Seq[URI](new URI("http://type2"), new URI("http://path3")))) 15 | 16 | "An ANC_DEPTH ranker" should "rank types properly" in { 17 | val ranked = new ANC_DEPTH().rank(Map(type1, type2, type3)) 18 | assert(ranked(0)._1.toString === "http://type2") 19 | assert(ranked(0)._2 === 6) 20 | } 21 | 22 | it should "not fail when no types are provided" in { 23 | new ANC_DEPTH().rank(Map[URI, HierInfo]()) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/io/mem0r1es/trank/ranking/DEPTHSpec.scala: -------------------------------------------------------------------------------- 1 | package io.mem0r1es.trank.ranking 2 | 3 | import java.net.URI 4 | 5 | import org.scalatest.FlatSpec 6 | 7 | class DEPTHSpec extends FlatSpec { 8 | 9 | val type1 = (new URI("http://type1"), 10 | new HierInfo(2, Seq[URI](new URI("http://path1"), new URI("http://path2"))) 11 | ) 12 | val type2 = (new URI("http://type2"), 13 | new HierInfo(4, Seq[URI](new URI("http://path2"), new URI("http://path3"))) 14 | ) 15 | 16 | 17 | "A DEPTH ranker" should "rank types by maximum depth" in { 18 | val ranked = new DEPTH().rank(Map(type1, type2)) 19 | assert(ranked(0)._1.toString === "http://type2") 20 | assert(ranked(0)._2 === 4) 21 | } 22 | 23 | it should "not fail when no types are provided" in { 24 | new DEPTH().rank(Map[URI, HierInfo]()) 25 | } 26 | } 27 | --------------------------------------------------------------------------------
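As an end-to-end illustration, a full-pipeline spec could look like the sketch below; it is not part of the test suite and assumes the `trank-indexes/` directory (see the README) is available on the test classpath:
```scala
package io.mem0r1es.trank

import org.scalatest.FlatSpec
import scala.io.Source

class TRankerSpec extends FlatSpec {

  "A TRanker" should "rank types for the entities found in a document" in {
    val content = Source.fromFile("src/test/resources/exascale.info.txt").mkString
    val tranker = new TRanker(content)

    // Each linked entity should come with a score-ordered (descending) type list.
    assert(tranker.entityToTRankedTypes.nonEmpty)
    tranker.entityToTRankedTypes.values.foreach { rankedTypes =>
      assert(rankedTypes.map(_._2) === rankedTypes.map(_._2).sorted.reverse)
    }
  }
}
```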