├── requirements.txt ├── TODO.md ├── arxiv.sh ├── LICENSE ├── README.md ├── base.ttl ├── .gitignore ├── arxiv.py └── arxiv.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | icecream >= 2.1 2 | kglab 3 | matplotlib >= 3.4 4 | numpy >= 1.20 5 | pandas >= 1.4 6 | pytextrank >= 3.1 7 | python-dateutil >= 2.8 8 | rdflib 9 | spacy >= 3.2 10 | tqdm >= 4.63 11 | typer >= 0.4 12 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ## PR 2 | * 3 | * "graph database" 4 | 5 | ## OpenAIRE 6 | * 7 | 8 | ## richcontext.scholapi 9 | * 10 | 11 | ## time series, predictive trend lines 12 | * 13 | 14 | ## prophy.science 15 | * 16 | -------------------------------------------------------------------------------- /arxiv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eux 3 | 4 | # default lookback: two weeks ago (BSD/macOS `date -v`, otherwise GNU `date -d`) 5 | LOOKBACK=$(date -v-2w +%Y-%m-%d 2>/dev/null || date -d '2 weeks ago' +%Y-%m-%d) 6 | 7 | while [[ $# -gt 0 ]]; do 8 | key="$1" 9 | 10 | case $key in 11 | -l|--lookback) 12 | LOOKBACK="$2" 13 | shift # past argument 14 | shift # past value 15 | ;; 16 | *) 17 | # reject anything else: an argument that is never shifted would loop forever 18 | echo "arxiv.sh: unknown argument: $key" >&2 19 | exit 2 20 | ;; 21 | esac 22 | done 23 | 24 | # query arXiv 25 | 26 | PHRASES=( "graph algorithms" "graph neural networks" "knowledge graph" ) 27 | TTL_FILE=/opt/derwen/chwedl/trends/arxiv.ttl 28 | 29 | for QUERY in "${PHRASES[@]}" 30 | do 31 | python3 arxiv.py cmd-query --min-date="$LOOKBACK" --kg-path "$TTL_FILE" "$QUERY" 32 | done 33 | 34 | # analyze trends 35 | 36 | CSV_FILE=/opt/derwen/chwedl/trends/arxiv.csv 37 | PNG_FILE=/opt/derwen/chwedl/trends/arxiv.png 38 | 39 | python3 arxiv.py cmd-analyze --kg-path "$TTL_FILE" --csv-file "$CSV_FILE" 40 | python3 arxiv.py cmd-visualize --csv-file "$CSV_FILE" --png-file "$PNG_FILE" 41 | 42 | # extract phrases 43 | 44 | TODAY=$(date +%Y%m%d) 45 | KPA_FILE=/opt/derwen/chwedl/trends/phrases.$TODAY.csv 46 | 47 | python3 arxiv.py cmd-extract --min-date="$LOOKBACK" --kg-path "$TTL_FILE" --kpa-file "$KPA_FILE" 48 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2022 derwen.ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arxiv-trends 2 | 3 | Analyze trends among articles published on [arXiv](https://arxiv.org/help/api) 4 | 5 | 6 | ## Install 7 | 8 | ```bash 9 | python3 -m venv venv 10 | source venv/bin/activate 11 | 12 | python3 -m pip install -U pip 13 | python3 -m pip install -r requirements.txt 14 | python3 -m spacy download en_core_web_sm 15 | cp base.ttl arxiv.ttl 16 | ``` 17 | 18 | 19 | ## Usage 20 | 21 | ```bash 22 | python3 arxiv.py cmd-query --min-date=2021-01-01 "knowledge graph" 23 | ``` 24 | 25 | ```bash 26 | python3 arxiv.py cmd-analyze 27 | python3 arxiv.py cmd-visualize 28 | ``` 29 | 30 | ```bash 31 | python3 arxiv.py cmd-extract 32 | ``` 33 | 34 | 35 | ## License and Copyright 36 | 37 | Source code for **arxiv-trends** plus its logo, documentation, and 38 | examples have an [MIT license](https://spdx.org/licenses/MIT.html) 39 | which is succinct and simplifies use in commercial applications. 40 | 41 | 42 | ## Kudos 43 | 44 | Kudos to arXiv for use of its open access interoperability; 45 | to Jürgen Müller @ BASF for the original idea; 46 | plus general support from [Derwen, Inc.](https://derwen.ai/); 47 | the [Knowledge Graph Conference](https://www.knowledgegraph.tech/) 48 | and [Connected Data World](https://connected-data.world/). 49 | -------------------------------------------------------------------------------- /base.ttl: -------------------------------------------------------------------------------- 1 | @prefix bibo: <http://purl.org/ontology/bibo/> . 2 | @prefix dct: <http://purl.org/dc/terms/> . 3 | @prefix derw: <https://derwen.ai/ns/v1#> . 4 | @prefix foaf: <http://xmlns.com/foaf/0.1/> . 5 | @prefix lcsh: <http://id.loc.gov/authorities/subjects/> . 6 | @prefix madsrdf: <http://www.loc.gov/mads/rdf/v1#> . 7 | @prefix skos: <http://www.w3.org/2004/02/skos/core#> . 8 | @prefix wd: <http://www.wikidata.org/entity/> . 9 | @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . 
10 | 11 | 12 | derw:topic_Graph_Algorithms a derw:Topic ; 13 | skos:broader ; 14 | skos:closeMatch lcsh:sh2002004605, 15 | ; 16 | skos:definition "A family of algorithms that operate on graphs for network analysis, measurement, ranking, partitioning, and other methods that leverage graph theory."@en ; 17 | skos:prefLabel "graph algorithms"@en . 18 | 19 | derw:topic_Graph_Database a derw:Topic ; 20 | skos:broader ; 21 | skos:prefLabel "graph database"@en . 22 | 23 | derw:topic_Graph_Embedding a derw:Topic ; 24 | skos:broader ; 25 | skos:prefLabel "graph embedding"@en . 26 | 27 | derw:topic_Graph_Neural_Networks a derw:Topic ; 28 | skos:broader ; 29 | skos:prefLabel "graph neural networks"@en . 30 | 31 | derw:topic_Knowledge_Graph a derw:Topic ; 32 | skos:altLabel "KG"@en ; 33 | skos:broader ; 34 | skos:closeMatch wd:Q33002955, 35 | ; 36 | skos:definition "A knowledge base that uses a graph-structured data model, representing and annotating interlinked descriptions of entities, with an overlay of semantic metadata."@en ; 37 | skos:prefLabel "knowledge graph"@en . 38 | 39 | derw:Topic a skos:Concept , 40 | madsrdf:Topic , 41 | madsrdf:Authority ; 42 | skos:prefLabel "Topic"@en ; 43 | dct:identifier wd:Q1969448 ; 44 | skos:definition "Subject heading used for classifying content and navigating discovery within it."@en . 45 | 46 | derw:Author a skos:Concept, 47 | foaf:Agent ; 48 | dct:identifier wd:Q482980 ; 49 | skos:definition "An author of a publication."@en ; 50 | skos:prefLabel "Author"@en ; 51 | skos:topConceptOf derw:Derwen_Vocabulary . 
52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | arxiv.csv 2 | arxiv.png 3 | arxiv.ttl 4 | phrases.csv 5 | *~ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /arxiv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # see license https://github.com/DerwenAI/arxiv-trends#license-and-copyright 4 | 5 | """ 6 | arxiv-trends: 7 | 8 | Analyze trends in articles published on *arXiv* using NLP, knowledge graph, and time-series. 
9 | """ 10 | 11 | from collections import defaultdict 12 | import itertools 13 | import pathlib 14 | import re 15 | import sys 16 | import typing 17 | import unicodedata 18 | import urllib 19 | import urllib.parse 20 | import urllib.request 21 | import xml.etree.ElementTree as et 22 | 23 | from icecream import ic # type: ignore # pylint: disable=E0401 24 | from tqdm import tqdm 25 | import dateutil.tz 26 | import numpy as np 27 | import matplotlib.pyplot as plt # type: ignore # pylint: disable=E0401 28 | import pandas as pd # type: ignore # pylint: disable=E0401 29 | import rdflib # type: ignore # pylint: disable=E0401 30 | import spacy 31 | import typer 32 | 33 | import pytextrank # type: ignore # pylint: disable=E0401 34 | import kglab # type: ignore # pylint: disable=E0401 35 | 36 | 37 | APP = typer.Typer() 38 | 39 | 40 | ###################################################################### 41 | ## utility functions 42 | 43 | def strip_accents ( 44 | text: str, 45 | ) -> str: 46 | """ 47 | Strip accents from the input string. 48 | 49 | Decomposes the text (Unicode NFD), then drops every character that is not ASCII. 50 | 51 | text: 52 | the input string 53 | 54 | returns: 55 | the ASCII-only string 56 | """ 57 | text = unicodedata.normalize("NFD", text) 58 | text = text.encode("ascii", "ignore") # type: ignore 59 | text = text.decode("utf-8") # type: ignore 60 | 61 | return str(text) 62 | 63 | 64 | def text_to_id ( 65 | text: str, 66 | ) -> str: 67 | """ 68 | Convert input text to an identifier, suitable for a URI 69 | Lower-cases, strips accents, turns each run of spaces into `_`, then drops anything outside `[0-9a-zA-Z_-]`. 70 | text: 71 | raw text for the label of a node in the graph 72 | 73 | returns: 74 | a string usable as a symbol in RDF (the mapping is lossy: distinct inputs can yield the same id) 75 | """ 76 | text = strip_accents(text.lower()) 77 | text = re.sub("[ ]+", "_", text) 78 | text = re.sub("[^0-9a-zA-Z_-]", "", text) 79 | 80 | return text 81 | 82 | 83 | ###################################################################### 84 | ## class definitions 85 | 86 | class Trends: 87 | """ 88 | Analyze trends among papers published on arXiv. 
89 | """ 90 | NS = { 91 | "atom": "http://www.w3.org/2005/Atom", 92 | "bibo": "http://purl.org/ontology/bibo/", 93 | "cito": "http://purl.org/spar/cito/", 94 | "dct": "http://purl.org/dc/terms/", 95 | "derw": "https://derwen.ai/ns/v1#", 96 | "foaf": "http://xmlns.com/foaf/0.1/", 97 | "lcsh": "http://id.loc.gov/authorities/subjects/", 98 | "madsrdf": "http://www.loc.gov/mads/rdf/v1#", 99 | "opensearch": "http://a9.com/-/spec/opensearch/1.1/", 100 | "owl": "http://www.w3.org/2002/07/owl#", 101 | "prov": "http://www.w3.org/ns/prov#", 102 | "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 103 | "rdfs": "http://www.w3.org/2000/01/rdf-schema#", 104 | "schema": "http://schema.org/", 105 | "sh": "http://www.w3.org/ns/shacl#", 106 | "skos": "http://www.w3.org/2004/02/skos/core#", 107 | "wd": "http://www.wikidata.org/entity/", 108 | "xsd": "http://www.w3.org/2001/XMLSchema#", 109 | } 110 | # endpoint prefix; `format_query()` supplies the URL-encoded parameters appended to it 111 | API_BASE = "http://export.arxiv.org/api/query?" 112 | 113 | 114 | def __init__ ( 115 | self, 116 | *, 117 | kg_path: str = "arxiv.ttl", 118 | ): 119 | """ 120 | Constructor: creates the KG with the `NS` namespaces, then loads `kg_path` via `load_kg()`. 121 | 122 | kg_path: 123 | optional - path to the TTL file for kglab to load/save 124 | """ 125 | self.kg = kglab.KnowledgeGraph(namespaces=self.NS) 126 | self.kg_path = pathlib.Path(kg_path) 127 | self.topics: typing.Dict[str, rdflib.Node] = {} 128 | self.load_kg() 129 | 130 | 131 | def load_kg ( 132 | self, 133 | ) -> None: 134 | """ 135 | Load the previous definitions from a serialized KG and initialize the 136 | topics lookup (the `skos:prefLabel` text of each `derw:Topic` -> its node). 137 | """ 138 | self.topics = {} 139 | self.kg.load_rdf(self.kg_path) 140 | 141 | sparql = """ 142 | SELECT ?entry ?label 143 | WHERE { 144 | ?entry a derw:Topic . 145 | ?entry skos:prefLabel ?label 146 | } 147 | """ 148 | 149 | for node, topic in self.kg.query(sparql): 150 | self.topics[topic.toPython()] = node 151 | 152 | 153 | def save_kg ( 154 | self, 155 | ) -> None: 156 | """ 157 | Serialize the updated KG to a file. 
158 | """ 159 | self.kg.save_rdf(self.kg_path) 160 | 161 | 162 | def lookup_author ( 163 | self, 164 | name: str, 165 | ) -> rdflib.URIRef: 166 | """ 167 | Lookup an author by name, creating a node in the KG if it doesn't 168 | already exist. 169 | The node URI is the `derw` namespace + `author_` + `text_to_id(name)`. 170 | name: 171 | raw text for the name of the author 172 | 173 | returns: 174 | author node 175 | """ 176 | uri = self.kg.get_ns("derw") + "author_" + text_to_id(name) 177 | node = rdflib.URIRef(uri) 178 | p = self.kg.get_ns("rdf").type 179 | o = self.kg.get_ns("derw").Author 180 | 181 | if (node, p, o) not in self.kg.rdf_graph(): 182 | self.kg.add(node, p, o) 183 | self.kg.add(node, self.kg.get_ns("foaf").name, rdflib.Literal(name, lang=self.kg.language)) 184 | 185 | return node 186 | 187 | 188 | def parse_entry ( 189 | self, 190 | entry: et.Element, 191 | ) -> typing.Tuple[rdflib.URIRef, str]: 192 | """ 193 | Parse the XML from one entry in an Atom feed, and add it to the KG. 194 | Assumes the entry has a link titled `pdf` plus `published`, `title`, and `summary` elements. 195 | entry: 196 | XML object for one Atom feed entry 197 | 198 | returns: 199 | the article node (its URI is the PDF link) and the first 10 characters of the `published` text 200 | """ 201 | href = entry.find("atom:link[@title='pdf']", self.NS).attrib["href"] # type: ignore 202 | date = entry.find("atom:published", self.NS).text[:10] # type: ignore 203 | title = entry.find("atom:title", self.NS).text # type: ignore 204 | abstract = entry.find("atom:summary", self.NS).text.replace("\n", " ").strip() # type: ignore 205 | 206 | # lookup the specified article in the KG, and create a node if 207 | # it doesn't already exist 208 | node = rdflib.URIRef(href) 209 | p = self.kg.get_ns("rdf").type 210 | o = self.kg.get_ns("bibo").Article 211 | 212 | if (node, p, o) not in self.kg.rdf_graph(): 213 | self.kg.add(node, p, o) 214 | self.kg.add(node, self.kg.get_ns("dct").Date, self.kg.encode_date(date, [dateutil.tz.gettz("UTC")])) 215 | self.kg.add(node, self.kg.get_ns("dct").title, rdflib.Literal(title, lang=self.kg.language, normalize=False)) 216 | self.kg.add(node, self.kg.get_ns("dct").abstract, 
rdflib.Literal(abstract, lang=self.kg.language)) 217 | 218 | # add author list 219 | for author in entry.findall("atom:author/atom:name", self.NS): 220 | self.kg.add(node, self.kg.get_ns("bibo").authorList, self.lookup_author(author.text)) # type: ignore 221 | 222 | return node, date 223 | 224 | 225 | @classmethod 226 | def format_query ( 227 | cls, 228 | query: str, 229 | start: int, 230 | max_results: int, 231 | ) -> str: 232 | """ 233 | Format the URL-encoded parameters for an arXiv API search, sorted by 234 | submission date, newest first. `arxiv_api()` appends the result to `API_BASE`. 235 | 236 | query: 237 | query string; sent with the `all:` field prefix 238 | 239 | start: 240 | start index within the results 241 | 242 | max_results: 243 | maximum results to return per API call 244 | 245 | returns: 246 | URL-encoded query parameters (without the base URL) 247 | """ 248 | params: dict = { 249 | "search_query": "all:" + query, 250 | "start": start, 251 | "max_results": max_results, 252 | "sortBy": "submittedDate", 253 | "sortOrder": "descending", 254 | } 255 | 256 | return urllib.parse.urlencode(params, safe=":") 257 | 258 | 259 | def arxiv_api ( 260 | self, 261 | query: str, 262 | min_date: str, 263 | *, 264 | max_items: int = 1, 265 | page_items: int = 1, 266 | ) -> typing.Iterable: 267 | """ 268 | Access the arXiv API based on the given search criteria, parse the XML 269 | results (Atom feed), then update the KG to represent any new entries. 
270 | 271 | query: 272 | query string 273 | 274 | min_date: 275 | minimum date to include in the results 276 | 277 | max_items: 278 | optional - maximum items requested per API call 279 | 280 | page_items: 281 | optional - maximum items requested per page 282 | 283 | yields: 284 | `(date, href)` tuple for each search hit within the criteria 285 | """ 286 | start_index = 0 287 | max_index = max_items 288 | 289 | while (start_index < max_index): 290 | # prepare the API query 291 | url = self.API_BASE + self.format_query(query, start_index, page_items) 292 | handle = urllib.request.urlopen(url) 293 | 294 | xml = handle.read().decode("utf-8") 295 | #print(xml) 296 | 297 | # track the API results paging 298 | root = et.fromstring(xml) 299 | total_results = int(root.findall("opensearch:totalResults", self.NS)[0].text) # type: ignore 300 | start_index = int(root.findall("opensearch:startIndex", self.NS)[0].text) # type: ignore 301 | page_items = int(root.findall("opensearch:itemsPerPage", self.NS)[0].text) # type: ignore 302 | 303 | print("---") 304 | ic(total_results) 305 | ic(start_index) 306 | ic(page_items) 307 | 308 | # parse each entry 309 | for entry in tqdm(root.findall("atom:entry", self.NS), desc="Atom entry"): 310 | node, date = self.parse_entry(entry) 311 | yield date, node 312 | 313 | if date < min_date: 314 | return 315 | 316 | # iterate to the next page of results 317 | max_index = min(max_items, total_results) 318 | start_index += page_items 319 | 320 | return 321 | 322 | 323 | ###################################################################### 324 | ## commands 325 | 326 | @APP.command() 327 | def cmd_query ( 328 | query: str, 329 | *, 330 | kg_path: str = "arxiv.ttl", 331 | min_date: str = "2021-06-15", 332 | max_items: int = 5000, 333 | ) -> None: 334 | """ 335 | Query the arXiv API for the given search, then update the KG. 
336 | 337 | kg_path: 338 | optional - path to the TTL file for kglab to load/save 339 | 340 | min_date: 341 | optional - minimum date to include in the results 342 | 343 | max_items: 344 | optional - maximum items requested per API call 345 | """ 346 | trends = Trends(kg_path=kg_path) 347 | 348 | # search parameters 349 | page_items = 100 350 | 351 | # get metadata for the matching articles 352 | hit_iter = trends.arxiv_api( 353 | " AND ".join(query.split(" ")), 354 | min_date, 355 | max_items=max_items, 356 | page_items=page_items, 357 | ) 358 | 359 | for date, node in tqdm(hit_iter, desc="Hits"): 360 | trends.kg.add(node, trends.kg.get_ns("derw").fromQuery, trends.topics[query]) 361 | # TODO: what if query new? 362 | #print(query, date, node) 363 | 364 | # persist the metadata 365 | trends.save_kg() 366 | 367 | 368 | @APP.command() 369 | def cmd_extract ( 370 | kg_path: str = "arxiv.ttl", 371 | min_date: str = "2021-06-15", 372 | kpa_file: str = "phrases.csv", 373 | max_phrase: int = 10, 374 | ) -> None: 375 | """ 376 | Extract the entities from each article. 377 | 378 | kg_path: 379 | optional - path to the TTL file for kglab to load/save 380 | 381 | min_date: 382 | optional - minimum date to include in the results 383 | 384 | kpa_file: 385 | optional - path to the CSV file for extracted phrases 386 | 387 | max_phrase: 388 | optional - maximum number of extracted phrases to represent 389 | """ 390 | trends = Trends(kg_path=kg_path) 391 | 392 | # prepare the NLP pipeline 393 | nlp = spacy.load("en_core_web_sm") 394 | nlp.add_pipe("textrank") 395 | 396 | sparql = f""" 397 | SELECT ?article ?title ?abstract 398 | WHERE {{ 399 | ?article a bibo:Article . 400 | ?article dct:title ?title . 401 | ?article dct:abstract ?abstract . 402 | ?article dct:Date ?date . 
403 | FILTER (?date > "{min_date}T00:00:00"^^xsd:dateTime) 404 | }} 405 | """ 406 | 407 | # run the pipeline for each article 408 | for node, title, abstract in tqdm(trends.kg.query(sparql), desc="Abstract query"): 409 | text = title.toPython() + ". " + abstract.toPython() 410 | doc = nlp(text) 411 | df_list = [] 412 | 413 | for phrase in itertools.islice(doc._.phrases, max_phrase): 414 | entity_label = " ".join(phrase.text.replace("\n", " ").strip().split()).lower() 415 | entity_id = text_to_id(entity_label) 416 | 417 | df_list.append({ 418 | "url": str(node), 419 | "rank": round(phrase.rank, 3), 420 | "count": phrase.count, 421 | "id": entity_id, 422 | "label": entity_label, 423 | }) 424 | 425 | # serialize extracted phrases to a CSV file 426 | path = pathlib.Path(kpa_file) 427 | df = pd.DataFrame(df_list) 428 | df.to_csv(path, index=False) 429 | 430 | 431 | @APP.command() 432 | def cmd_analyze ( 433 | kg_path: str = "arxiv.ttl", 434 | csv_file: str = "arxiv.csv", 435 | ) -> None: 436 | """ 437 | Analyze the article trends. 438 | 439 | kg_path: 440 | optional - path to the TTL file for kglab to load/save 441 | 442 | csv_file: 443 | optional - path to the CSV file for trend data 444 | """ 445 | trends = Trends(kg_path=kg_path) 446 | 447 | sparql = """ 448 | SELECT ?article ?date ?label 449 | WHERE { 450 | ?article a bibo:Article . 451 | ?article dct:Date ?date . 452 | ?article derw:fromQuery ?topic . 
453 | ?topic skos:prefLabel ?label 454 | } 455 | """ 456 | # build one row per (article, topic) hit, then count the rows per (topic, date) pair 457 | df = pd.DataFrame([ 458 | { 459 | "topic": topic.toPython(), 460 | "date": date.toPython(), 461 | "counts": 0, 462 | } 463 | for article, date, topic in tqdm(trends.kg.query(sparql), desc="Article query") 464 | ]).groupby(["topic", "date"]).count() 465 | 466 | # serialize trend data to a CSV file 467 | path = pathlib.Path(csv_file) 468 | df.to_csv(path) 469 | 470 | 471 | @APP.command() 472 | def cmd_visualize ( 473 | csv_file: str = "arxiv.csv", 474 | png_file: str = "arxiv.png", 475 | start_date: str = "2020-01-01", 476 | ) -> None: 477 | """ 478 | Visualize the article trends. 479 | csv_file: 480 | optional - path to the CSV file for trend data 481 | png_file: 482 | optional - path to the PNG file for the rendered diagram 483 | start_date: 484 | optional - rows whose date sorts before this string are dropped from the plot 485 | """ 486 | df = pd.read_csv(csv_file, parse_dates=True, index_col="date") 487 | df_list = [] 488 | 489 | for query in tqdm(sorted(set(df["topic"])), desc="Topic"): 490 | df_sub = df[df["topic"] == query] 491 | df_samp = df_sub.resample("M").sum() # NOTE(review): presumably relies on `.sum()` discarding the text `topic` column - verify on pandas >= 2.0 492 | df_list.append(df_samp.rename(columns={ "counts": query })) 493 | 494 | df_full = pd.concat(df_list, axis=1, join="inner").reindex(df_samp.index).fillna(0) 495 | 496 | # drop the earliest row as an outlier 497 | df_full = df_full.iloc[1: , :] 498 | 499 | # drop rows before the start date, if any 500 | nix = set([ 501 | i 502 | for i, row in enumerate(df_full.index) 503 | if str(row) < start_date 504 | ]) 505 | 506 | if len(nix) > 0: 507 | cutoff = max(nix) + 1 508 | df_full = df_full.iloc[cutoff: , :] 509 | 510 | # drop the last row – to let arXiv settle 511 | df_full.drop(df_full.tail(1).index, inplace=True) 512 | 513 | # set up the plot 514 | plot = df_full.plot( 515 | subplots=True, 516 | legend=False, 517 | figsize=(11, 7), 518 | xlabel="date submitted" 519 | ) 520 | 521 | plot[0].set(ylabel="monthly counts") 522 | 523 | summary = 
list(df.groupby("topic").sum().to_dict()["counts"].items()) 524 | y_max = round(max(df_full.max(axis=1)) + 10.0) 525 | # every subplot gets the same y-axis range and a title carrying the topic's total count 526 | for index, ax in enumerate(plot): 527 | query, count = summary[index] 528 | ax.set(ylim=(0, y_max), title=f"{query}, total = {count}") 529 | 530 | fig = plot[0].get_figure() 531 | fig.tight_layout() 532 | fig.savefig(png_file) 533 | 534 | 535 | if __name__ == "__main__": 536 | APP() 537 | 538 | # reminders only: nothing in this file reads `query_list` 539 | query_list = [ 540 | "knowledge graph", 541 | "graph database", 542 | "graph algorithm", 543 | "graph neural networks", 544 | "graph embedding", 545 | ] 546 | -------------------------------------------------------------------------------- /arxiv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b31597b4-5dd0-4b69-b692-cee73fdf843e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "csv_file = \"arxiv.csv\"\n", 14 | "png_file = \"arxiv.png\"\n", 15 | "start_date = \"2020-06-01\"\n", 16 | "\n", 17 | "df = pd.read_csv(csv_file, parse_dates=True, index_col=\"date\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "38dc6d59-7500-4155-96c4-4a5941472a0a", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "df_list = []\n", 28 | "\n", 29 | "for query in sorted(set(df[\"topic\"])):\n", 30 | " df_sub = df[df[\"topic\"] == query]\n", 31 | " df_samp = df_sub.resample(\"M\").sum()\n", 32 | " df_list.append(df_samp.rename(columns={ \"counts\": query }))" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "id": "fe3edc54-b8bf-4ce3-b844-4fe3ae778bfb", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
graph algorithmsgraph neural networksknowledge graph
date
2020-01-311597159
2020-02-2923011271
2020-03-3120212077
2020-04-302349086
2020-05-3121510376
2020-06-30264179108
2020-07-3127814187
2020-08-3122610560
2020-09-30254144108
2020-10-31254181132
2020-11-3027313688
2020-12-31219150107
2021-01-3118910376
2021-02-2825513792
2021-03-3125116493
2021-04-30212164117
2021-05-31264166119
2021-06-30262230137
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " graph algorithms graph neural networks knowledge graph\n", 188 | "date \n", 189 | "2020-01-31 159 71 59\n", 190 | "2020-02-29 230 112 71\n", 191 | "2020-03-31 202 120 77\n", 192 | "2020-04-30 234 90 86\n", 193 | "2020-05-31 215 103 76\n", 194 | "2020-06-30 264 179 108\n", 195 | "2020-07-31 278 141 87\n", 196 | "2020-08-31 226 105 60\n", 197 | "2020-09-30 254 144 108\n", 198 | "2020-10-31 254 181 132\n", 199 | "2020-11-30 273 136 88\n", 200 | "2020-12-31 219 150 107\n", 201 | "2021-01-31 189 103 76\n", 202 | "2021-02-28 255 137 92\n", 203 | "2021-03-31 251 164 93\n", 204 | "2021-04-30 212 164 117\n", 205 | "2021-05-31 264 166 119\n", 206 | "2021-06-30 262 230 137" 207 | ] 208 | }, 209 | "execution_count": 3, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "df_full = pd.concat(df_list, axis=1, join=\"inner\").reindex(df_samp.index).fillna(0)\n", 216 | "\n", 217 | "# delete min value as an outlier\n", 218 | "df_full = df_full.iloc[1: , :]\n", 219 | "\n", 220 | "# drop last row – to let arXiv settle\n", 221 | "df_full.drop(df_full.tail(1).index, inplace=True)\n", 222 | "\n", 223 | "df_full" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 4, 229 | "id": "93df1c10-2765-447e-83d3-00e3c876f88d", 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/html": [ 235 | "
\n", 236 | "\n", 249 | "\n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
graph algorithmsgraph neural networksknowledge graph
date
2020-06-30264179108
2020-07-3127814187
2020-08-3122610560
2020-09-30254144108
2020-10-31254181132
2020-11-3027313688
2020-12-31219150107
2021-01-3118910376
2021-02-2825513792
2021-03-3125116493
2021-04-30212164117
2021-05-31264166119
2021-06-30262230137
\n", 345 | "
" 346 | ], 347 | "text/plain": [ 348 | " graph algorithms graph neural networks knowledge graph\n", 349 | "date \n", 350 | "2020-06-30 264 179 108\n", 351 | "2020-07-31 278 141 87\n", 352 | "2020-08-31 226 105 60\n", 353 | "2020-09-30 254 144 108\n", 354 | "2020-10-31 254 181 132\n", 355 | "2020-11-30 273 136 88\n", 356 | "2020-12-31 219 150 107\n", 357 | "2021-01-31 189 103 76\n", 358 | "2021-02-28 255 137 92\n", 359 | "2021-03-31 251 164 93\n", 360 | "2021-04-30 212 164 117\n", 361 | "2021-05-31 264 166 119\n", 362 | "2021-06-30 262 230 137" 363 | ] 364 | }, 365 | "execution_count": 4, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "# remove rows before the start date\n", 372 | "nix = set([\n", 373 | " i\n", 374 | " for i, row in enumerate(df_full.index)\n", 375 | " if str(row) < start_date\n", 376 | "])\n", 377 | "\n", 378 | "cutoff = max(nix) + 1\n", 379 | "df_full = df_full.iloc[cutoff: , :]\n", 380 | "\n", 381 | "df_full" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 5, 387 | "id": "8b19b90a-0c00-4fa4-b117-5362c89f3c71", 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "image/png": "\n", 393 | "text/plain": [ 394 | "
" 395 | ] 396 | }, 397 | "metadata": { 398 | "needs_background": "light" 399 | }, 400 | "output_type": "display_data" 401 | } 402 | ], 403 | "source": [ 404 | "plot = df_full.plot(\n", 405 | " subplots=True,\n", 406 | " legend=False,\n", 407 | " figsize=(11, 7),\n", 408 | " xlabel=\"date submitted\"\n", 409 | ")\n", 410 | "\n", 411 | "plot[0].set(ylabel=\"monthly counts\")\n", 412 | "\n", 413 | "summary = list(df.groupby(\"topic\").sum().to_dict()[\"counts\"].items())\n", 414 | "y_max = round(max(df_full.max(axis=1)) + 10.0)\n", 415 | "\n", 416 | "for index, ax in enumerate(plot):\n", 417 | " query, count = summary[index]\n", 418 | " ax.set(ylim=(0, y_max), title=f\"{query}, total = {count}\")\n", 419 | "\n", 420 | "fig = plot[0].get_figure()\n", 421 | "fig.tight_layout()\n", 422 | "fig.savefig(png_file)" 423 | ] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3 (ipykernel)", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.8.10" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 5 447 | } 448 | --------------------------------------------------------------------------------