├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── datasets └── mw_dump_stub.xml ├── demonstrate_doi_extractor_performance.py ├── mwcites ├── __init__.py ├── extractors │ ├── __init__.py │ ├── arxiv.py │ ├── doi.py │ ├── isbn.py │ ├── issn.py │ ├── pubmed.py │ └── tests │ │ ├── __init__.py │ │ ├── test_arxiv.py │ │ ├── test_doi.py │ │ ├── test_isbn.py │ │ ├── test_issn.py │ │ └── test_pubmed.py ├── identifier.py ├── mwcites.py └── utilities │ ├── __init__.py │ ├── extract.py │ └── tests │ ├── __init__.py │ └── test_extract.py ├── requirements.txt ├── setup.py ├── sql ├── cites_enwiki_20150602.create.sql └── month_type_citations.sql └── utility /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *~ 5 | 6 | # Datasets 7 | *.tsv 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Datasets 13 | *.bz2 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | bin/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | 47 | # Mr Developer 48 | .mr.developer.cfg 49 | .project 50 | .pydevproject 51 | 52 | # Rope 53 | .ropeproject 54 | 55 | # Django stuff: 56 | *.log 57 | *.pot 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # Pycharm directories 63 | .idea 64 | venv/ 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Aaron Halfaker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE VERSION README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Extract academic citations from Wikipedia 2 | ========================================= 3 | This project contains a utility for extracting academic citation identifiers. 4 | 5 | **NOTE:** Because one of its dependencies (`Mediawiki-Utilities`_) requires 6 | Python 3, so does mwcites. 7 | 8 | ``pip install mwcites`` 9 | 10 | Usage 11 | ----- 12 | There's really only one utility in this package, called ``mwcitations``. 13 | 14 | :: 15 | 16 | $ mwcitations extract enwiki-20150112-pages-meta-history*.xml*.bz2 > citations.tsv 17 | 18 | 19 | Documentation 20 | ------------- 21 | Documentation is provided by ``$ mwcitations extract -h``. 22 | 23 | :: 24 | 25 | Extracts academic citations from the history of Wikipedia 26 | articles by processing a pages-meta-history XML dump and matching regular 27 | expressions to revision content. 28 | 29 | Currently supported identifiers include: 30 | 31 | * PubMed 32 | * DOI 33 | * ISBN 34 | * arXiv 35 | * ISSN 36 | 37 | Outputs a TSV file with the following fields: 38 | 39 | * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 40 | * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell 41 | * rev_id: The Wikipedia revision where the citation was first added (int), 42 | e.g. 282470030 43 | * timestamp: The timestamp of the revision where the citation was first 44 | added. (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z 45 | * type: The type of identifier, e.g. pmid, pmc, doi, isbn, arxiv or issn 46 | * id: The id of the cited scholarly article (utf-8), 47 | e.g. 10.1183/09031936.00213411 48 | 49 | Usage: 50 | mwcites extract -h | --help 51 | mwcites extract <dump_file>... 52 | 53 | Options: 54 | -h --help Shows this documentation 55 | -------------------------------------------------------------------------------- /datasets/mw_dump_stub.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Wikipedia 4 | enwiki 5 | http://en.wikipedia.org/wiki/Main_Page 6 | MediaWiki 1.25wmf6 7 | first-letter 8 | 9 | Media 10 | Special 11 | 12 | Talk 13 | User 14 | User talk 15 | Wikipedia 16 | Wikipedia talk 17 | File 18 | File talk 19 | MediaWiki 20 | MediaWiki talk 21 | Template 22 | Template talk 23 | Help 24 | Help talk 25 | Category 26 | Category talk 27 | Portal 28 | Portal talk 29 | Book 30 | Book talk 31 | Draft 32 | Draft talk 33 | Education Program 34 | Education Program talk 35 | TimedText 36 | TimedText talk 37 | Module 38 | Module talk 39 | Topic 40 | 41 | 42 | 43 | Nagamaki naomi 44 | 0 45 | 1325004 46 | 47 | 48 | 8801038 49 | 2004-12-25T05:34:36Z 50 | 51 | 4.46.105.106 52 | 53 | This is a doi randomly placed in the text 10.0000/m1 54 | Here's a typo that might be construed as a doi 10.60 people were there. 55 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 56 | The rise and decline of an open collaboration system: How Wikipedia's 57 | reaction to popularity is causing its decline. 58 | American Behavioral Scientist, 59 | 0002764212469365 doi: 10.1177/0002764212469365</ref>.
Hats pants and banana 60 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 61 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 62 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 63 | 10.2387/234310.2347/39423 64 | cqt09qrejym5mdf3h0irtelqd8f46h3 65 | wikitext 66 | text/x-wiki 67 | 68 | 69 | 10730831 70 | 8801038 71 | 2004-12-25T05:38:45Z 72 | 73 | Gadfium 74 | 54381 75 | 76 | 77 | Wikify 78 | This is a doi randomly placed in the text 10.0000/m1 79 | Here's a typo that might be construed as a doi 10.60 people were there. 80 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 81 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 82 | The rise and decline of an open collaboration system: How Wikipedia's 83 | reaction to popularity is causing its decline. 84 | American Behavioral Scientist, 85 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana 86 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 87 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 88 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 89 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 90 | 10.2387/234310.2347/39423 91 | pfjkfb1u54tnkl4exkxge4f5v1mn7cl 92 | wikitext 93 | text/x-wiki 94 | 95 | 96 | 10730832 97 | 10730831 98 | 2004-12-25T05:38:46Z 99 | 100 | Gadfium 101 | 54381 102 | 103 | 104 | Wikify 105 | This is a doi randomly placed in the text 10.0000/m1 106 | Here's a typo that might be construed as a doi 10.60 people were there. 107 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 108 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 109 | The rise and decline of an open collaboration system: How Wikipedia's 110 | reaction to popularity is causing its decline. 111 | American Behavioral Scientist, 112 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana 113 | {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}} 114 | [http://arxiv.org/abs/0706.0004v1] 115 | [https://arxiv.org/abs/0706.0005v1] 116 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 117 | 10.2387/234310.2347/39423 118 | pfjkfb1u54sksl4exkxge4f5v1mn7cl 119 | wikitext 120 | text/x-wiki 121 | 122 | 123 | 10730833 124 | 10730832 125 | 2004-12-25T05:38:47Z 126 | 127 | Gadfium 128 | 54381 129 | 130 | Wikify 131 | This is a doi randomly placed in the text 10.0000/m1 132 | Here's a typo that might be construed as a doi 10.60 people were there. 133 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 134 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 135 | The rise and decline of an open collaboration system: How Wikipedia's 136 | reaction to popularity is causing its decline. 137 | American Behavioral Scientist, 138 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. 
Hats pants and banana 139 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 140 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 141 | {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}} 142 | [https://arxiv.org/abs/0706.0005v1] 143 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 144 | 10.2387/234310.2347/39423 145 | pfjkfb1u54tnksksxkxgehhgv1mn7cl 146 | wikitext 147 | text/x-wiki 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /demonstrate_doi_extractor_performance.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from mw import api  # requires the mediawiki-utilities package 4 | 5 | from mwcites.extractors import doi 6 | 7 | session = api.Session("https://en.wikipedia.org/w/api.php", 8 | user_agent="Demo doi extractor") 9 | 10 | revisions = session.revisions.query(titles={"Psychotherapy"}, 11 | properties={'content'}) 12 | lots = next(revisions)['*'] 13 | print("Text with lots of DOIs has {0} characters".format(len(lots))) 14 | 15 | revisions = session.revisions.query(titles={"Waffle"}, 16 | properties={'content'}) 17 | few = next(revisions)['*'] 18 | print("Text with few DOIs has {0} characters".format(len(few))) 19 | 20 | 21 | start = time.time() 22 | for i in range(50): 23 | ids = set(doi.extract(lots))  # doi.extract is an alias for doi.extract_search 24 | ids = set(doi.extract(few)) 25 | print("Default (search) strategy: {0}".format(time.time() - start)) 26 | 27 | start = time.time() 28 | for i in range(50): 29 | ids = set(doi.extract_mwp(lots))  # requires un-commenting extract_mwp in doi.py 30 | ids = set(doi.extract_mwp(few)) 31 | print("MWP strategy: {0}".format(time.time() - start)) 32 | 33 | 34 | start = time.time() 35 | for i in range(50): 36 | ids = set(doi.extract_island(lots)) 37 | ids = set(doi.extract_island(few)) 38 | print("Island parser strategy: {0}".format(time.time() - start)) 39 | 40 | start = time.time() 41 | for i in range(50): 42 | ids = set(doi.extract_search(lots)) 43 | ids = set(doi.extract_search(few)) 44 | print("Search parser strategy: {0}".format(time.time() - start)) 45 | -------------------------------------------------------------------------------- /mwcites/__init__.py: -------------------------------------------------------------------------------- 1 | from .identifier import Identifier 2 | 3 | __version__ = "0.2.0" 4 | -------------------------------------------------------------------------------- /mwcites/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/extractors/__init__.py -------------------------------------------------------------------------------- /mwcites/extractors/arxiv.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..identifier import Identifier 4 | 5 | # From http://arxiv.org/help/arxiv_identifier 6 | old_id = r"-?(?P<old_id>([a-z]+(.[a-z]+)/)?[0-9]{4}[0-9]+)" 7 | new_id = r"(?P<new_id>[0-9]{4}.[0-9]+)(v[0-9]+)?"
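# Editor's sketch (hedged) of what the two patterns above are meant to match,
# based on the arXiv identifier help page cited above and this module's tests:
#   old scheme: "math.GT/0309001" or "hep-th/9901001" -- note the unescaped
#   dot in (.[a-z]+) also matches the "-" separator in archives like "hep-th".
#   new scheme: "0706.0001", optionally followed by a version suffix like "v1".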
8 | 9 | prefixes = [r"arxiv\s*=\s*", r"//arxiv\.org/(abs/)?", r"arxiv:\s?"] 10 | 11 | ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) + 12 | r"({0}|{1})".format(old_id, new_id), re.I|re.U) 13 | 14 | def extract(text): 15 | for match in ARXIV_RE.finditer(text): 16 | id = match.group('new_id') or match.group("old_id") 17 | yield Identifier("arxiv", id.lower()) 18 | -------------------------------------------------------------------------------- /mwcites/extractors/doi.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | 4 | from more_itertools import peekable 5 | 6 | from ..identifier import Identifier 7 | 8 | DOI_START_RE = re.compile(r'10\.[0-9]{4,}/') 9 | 10 | HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 11 | 'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote', 12 | 'pre'] 13 | 14 | TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + r')(\s[^>\n\r]+)?>', re.I) 15 | 16 | ''' 17 | DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)') 18 | 19 | def extract_regex(text): 20 | for match in DOI_RE.finditer(text): 21 | id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") 22 | yield Identifier("doi", id) 23 | 24 | import mwparserfromhell as mwp 25 | def extract_mwp(text): 26 | no_tags = mwp.parse(text).strip_code() 27 | for match in DOI_RE.finditer(no_tags): 28 | id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") 29 | yield Identifier("doi", id) 30 | ''' 31 | 32 | LEXICON = [ 33 | (DOI_START_RE.pattern, 'doi_start'), 34 | (r'\(', 'open_paren'), 35 | (r'\)', 'close_paren'), 36 | (r'\[', 'open_bracket'), 37 | (r'\]', 'close_bracket'), 38 | (r'<!--', 'comment_start'), 39 | (r'-->', 'comment_end'), 40 | (TAGS_RE.pattern, 'tag'), 41 | (r'<', 'open_angle'), 42 | (r'>', 'close_angle'), 43 | (r'\{', 'open_curly'), 44 | (r'\}', 'close_curly'), 45 | (r'\|', 'pipe'), 46 | (r'[,\.;!]', 'punct'), 47 | (r'[\?#]', 'url_end'), 48 | (r'[\n\r]+', 'break'), 49 | (r'\s+', 'whitespace'), 50 | (r'\w+', 'word'), 51 | (r'.', 'etc') 52 | ] 53 | 54 | def extract_island(text): 55 | tokens = tokenize_finditer(text, LEXICON) 56 | tokens = peekable(tokens) 57 | 58 | while tokens.peek(None) is not None: 59 | 60 | if tokens.peek()[0] == 'doi_start': 61 | yield Identifier('doi', read_doi(tokens)) 62 | 63 | next(tokens) 64 | 65 | 66 | def tokenize_finditer(text, lexicon=LEXICON): 67 | pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) 68 | for pattern, name in lexicon) 69 | 70 | group_regex = re.compile(pattern, re.I|re.U|re.M) 71 | 72 | for match in group_regex.finditer(text): 73 | yield match.lastgroup, match.group(0) 74 | 75 | 76 | """ 77 | def tokenize_scanner(text, lexicon=LEXICON): 78 | scanner = re.Scanner(lexicon) 79 | tokens, remainder = scanner.scan(text) 80 | return tokens 81 | """ 82 | 83 | #from mwcites.extractors.doi import tokenize_scan 84 | #list(tokenize_scan("foo bar baz.{}")) 85 | 86 | def read_doi(tokens): 87 | assert tokens.peek()[0] == 'doi_start' 88 | 89 | depth = defaultdict(lambda: 0) 90 | 91 | doi_buffer = [next(tokens)[1]] 92 | 93 | while tokens.peek(None) is not None: 94 | name, match = tokens.peek() 95 | 96 | if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe', 97 | 'comment_start', 'comment_end'): 98 | break 99 | elif name == 'open_bracket': 100 | depth['bracket'] += 1 101 | doi_buffer.append(next(tokens)[1]) 102 | elif name == 'open_curly': 103 | depth['curly'] += 1 104 | doi_buffer.append(next(tokens)[1]) 105 | elif name == 'close_bracket': 106 | if depth['bracket'] > 0: 107 |
depth['bracket'] -= 1 108 | doi_buffer.append(next(tokens)[1]) 109 | else: 110 | break 111 | elif name == 'close_curly': 112 | if depth['curly'] > 0: 113 | depth['curly'] -= 1 114 | doi_buffer.append(next(tokens)[1]) 115 | else: 116 | break 117 | else: 118 | doi_buffer.append(next(tokens)[1]) 119 | 120 | 121 | # Do not return a doi with punctuation at the end 122 | return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer)) 123 | 124 | 125 | 126 | def tokenize_search(text, start, lexicon=LEXICON): 127 | pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) 128 | for pattern, name in lexicon) 129 | 130 | group_regex = re.compile(pattern, re.I|re.U) 131 | 132 | match = group_regex.search(text, start) 133 | while match is not None: 134 | yield match.lastgroup, match.group(0) 135 | match = group_regex.search(text, match.span()[1]) 136 | 137 | def extract_search(text, lexicon=LEXICON): 138 | 139 | last_end = 0 140 | for match in DOI_START_RE.finditer(text): 141 | if match.span()[0] > last_end: 142 | tokens = tokenize_search(text, match.span()[0], lexicon=lexicon) 143 | tokens = peekable(tokens) 144 | doi = read_doi(tokens) 145 | last_end = match.span()[0] + len(doi) 146 | yield Identifier('doi', doi) 147 | else: 148 | last_end = max(match.span()[1], last_end) 149 | 150 | extract = extract_search # Setting the default to the best method 151 | -------------------------------------------------------------------------------- /mwcites/extractors/isbn.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..identifier import Identifier 3 | 4 | # Also correctly parses malformed inputs such as below: 5 | # isbn=2 906700-09-6 (notice the space instead of a hyphen) or 6 | # isbn=2 10 004179 7 (notice spaces instead of hyphens) 7 | # {{ISBN|978-83-7435-239-0​}} (notice pipe instead of equals) 8 | ISBN_RE = re.compile(r'isbn\s?[=|]?\s?([\d]+([\d\s\-]+)[\dXx])', re.I) 9 | 10 | 11 | def extract(text): 12 | for match in ISBN_RE.finditer(text): 13 | yield Identifier( 14 | 'isbn', 15 | match.group(1).replace('-', '').replace(' ', '').strip() 16 | ) 17 | -------------------------------------------------------------------------------- /mwcites/extractors/issn.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..identifier import Identifier 3 | 4 | ISSN_RE = re.compile(r'issn\s?=?\s?([0-9]{4}\-[0-9]{3}([0-9]|X))', re.I) 5 | 6 | def extract(text): 7 | for match in ISSN_RE.finditer(text): 8 | yield Identifier( 9 | 'issn', 10 | match.group(1).replace('-', '').replace(' ', '').strip() 11 | ) 12 | -------------------------------------------------------------------------------- /mwcites/extractors/pubmed.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..identifier import Identifier 4 | 5 | TEMPLATE_RE = re.compile(r"\b(pmid|pmc)\s*=\s*(pmc)?([0-9]+)\b", re.I) 6 | 7 | PMURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + 8 | r"/pubmed/([0-9]+)\b", re.I) 9 | PMCURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + 10 | r"/pmc/articles/PMC([0-9]+)\b", re.I) 11 | 12 | def extract(text): 13 | text = str(text or "") 14 | 15 | for match in TEMPLATE_RE.finditer(text): 16 | yield Identifier(match.group(1).lower(), match.group(3)) 17 | 18 | for match in PMURL_RE.finditer(text): 19 | yield Identifier("pmid", match.group(1)) 20 | 21 | for match in PMCURL_RE.finditer(text): 22 | yield Identifier("pmc", match.group(1)) 23 |
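All of the extractor modules above share one minimal interface: an ``extract(text)`` generator that yields ``Identifier(type, id)`` namedtuples. That shared shape is what lets ``mwcites/utilities/extract.py`` apply any subset of them to revision text interchangeably. A hedged sketch of calling them directly, on a wikitext snippet invented purely for illustration::

    from mwcites.extractors import arxiv, doi, isbn, issn, pubmed

    # Invented wikitext carrying one identifier of each style.
    text = """
    {{cite|...|doi=10.1098/rspb.2008.1131|pmid=10559875|isbn=0-13-054091-9}}
    See also arxiv:0706.0002v1 and issn=0006-2510.
    """

    for extractor in (doi, pubmed, isbn, arxiv, issn):
        for identifier in extractor.extract(text):
            # e.g. "doi 10.1098/rspb.2008.1131", "pmid 10559875", ...
            print(identifier.type, identifier.id)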
-------------------------------------------------------------------------------- /mwcites/extractors/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/extractors/tests/__init__.py -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_arxiv.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from nose.tools import eq_ 4 | 5 | from .. import arxiv 6 | from ...identifier import Identifier 7 | 8 | INPUT_TEXT = """ 9 | This is a doi randomly placed in the text 10.0000/m1 10 | Here's a typo that might be construed as a doi 10.60 people were there. 11 | {{cite|...|arxiv=0706.0001v1|pmid=10559875}} 12 | Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 13 | The rise and decline of an open collaboration system: How Wikipedia’s 14 | reaction to popularity is causing its decline. 15 | American Behavioral Scientist, 16 | 0002764212469365 arxiv:0706.0002v1. Hats pants and banana 17 | [http://arxiv.org/0706.0003] 18 | [http://arxiv.org/abs/0706.0004v1] 19 | [https://arxiv.org/abs/0706.0005v1] 20 | [https://arxiv.org/abs/math.GT/0309001] 21 | [https://arxiv.org/abs/-math.gs/0309002] 22 | {{cite|...|arxiv=foobar.hats/0101003|issue=1656}} 23 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 24 | 10.2387/234310.2347/39423 25 | 27 | """ 28 | EXPECTED = [ 29 | Identifier('arxiv', "0706.0001"), 30 | Identifier('arxiv', "0706.0002"), 31 | Identifier('arxiv', "0706.0003"), 32 | Identifier('arxiv', "0706.0004"), 33 | Identifier('arxiv', "0706.0005"), 34 | Identifier('arxiv', "math.gt/0309001"), 35 | Identifier('arxiv', "math.gs/0309002"), 36 | Identifier('arxiv', "foobar.hats/0101003") 37 | ] 38 | 39 | def test_extract(): 40 | ids = list(arxiv.extract(INPUT_TEXT)) 41 | pprint.pprint(ids) 42 | pprint.pprint(EXPECTED) 43 | eq_(ids, EXPECTED) 44 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_doi.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from nose.tools import eq_ 4 | 5 | from .. import doi 6 | from ...identifier import Identifier 7 | 8 | INPUT_TEXT = """ 9 | This is a doi randomly placed in the text 10.0000/m1 10 | Here's a typo that might be construed as a doi 10.60 people were there. 11 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 12 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 13 | The rise and decline of an open collaboration system: How Wikipedia’s 14 | reaction to popularity is causing its decline. 15 | American Behavioral Scientist, 16 | 0002764212469365 doi: 10.1177/0002764212469365</ref>.
Hats pants and banana 17 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 18 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 19 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 20 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 21 | 10.2387/234310.2347/39423 22 | <!-- A doi in an html comment 23 | 10.2387/234310.2347/39423 --> 24 | """ 25 | EXPECTED = [ 26 | Identifier('doi', "10.0000/m1"), 27 | Identifier('doi', "10.0000/m2"), 28 | Identifier('doi', "10.1177/0002764212469365"), 29 | Identifier('doi', "10.1170/foo<bar>(herp)derp"), 30 | Identifier('doi', "10.1170/foo<bar>(herp)derp[waffles]"), 31 | Identifier('doi', "10.1098/rspb.2008.1131"), 32 | Identifier('doi', "10.2387/234310.2347/39423"), 33 | Identifier('doi', "10.2387/234310.2347/39423") 34 | ] 35 | 36 | """ 37 | def test_extract_regex(): 38 | ids = list(doi.extract_regex(INPUT_TEXT)) 39 | pprint.pprint(ids) 40 | pprint.pprint(EXPECTED) 41 | eq_(ids, EXPECTED) 42 | 43 | def test_extract_mwp(): 44 | ids = list(doi.extract_mwp(INPUT_TEXT)) 45 | pprint.pprint(ids) 46 | pprint.pprint(EXPECTED) 47 | eq_(ids, EXPECTED) 48 | """ 49 | 50 | def test_extract(): 51 | ids = list(doi.extract(INPUT_TEXT)) 52 | pprint.pprint(ids) 53 | pprint.pprint(EXPECTED) 54 | eq_(ids, EXPECTED) 55 | 56 | def test_extract_island(): 57 | ids = list(doi.extract_island(INPUT_TEXT)) 58 | pprint.pprint(ids) 59 | pprint.pprint(EXPECTED) 60 | eq_(ids, EXPECTED) 61 | 62 | def test_extract_search(): 63 | ids = list(doi.extract_search(INPUT_TEXT)) 64 | pprint.pprint(ids) 65 | pprint.pprint(EXPECTED) 66 | #pprint.pprint(list(doi.tokenize_finditer(INPUT_TEXT))) 67 | eq_(ids, EXPECTED) 68 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_isbn.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from nose.tools import eq_ 3 | 4 | from .. import isbn 5 | from ...identifier import Identifier 6 | 7 | INPUT_TEXT = """ 8 | | isbn=2 906700-09-6 9 | | isbn=2 10 004179 7 10 | | publisher=Academic Press | isbn=0124366031 11 | | isbn=3540206310 12 | | accessdate=2008-02-05 | isbn=0-618-34342-3 13 | | isbn=978-0-140-27666-4 14 | | isbn = 0-13-054091-9 15 | | isbn=0195305736 }}</ref> schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset 16 | | ISBN=978-3-7046-5112-9 17 | * Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5. 18 | * Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&sid=f55727] Frankf. Rundschau 26. April 2006) 19 | * Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2. 20 | <ref name="flos1">{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8.
| Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}</ref> 21 | Bei einer [[Sprungtemperatur]] von 1,2&nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].<ref>{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}</ref> 22 | * {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}} 23 | * ISBN 0 902 198 84 X 24 | * ISBN 1-57488-530-8 25 | * {{ISBN|978-83-7435-239-0​}} 26 | """ 27 | 28 | 29 | EXPECTED = [ 30 | Identifier('isbn', '2906700096'), 31 | Identifier('isbn', '2100041797'), 32 | Identifier('isbn', '0124366031'), 33 | Identifier('isbn', '3540206310'), 34 | Identifier('isbn', '0618343423'), 35 | Identifier('isbn', '9780140276664'), 36 | Identifier('isbn', '0130540919'), 37 | Identifier('isbn', '0195305736'), 38 | Identifier('isbn', '9783704651129'), 39 | Identifier('isbn', '3886807525'), 40 | Identifier('isbn', '3720527735'), 41 | Identifier('isbn', '9783894726652'), 42 | Identifier('isbn', '3519264242'), 43 | Identifier('isbn', '9783642017346'), 44 | Identifier('isbn', '0130540919'), 45 | Identifier('isbn', '090219884X'), 46 | Identifier('isbn', '1574885308'), 47 | Identifier('isbn', '9788374352390'), 48 | ] 49 | 50 | def test_extract(): 51 | ids = list(isbn.extract(INPUT_TEXT)) 52 | pprint.pprint(ids) 53 | pprint.pprint(EXPECTED) 54 | eq_(ids, EXPECTED) 55 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_issn.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from nose.tools import eq_ 3 | 4 | from .. import issn 5 | from ...identifier import Identifier 6 | 7 | INPUT_TEXT = """ 8 | {{cite book|work=Billboard|title=Sinatra FBI Files Opened|first=Bill|last=Holland|url=https://books.google.com/books?id=KQoEAAAAMBAJ&dq=Bill+Holland+1998+Billboard+page+10&q=walter+winchell#v=snippet&q=walter%20winchell&f=false|date=December 19, 1998|page=10|issn=0006-2510}} 9 | """ 10 | 11 | 12 | EXPECTED = [ 13 | Identifier('issn', '00062510'), 14 | ] 15 | 16 | def test_extract(): 17 | ids = list(issn.extract(INPUT_TEXT)) 18 | pprint.pprint(ids) 19 | pprint.pprint(EXPECTED) 20 | eq_(ids, EXPECTED) 21 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_pubmed.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | 3 | from .. import pubmed 4 | from ...identifier import Identifier 5 | 6 | def test_extract(): 7 | 8 | text = """ 9 | This is some text with a template cite. {{cite|...|...|pmid=1}}. 10 | This is some text with a template cite. {{cite|...|...|pmid = 2|...}}. 11 | This is some text with a template cite. {{cite|...|...|pmc = 3|...}}. 12 | This is some text with a template cite. {{cite|...|...|pmc = pmc4|...}}. 
13 | This is some text with a link [http://www.ncbi.nlm.nih.gov/pubmed/5 ID] 14 | Another link [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6 ID] 15 | """ 16 | ids = list(pubmed.extract(text)) 17 | expected = [ 18 | Identifier('pmid', "1"), 19 | Identifier('pmid', "2"), 20 | Identifier('pmc', "3"), 21 | Identifier('pmc', "4"), 22 | Identifier('pmid', "5"), 23 | Identifier('pmc', "6") 24 | ] 25 | print(ids) 26 | print(expected) 27 | eq_(ids, expected) 28 | -------------------------------------------------------------------------------- /mwcites/identifier.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Identifier = namedtuple("Identifier", ['type', 'id']) 4 | -------------------------------------------------------------------------------- /mwcites/mwcites.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script provides access to a set of utilities for processing academic 3 | citations in Wikipedia. 4 | 5 | Right now, there's only one utility, but there will be more to come. 6 | 7 | * extract -- Extracts citations from an XML database dump 8 | 9 | Usage: 10 | mwcites (-h | --help) 11 | mwcites <utility> [-h | --help] 12 | 13 | Options: 14 | -h | --help Shows this documentation 15 | <utility> The name of the utility to run 16 | """ 17 | import sys 18 | import traceback 19 | from importlib import import_module 20 | 21 | import docopt 22 | 23 | 24 | USAGE = """Usage: 25 | mwcites (-h | --help) 26 | mwcites <utility> [-h | --help]\n""" 27 | 28 | 29 | def main(): 30 | 31 | if len(sys.argv) < 2: 32 | sys.stderr.write(USAGE) 33 | sys.exit(1) 34 | elif sys.argv[1] in ("-h", "--help"): 35 | sys.stderr.write(__doc__ + "\n") 36 | sys.exit(1) 37 | elif sys.argv[1][:1] == "-": 38 | sys.stderr.write(USAGE) 39 | sys.exit(1) 40 | 41 | module_name = sys.argv[1] 42 | try: 43 | module = import_module(".utilities." + module_name, package="mwcites") 44 | except ImportError: 45 | sys.stderr.write(traceback.format_exc()) 46 | sys.stderr.write("Could not find utility {0}.\n".format(module_name)) 47 | sys.exit(1) 48 | 49 | module.main(sys.argv[2:]) 50 | -------------------------------------------------------------------------------- /mwcites/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/utilities/__init__.py -------------------------------------------------------------------------------- /mwcites/utilities/extract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extracts academic citations from the history of Wikipedia articles 3 | by processing a pages-meta-history XML dump and matching regular expressions 4 | to revision content. 5 | 6 | Currently supported identifiers include: 7 | 8 | * PubMed 9 | * DOI 10 | * ISBN 11 | * arXiv 12 | * ISSN 13 | 14 | Outputs a TSV file with the following fields: 15 | 16 | * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 17 | * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell 18 | * rev_id: The Wikipedia revision where the citation was first added (int), 19 | e.g. 282470030 20 | * timestamp: The timestamp of the revision where the citation was first added. 21 | (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z 22 | * type: The type of identifier, e.g.
pmid, pmc, doi, isbn, arxiv or issn 23 | * id: The id of the cited scholarly article (utf-8), 24 | e.g. 10.1183/09031936.00213411 25 | 26 | Usage: 27 | extract -h | --help 28 | extract <dump_file>... [--extractor=<path>...] 29 | 30 | Options: 31 | -h --help Shows this documentation 32 | <dump_file> The path to a set of dump files to process. If no 33 | files are specified, <stdin> will be read. 34 | --extractor=<path> The class path to a set of extractors to apply 35 | [default: <all>] 36 | """ 37 | import sys 38 | from importlib import import_module 39 | 40 | import docopt 41 | import mwxml 42 | 43 | import mysqltsv 44 | 45 | from ..extractors import arxiv, doi, isbn, pubmed, issn 46 | 47 | ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv, issn] 48 | 49 | HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") 50 | 51 | def main(argv=None): 52 | args = docopt.docopt(__doc__, argv=argv) 53 | dump_files = args['<dump_file>'] 54 | 55 | if args['--extractor'] == ['<all>']: 56 | extractors = ALL_EXTRACTORS 57 | else: 58 | extractors = [import_from_path(path.lower()) 59 | for path in args['--extractor']] 60 | 61 | run(dump_files, extractors) 62 | 63 | def run(dump_files, extractors): 64 | writer = mysqltsv.Writer(sys.stdout, headers=HEADERS) 65 | 66 | cites = extract(dump_files, extractors=extractors) 67 | for page_id, title, rev_id, timestamp, type, id in cites: 68 | writer.write([page_id, title, rev_id, timestamp.long_format(), type, id]) 69 | 70 | def extract(dump_files, extractors=ALL_EXTRACTORS): 71 | """ 72 | Extracts cites from a set of `dump_files`. 73 | 74 | :Parameters: 75 | dump_files : str | `file` 76 | A set of MediaWiki XML dump files 77 | (expects: pages-meta-history) 78 | extractors : `list`(`extractor`) 79 | A list of extractors to apply to the text 80 | 81 | :Returns: 82 | `iterable` -- a generator of extracted cites 83 | 84 | """ 85 | # Dump processor function 86 | def process_dump(dump, path): 87 | for page in dump: 88 | if page.namespace != 0: continue 89 | else: 90 | for cite in extract_cite_history(page, extractors): 91 | yield cite 92 | 93 | # Map call 94 | return mwxml.map(process_dump, dump_files) 95 | 96 | def extract_cite_history(page, extractors): 97 | """ 98 | Extracts cites from the history of a `page` (`mwxml.Page`). 99 | 100 | :Parameters: 101 | page : `iterable`(`mwxml.Revision`) 102 | The page to extract cites from 103 | extractors : `list`(`extractor`) 104 | A list of extractors to apply to the text 105 | 106 | :Returns: 107 | `iterable` -- a generator of extracted cites 108 | 109 | """ 110 | appearances = {} # For tracking the first appearance of an ID 111 | ids = set() # For holding onto the ids in the last revision. 112 | for revision in page: 113 | ids = set(extract_ids(revision.text, extractors)) 114 | 115 | # For each ID, check to see if we have seen it before 116 | for id in ids: 117 | if id not in appearances: 118 | appearances[id] = (revision.id, revision.timestamp) 119 | 120 | for id in ids: #For the ids in the last version of the page 121 | rev_id, timestamp = appearances[id] 122 | yield (page.id, page.title, rev_id, timestamp, id.type, id.id) 123 | 124 | def extract_ids(text, extractors): 125 | """ 126 | Uses `extractors` to extract citation identifiers from a text.
127 | 128 | :Parameters: 129 | text : str 130 | The text to process 131 | extractors : `list`(`extractor`) 132 | A list of extractors to apply to the text 133 | 134 | :Returns: 135 | `iterable` -- a generator of extracted identifiers 136 | """ 137 | for extractor in extractors: 138 | for id in extractor.extract(text): 139 | yield id 140 | 141 | def import_from_path(path): 142 | """ 143 | Imports a specific attribute from a module based on a class path. 144 | 145 | :Parameters: 146 | path : str 147 | A dot delimited string representing the import path of the desired 148 | object. 149 | 150 | :Returns: 151 | object -- An imported object 152 | """ 153 | parts = path.split(".") 154 | module_path = ".".join(parts[:-1]) 155 | attribute_name = parts[-1] 156 | 157 | module = import_module(module_path) 158 | 159 | attribute = getattr(module, attribute_name) 160 | 161 | return attribute 162 | -------------------------------------------------------------------------------- /mwcites/utilities/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/utilities/tests/__init__.py -------------------------------------------------------------------------------- /mwcites/utilities/tests/test_extract.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from mwtypes import Timestamp 4 | from nose.tools import eq_ 5 | 6 | from ..extract import extract_cite_history 7 | from ...identifier import Identifier 8 | 9 | 10 | def test_extract_cite_history(): 11 | FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) 12 | 13 | FakeExtractor = namedtuple("Extractor", ['extract']) 14 | 15 | class FakePage: 16 | def __init__(self, id, title): 17 | self.id = id 18 | self.title = title 19 | def __iter__(self): 20 | return iter([ 21 | FakeRevision(1, Timestamp(1), "id1 id2"), 22 | FakeRevision(2, Timestamp(2), "id1 id3"), 23 | FakeRevision(3, Timestamp(3), "id1 id2 id3"), 24 | FakeRevision(4, Timestamp(4), "id1 id2 id4"), 25 | FakeRevision(5, Timestamp(5), "id1 id2 id4"), 26 | ]) 27 | 28 | fake_page = FakePage(1, "Title") 29 | 30 | def extract(text): 31 | return (Identifier('fake', id) for id in text.split(" ")) 32 | extractor = FakeExtractor(extract) 33 | 34 | expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), 35 | (1, "Title", 1, Timestamp(1), "fake", "id2"), 36 | (1, "Title", 4, Timestamp(4), "fake", "id4")] 37 | 38 | citations = list(extract_cite_history(fake_page, [extractor])) 39 | eq_(len(citations), len(expected)) 40 | for cite in extract_cite_history(fake_page, [extractor]): 41 | assert cite in expected 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt 2 | more-itertools 3 | mwparserfromhell 4 | mwxml 5 | mysqltsv 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | import mwcites 6 | 7 | 8 | def read(fname): 9 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 10 | 11 | def requirements(fname): 12 | return [line.strip() 13 | for line in open(os.path.join(os.path.dirname(__file__), fname))] 14 | 15 | setup( 16 |
name = "mwcites", 17 | version = mwcites.__version__, 18 | author = "Aaron Halfaker", 19 | author_email = "ahalfaker@wikimedia.org", 20 | description = "A collection of scripts and utilities for extracting " + 21 | "citations to academic literature from Wikipedia's XML " + 22 | "database dumps.", 23 | license = "MIT", 24 | url = "https://github.com/halfak/Extract-scholarly-article-citations-from-Wikipedia", 25 | packages=find_packages(), 26 | entry_points = { 27 | 'console_scripts': [ 28 | 'mwcitations=mwcites.mwcites:main', 29 | 'mwcites=mwcites.mwcites:main' 30 | ], 31 | }, 32 | long_description = read('README.rst'), 33 | install_requires = ['docopt', 'more-itertools', 'mwparserfromhell', 'mwxml', 'mysqltsv'],  # kept in sync with requirements.txt 34 | classifiers=[ 35 | "Programming Language :: Python :: 3", 36 | "Development Status :: 3 - Alpha", 37 | "License :: OSI Approved :: MIT License", 38 | "Intended Audience :: Science/Research", 39 | "Intended Audience :: System Administrators", 40 | "Intended Audience :: Developers", 41 | "Operating System :: OS Independent", 42 | "Topic :: Utilities", 43 | "Topic :: Scientific/Engineering" 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /sql/cites_enwiki_20150602.create.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE cites_enwiki_20150602 ( 2 | page_id INT, 3 | page_title VARBINARY(255), 4 | rev_id INT, 5 | timestamp VARBINARY(20), 6 | type VARCHAR(255), 7 | id VARCHAR(255) 8 | ); 9 | CREATE INDEX type_timestamp ON cites_enwiki_20150602 (type, timestamp); 10 | -------------------------------------------------------------------------------- /sql/month_type_citations.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LEFT(timestamp, 7) AS month, 3 | type, 4 | COUNT(*) AS citations 5 | FROM cites_enwiki_20150602 6 | GROUP BY 1,2; 7 | -------------------------------------------------------------------------------- /utility: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from mwcites import mwcites 3 | 4 | mwcites.main() 5 | --------------------------------------------------------------------------------
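For a sense of how the pieces fit together end to end (``mwxml`` streams the dump, each extractor scans revision text, and ``mysqltsv`` writes the TSV rows), here is a hedged sketch of driving ``extract()`` from Python rather than through the ``mwcitations`` command line; it mirrors ``run()`` in ``mwcites/utilities/extract.py``, and the dump filename is a placeholder::

    import sys

    import mysqltsv

    from mwcites.extractors import doi, pubmed
    from mwcites.utilities.extract import HEADERS, extract

    # Placeholder filename; substitute any pages-meta-history XML dump,
    # optionally bz2-compressed.
    dump_files = ["enwiki-sample-pages-meta-history.xml.bz2"]

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    for page_id, title, rev_id, timestamp, type, id in extract(
            dump_files, extractors=[doi, pubmed]):
        # One row per identifier, stamped with the revision that introduced it.
        writer.write([page_id, title, rev_id, timestamp.long_format(), type, id])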