├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── datasets └── mw_dump_stub.xml ├── demonstrate_doi_extractor_performance.py ├── mwcites ├── __init__.py ├── extractors │ ├── __init__.py │ ├── arxiv.py │ ├── doi.py │ ├── isbn.py │ ├── issn.py │ ├── pubmed.py │ └── tests │ │ ├── __init__.py │ │ ├── test_arxiv.py │ │ ├── test_doi.py │ │ ├── test_isbn.py │ │ ├── test_issn.py │ │ └── test_pubmed.py ├── identifier.py ├── mwcites.py └── utilities │ ├── __init__.py │ ├── extract.py │ └── tests │ ├── __init__.py │ └── test_extract.py ├── requirements.txt ├── setup.py ├── sql ├── cites_enwiki_20150602.create.sql └── month_type_citations.sql └── utility /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *~ 5 | 6 | # Datasets 7 | *.tsv 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Datasets 13 | *.bz2 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | bin/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | 47 | # Mr Developer 48 | .mr.developer.cfg 49 | .project 50 | .pydevproject 51 | 52 | # Rope 53 | .ropeproject 54 | 55 | # Django stuff: 56 | *.log 57 | *.pot 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # Pycharm directories 63 | .idea 64 | venv/ 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Aaron Halfaker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE VERSION README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Extract academic citations from Wikipedia 2 | ========================================= 3 | This project contains a utility for extracting academic citation identifiers. 4 | 5 | **NOTE:** Because one of its dependencies (`Mediawiki-Utilities`_) requires 6 | Python 3, so does mwcites. 7 | 8 | ``pip install mwcites`` 9 | 10 | Usage 11 | ----- 12 | There's really only one utility in this package, called ``mwcitations``. 13 | 14 | :: 15 | 16 | $ mwcitations extract enwiki-20150112-pages-meta-history*.xml*.bz2 > citations.tsv 17 | 18 | 19 | Documentation 20 | ------------- 21 | Documentation is provided by ``$ mwcitations extract -h``. 22 | 23 | :: 24 | 25 | Extracts academic citations from the history of Wikipedia 26 | articles by processing a pages-meta-history XML dump and matching regular 27 | expressions to revision content. 28 | 29 | Currently supported identifiers include: 30 | 31 | * PubMed 32 | * DOI 33 | * ISBN 34 | * arXiv 35 | * ISSN 36 | 37 | Outputs a TSV file with the following fields: 38 | 39 | * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 40 | * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell 41 | * rev_id: The Wikipedia revision where the citation was first added (int), 42 | e.g. 282470030 43 | * timestamp: The timestamp of the revision where the citation was first 44 | added. (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z 45 | * type: The type of identifier, e.g. pmid, pmc, doi, isbn, arxiv or issn 46 | * id: The id of the cited scholarly article (utf-8), 47 | e.g. 10.1183/09031936.00213411 48 | 49 | Usage: 50 | mwcites extract -h | --help 51 | mwcites extract <dump_file>... 52 | 53 | Options: 54 | -h --help Shows this documentation 55 | -------------------------------------------------------------------------------- /datasets/mw_dump_stub.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Wikipedia 4 | enwiki 5 | http://en.wikipedia.org/wiki/Main_Page 6 | MediaWiki 1.25wmf6 7 | first-letter 8 | 9 | Media 10 | Special 11 | 12 | Talk 13 | User 14 | User talk 15 | Wikipedia 16 | Wikipedia talk 17 | File 18 | File talk 19 | MediaWiki 20 | MediaWiki talk 21 | Template 22 | Template talk 23 | Help 24 | Help talk 25 | Category 26 | Category talk 27 | Portal 28 | Portal talk 29 | Book 30 | Book talk 31 | Draft 32 | Draft talk 33 | Education Program 34 | Education Program talk 35 | TimedText 36 | TimedText talk 37 | Module 38 | Module talk 39 | Topic 40 | 41 | 42 | 43 | Nagamaki naomi 44 | 0 45 | 1325004 46 | 47 | 48 | 8801038 49 | 2004-12-25T05:34:36Z 50 | 51 | 4.46.105.106 52 | 53 | This is a doi randomly placed in the text 10.0000/m1 54 | Here's a typo that might be construed as a doi 10.60 people were there. 55 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 56 | The rise and decline of an open collaboration system: How Wikipedia's 57 | reaction to popularity is causing its decline. 58 | American Behavioral Scientist, 59 | 0002764212469365 doi: 10.1177/0002764212469365</ref>.
Hats pants and banana 60 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 61 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 62 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 63 | 10.2387/234310.2347/39423 64 | cqt09qrejym5mdf3h0irtelqd8f46h3 65 | wikitext 66 | text/x-wiki 67 | 68 | 69 | 10730831 70 | 8801038 71 | 2004-12-25T05:38:45Z 72 | 73 | Gadfium 74 | 54381 75 | 76 | 77 | Wikify 78 | This is a doi randomly placed in the text 10.0000/m1 79 | Here's a typo that might be construed as a doi 10.60 people were there. 80 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 81 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 82 | The rise and decline of an open collaboration system: How Wikipedia's 83 | reaction to popularity is causing its decline. 84 | American Behavioral Scientist, 85 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana 86 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 87 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 88 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 89 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 90 | 10.2387/234310.2347/39423 91 | pfjkfb1u54tnkl4exkxge4f5v1mn7cl 92 | wikitext 93 | text/x-wiki 94 | 95 | 96 | 10730832 97 | 10730831 98 | 2004-12-25T05:38:46Z 99 | 100 | Gadfium 101 | 54381 102 | 103 | 104 | Wikify 105 | This is a doi randomly placed in the text 10.0000/m1 106 | Here's a typo that might be construed as a doi 10.60 people were there. 107 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 108 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 109 | The rise and decline of an open collaboration system: How Wikipedia's 110 | reaction to popularity is causing its decline. 111 | American Behavioral Scientist, 112 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana 113 | {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}} 114 | [http://arxiv.org/abs/0706.0004v1] 115 | [https://arxiv.org/abs/0706.0005v1] 116 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 117 | 10.2387/234310.2347/39423 118 | pfjkfb1u54sksl4exkxge4f5v1mn7cl 119 | wikitext 120 | text/x-wiki 121 | 122 | 123 | 10730833 124 | 10730832 125 | 2004-12-25T05:38:47Z 126 | 127 | Gadfium 128 | 54381 129 | 130 | Wikify 131 | This is a doi randomly placed in the text 10.0000/m1 132 | Here's a typo that might be construed as a doi 10.60 people were there. 133 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 134 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 135 | The rise and decline of an open collaboration system: How Wikipedia's 136 | reaction to popularity is causing its decline. 137 | American Behavioral Scientist, 138 | 0002764212469365 doi: 10.1177/0002764212469365</ref>. 
Hats pants and banana 139 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 140 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 141 | {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}} 142 | [https://arxiv.org/abs/0706.0005v1] 143 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 144 | 10.2387/234310.2347/39423 145 | pfjkfb1u54tnksksxkxgehhgv1mn7cl 146 | wikitext 147 | text/x-wiki 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /demonstrate_doi_extractor_performance.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from mw import api  # requires the mediawiki-utilities package 4 | 5 | from mwcites.extractors import doi 6 | 7 | session = api.Session("https://en.wikipedia.org/w/api.php", 8 | user_agent="Demo doi extractor") 9 | 10 | revisions = session.revisions.query(titles={"Psychotherapy"}, 11 | properties={'content'}) 12 | lots = next(revisions)['*'] 13 | print("Text with lots of DOIs has {0} characters".format(len(lots))) 14 | 15 | revisions = session.revisions.query(titles={"Waffle"}, 16 | properties={'content'}) 17 | few = next(revisions)['*'] 18 | print("Text with few DOIs has {0} characters".format(len(few))) 19 | 20 | 21 | start = time.time() 22 | for i in range(50): 23 | ids = set(doi.extract(lots))  # doi.extract is an alias for doi.extract_search 24 | ids = set(doi.extract(few)) 25 | print("Default (search) strategy: {0}".format(time.time() - start)) 26 | 27 | start = time.time() 28 | for i in range(50): 29 | ids = set(doi.extract_mwp(lots))  # requires un-commenting extract_mwp in doi.py 30 | ids = set(doi.extract_mwp(few)) 31 | print("MWP strategy: {0}".format(time.time() - start)) 32 | 33 | 34 | start = time.time() 35 | for i in range(50): 36 | ids = set(doi.extract_island(lots)) 37 | ids = set(doi.extract_island(few)) 38 | print("Island parser strategy: {0}".format(time.time() - start)) 39 | 40 | start = time.time() 41 | for i in range(50): 42 | ids = set(doi.extract_search(lots)) 43 | ids = set(doi.extract_search(few)) 44 | print("Search parser strategy: {0}".format(time.time() - start)) 45 | -------------------------------------------------------------------------------- /mwcites/__init__.py: -------------------------------------------------------------------------------- 1 | from .identifier import Identifier 2 | 3 | __version__ = "0.2.0" 4 | -------------------------------------------------------------------------------- /mwcites/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/extractors/__init__.py -------------------------------------------------------------------------------- /mwcites/extractors/arxiv.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..identifier import Identifier 4 | 5 | # From http://arxiv.org/help/arxiv_identifier 6 | old_id = r"-?(?P<old_id>([a-z]+(.[a-z]+)/)?[0-9]{4}[0-9]+)" 7 | new_id = r"(?P<new_id>[0-9]{4}.[0-9]+)(v[0-9]+)?"
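# Editor's sketch (hedged) of what the two patterns above are meant to match,
# based on the arXiv identifier help page cited above and this module's tests:
#   old scheme: "math.GT/0309001" or "hep-th/9901001" -- note the unescaped
#   dot in (.[a-z]+) also matches the "-" separator in archives like "hep-th".
#   new scheme: "0706.0001", optionally followed by a version suffix like "v1".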
8 | 9 | prefixes = [r"arxiv\s*=\s*", r"//arxiv\.org/(abs/)?", r"arxiv:\s?"] 10 | 11 | ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) + 12 | r"({0}|{1})".format(old_id, new_id), re.I|re.U) 13 | 14 | def extract(text): 15 | for match in ARXIV_RE.finditer(text): 16 | id = match.group('new_id') or match.group("old_id") 17 | yield Identifier("arxiv", id.lower()) 18 | -------------------------------------------------------------------------------- /mwcites/extractors/doi.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | 4 | from more_itertools import peekable 5 | 6 | from ..identifier import Identifier 7 | 8 | DOI_START_RE = re.compile(r'10\.[0-9]{4,}/') 9 | 10 | HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 11 | 'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote', 12 | 'pre'] 13 | 14 | TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + r')(\s[^>\n\r]+)?>', re.I) 15 | 16 | ''' 17 | DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)') 18 | 19 | def extract_regex(text): 20 | for match in DOI_RE.finditer(text): 21 | id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") 22 | yield Identifier("doi", id) 23 | 24 | import mwparserfromhell as mwp 25 | def extract_mwp(text): 26 | no_tags = mwp.parse(text).strip_code() 27 | for match in DOI_RE.finditer(no_tags): 28 | id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") 29 | yield Identifier("doi", id) 30 | ''' 31 | 32 | LEXICON = [ 33 | (DOI_START_RE.pattern, 'doi_start'), 34 | (r'\(', 'open_paren'), 35 | (r'\)', 'close_paren'), 36 | (r'\[', 'open_bracket'), 37 | (r'\]', 'close_bracket'), 38 | (r'<!--', 'comment_start'), 39 | (r'-->', 'comment_end'), 40 | (TAGS_RE.pattern, 'tag'), 41 | (r'<', 'open_angle'), 42 | (r'>', 'close_angle'), 43 | (r'\{', 'open_curly'), 44 | (r'\}', 'close_curly'), 45 | (r'\|', 'pipe'), 46 | (r'[,\.;!]', 'punct'), 47 | (r'[\?#]', 'url_end'), 48 | (r'[\n\r]+', 'break'), 49 | (r'\s+', 'whitespace'), 50 | (r'\w+', 'word'), 51 | (r'.', 'etc') 52 | ] 53 | 54 | def extract_island(text): 55 | tokens = tokenize_finditer(text, LEXICON) 56 | tokens = peekable(tokens) 57 | 58 | while tokens.peek(None) is not None: 59 | 60 | if tokens.peek()[0] == 'doi_start': 61 | yield Identifier('doi', read_doi(tokens)) 62 | 63 | next(tokens) 64 | 65 | 66 | def tokenize_finditer(text, lexicon=LEXICON): 67 | pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) 68 | for pattern, name in lexicon) 69 | 70 | group_regex = re.compile(pattern, re.I|re.U|re.M) 71 | 72 | for match in group_regex.finditer(text): 73 | yield match.lastgroup, match.group(0) 74 | 75 | 76 | """ 77 | def tokenize_scanner(text, lexicon=LEXICON): 78 | scanner = re.Scanner(lexicon) 79 | tokens, remainder = scanner.scan(text) 80 | return tokens 81 | """ 82 | 83 | #from mwcites.extractors.doi import tokenize_scan 84 | #list(tokenize_scan("foo bar baz.{}")) 85 | 86 | def read_doi(tokens): 87 | assert tokens.peek()[0] == 'doi_start' 88 | 89 | depth = defaultdict(lambda: 0) 90 | 91 | doi_buffer = [next(tokens)[1]] 92 | 93 | while tokens.peek(None) is not None: 94 | name, match = tokens.peek() 95 | 96 | if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe', 97 | 'comment_start', 'comment_end'): 98 | break 99 | elif name == 'open_bracket': 100 | depth['bracket'] += 1 101 | doi_buffer.append(next(tokens)[1]) 102 | elif name == 'open_curly': 103 | depth['curly'] += 1 104 | doi_buffer.append(next(tokens)[1]) 105 | elif name == 'close_bracket': 106 | if depth['bracket'] > 0: 107 |
depth['bracket'] -= 1 108 | doi_buffer.append(next(tokens)[1]) 109 | else: 110 | break 111 | elif name == 'close_curly': 112 | if depth['curly'] > 0: 113 | depth['curly'] -= 1 114 | doi_buffer.append(next(tokens)[1]) 115 | else: 116 | break 117 | else: 118 | doi_buffer.append(next(tokens)[1]) 119 | 120 | 121 | # Do not return a doi with punctuation at the end 122 | return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer)) 123 | 124 | 125 | 126 | def tokenize_search(text, start, lexicon=LEXICON): 127 | pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) 128 | for pattern, name in lexicon) 129 | 130 | group_regex = re.compile(pattern, re.I|re.U) 131 | 132 | match = group_regex.search(text, start) 133 | while match is not None: 134 | yield match.lastgroup, match.group(0) 135 | match = group_regex.search(text, match.span()[1]) 136 | 137 | def extract_search(text, lexicon=LEXICON): 138 | 139 | last_end = 0 140 | for match in DOI_START_RE.finditer(text): 141 | if match.span()[0] > last_end: 142 | tokens = tokenize_search(text, match.span()[0], lexicon=lexicon) 143 | tokens = peekable(tokens) 144 | doi = read_doi(tokens) 145 | last_end = match.span()[0] + len(doi) 146 | yield Identifier('doi', doi) 147 | else: 148 | last_end = max(match.span()[1], last_end) 149 | 150 | extract = extract_search # Setting the default to the best method 151 | -------------------------------------------------------------------------------- /mwcites/extractors/isbn.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..identifier import Identifier 3 | 4 | # Also correctly parses malformed inputs such as below: 5 | # isbn=2 906700-09-6 (notice the space instead of a hyphen) or 6 | # isbn=2 10 004179 7 (notice spaces instead of hyphens) 7 | # {{ISBN|978-83-7435-239-0​}} (notice pipe instead of equals) 8 | ISBN_RE = re.compile(r'isbn\s?[=|]?\s?([\d]+([\d\s\-]+)[\dXx])', re.I) 9 | 10 | 11 | def extract(text): 12 | for match in ISBN_RE.finditer(text): 13 | yield Identifier( 14 | 'isbn', 15 | match.group(1).replace('-', '').replace(' ', '').strip() 16 | ) 17 | -------------------------------------------------------------------------------- /mwcites/extractors/issn.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..identifier import Identifier 3 | 4 | ISSN_RE = re.compile(r'issn\s?=?\s?([0-9]{4}\-[0-9]{3}([0-9]|X))', re.I) 5 | 6 | def extract(text): 7 | for match in ISSN_RE.finditer(text): 8 | yield Identifier( 9 | 'issn', 10 | match.group(1).replace('-', '').replace(' ', '').strip() 11 | ) 12 | -------------------------------------------------------------------------------- /mwcites/extractors/pubmed.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..identifier import Identifier 4 | 5 | TEMPLATE_RE = re.compile(r"\b(pmid|pmc)\s*=\s*(pmc)?([0-9]+)\b", re.I) 6 | 7 | PMURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + 8 | r"/pubmed/([0-9]+)\b", re.I) 9 | PMCURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + 10 | r"/pmc/articles/PMC([0-9]+)\b", re.I) 11 | 12 | def extract(text): 13 | text = str(text or "") 14 | 15 | for match in TEMPLATE_RE.finditer(text): 16 | yield Identifier(match.group(1).lower(), match.group(3)) 17 | 18 | for match in PMURL_RE.finditer(text): 19 | yield Identifier("pmid", match.group(1)) 20 | 21 | for match in PMCURL_RE.finditer(text): 22 | yield Identifier("pmc", match.group(1)) 23 |
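All of the extractor modules above share one minimal interface: an ``extract(text)`` generator that yields ``Identifier(type, id)`` namedtuples. That shared shape is what lets ``mwcites/utilities/extract.py`` apply any subset of them to revision text interchangeably. A hedged sketch of calling them directly, on a wikitext snippet invented purely for illustration::

    from mwcites.extractors import arxiv, doi, isbn, issn, pubmed

    # Invented wikitext carrying one identifier of each style.
    text = """
    {{cite|...|doi=10.1098/rspb.2008.1131|pmid=10559875|isbn=0-13-054091-9}}
    See also arxiv:0706.0002v1 and issn=0006-2510.
    """

    for extractor in (doi, pubmed, isbn, arxiv, issn):
        for identifier in extractor.extract(text):
            # e.g. "doi 10.1098/rspb.2008.1131", "pmid 10559875", ...
            print(identifier.type, identifier.id)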
-------------------------------------------------------------------------------- /mwcites/extractors/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/extractors/tests/__init__.py -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_arxiv.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from nose.tools import eq_ 4 | 5 | from .. import arxiv 6 | from ...identifier import Identifier 7 | 8 | INPUT_TEXT = """ 9 | This is a doi randomly placed in the text 10.0000/m1 10 | Here's a typo that might be construed as a doi 10.60 people were there. 11 | {{cite|...|arxiv=0706.0001v1|pmid=10559875}} 12 | Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 13 | The rise and decline of an open collaboration system: How Wikipedia’s 14 | reaction to popularity is causing its decline. 15 | American Behavioral Scientist, 16 | 0002764212469365 arxiv:0706.0002v1. Hats pants and banana 17 | [http://arxiv.org/0706.0003] 18 | [http://arxiv.org/abs/0706.0004v1] 19 | [https://arxiv.org/abs/0706.0005v1] 20 | [https://arxiv.org/abs/math.GT/0309001] 21 | [https://arxiv.org/abs/-math.gs/0309002] 22 | {{cite|...|arxiv=foobar.hats/0101003|issue=1656}} 23 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 24 | 10.2387/234310.2347/39423 25 | 27 | """ 28 | EXPECTED = [ 29 | Identifier('arxiv', "0706.0001"), 30 | Identifier('arxiv', "0706.0002"), 31 | Identifier('arxiv', "0706.0003"), 32 | Identifier('arxiv', "0706.0004"), 33 | Identifier('arxiv', "0706.0005"), 34 | Identifier('arxiv', "math.gt/0309001"), 35 | Identifier('arxiv', "math.gs/0309002"), 36 | Identifier('arxiv', "foobar.hats/0101003") 37 | ] 38 | 39 | def test_extract(): 40 | ids = list(arxiv.extract(INPUT_TEXT)) 41 | pprint.pprint(ids) 42 | pprint.pprint(EXPECTED) 43 | eq_(ids, EXPECTED) 44 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_doi.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from nose.tools import eq_ 4 | 5 | from .. import doi 6 | from ...identifier import Identifier 7 | 8 | INPUT_TEXT = """ 9 | This is a doi randomly placed in the text 10.0000/m1 10 | Here's a typo that might be construed as a doi 10.60 people were there. 11 | {{cite|...|doi=10.0000/m2|pmid=10559875}} 12 | <ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). 13 | The rise and decline of an open collaboration system: How Wikipedia’s 14 | reaction to popularity is causing its decline. 15 | American Behavioral Scientist, 16 | 0002764212469365 doi: 10.1177/0002764212469365</ref>.
Hats pants and banana 17 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp] 18 | [http://dx.doi.org/10.1170/foo<bar>(herp)derp[waffles]] 19 | {{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} 20 | http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= 21 | 10.2387/234310.2347/39423 22 | <!-- A doi in an html comment 23 | 10.2387/234310.2347/39423 --> 24 | """ 25 | EXPECTED = [ 26 | Identifier('doi', "10.0000/m1"), 27 | Identifier('doi', "10.0000/m2"), 28 | Identifier('doi', "10.1177/0002764212469365"), 29 | Identifier('doi', "10.1170/foo<bar>(herp)derp"), 30 | Identifier('doi', "10.1170/foo<bar>(herp)derp[waffles]"), 31 | Identifier('doi', "10.1098/rspb.2008.1131"), 32 | Identifier('doi', "10.2387/234310.2347/39423"), 33 | Identifier('doi', "10.2387/234310.2347/39423") 34 | ] 35 | 36 | """ 37 | def test_extract_regex(): 38 | ids = list(doi.extract_regex(INPUT_TEXT)) 39 | pprint.pprint(ids) 40 | pprint.pprint(EXPECTED) 41 | eq_(ids, EXPECTED) 42 | 43 | def test_extract_mwp(): 44 | ids = list(doi.extract_mwp(INPUT_TEXT)) 45 | pprint.pprint(ids) 46 | pprint.pprint(EXPECTED) 47 | eq_(ids, EXPECTED) 48 | """ 49 | 50 | def test_extract(): 51 | ids = list(doi.extract(INPUT_TEXT)) 52 | pprint.pprint(ids) 53 | pprint.pprint(EXPECTED) 54 | eq_(ids, EXPECTED) 55 | 56 | def test_extract_island(): 57 | ids = list(doi.extract_island(INPUT_TEXT)) 58 | pprint.pprint(ids) 59 | pprint.pprint(EXPECTED) 60 | eq_(ids, EXPECTED) 61 | 62 | def test_extract_search(): 63 | ids = list(doi.extract_search(INPUT_TEXT)) 64 | pprint.pprint(ids) 65 | pprint.pprint(EXPECTED) 66 | #pprint.pprint(list(doi.tokenize_finditer(INPUT_TEXT))) 67 | eq_(ids, EXPECTED) 68 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_isbn.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from nose.tools import eq_ 3 | 4 | from .. import isbn 5 | from ...identifier import Identifier 6 | 7 | INPUT_TEXT = """ 8 | | isbn=2 906700-09-6 9 | | isbn=2 10 004179 7 10 | | publisher=Academic Press | isbn=0124366031 11 | | isbn=3540206310 12 | | accessdate=2008-02-05 | isbn=0-618-34342-3 13 | | isbn=978-0-140-27666-4 14 | | isbn = 0-13-054091-9 15 | | isbn=0195305736 }}</ref> schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset 16 | | ISBN=978-3-7046-5112-9 17 | * Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5. 18 | * Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&sid=f55727] Frankf. Rundschau 26. April 2006) 19 | * Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2. 20 | <ref name="flos1">{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8.
| Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}</ref> 21 | Bei einer [[Sprungtemperatur]] von 1,2&nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].<ref>{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}</ref> 22 | * {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}} 23 | * ISBN 0 902 198 84 X 24 | * ISBN 1-57488-530-8 25 | * {{ISBN|978-83-7435-239-0​}} 26 | """ 27 | 28 | 29 | EXPECTED = [ 30 | Identifier('isbn', '2906700096'), 31 | Identifier('isbn', '2100041797'), 32 | Identifier('isbn', '0124366031'), 33 | Identifier('isbn', '3540206310'), 34 | Identifier('isbn', '0618343423'), 35 | Identifier('isbn', '9780140276664'), 36 | Identifier('isbn', '0130540919'), 37 | Identifier('isbn', '0195305736'), 38 | Identifier('isbn', '9783704651129'), 39 | Identifier('isbn', '3886807525'), 40 | Identifier('isbn', '3720527735'), 41 | Identifier('isbn', '9783894726652'), 42 | Identifier('isbn', '3519264242'), 43 | Identifier('isbn', '9783642017346'), 44 | Identifier('isbn', '0130540919'), 45 | Identifier('isbn', '090219884X'), 46 | Identifier('isbn', '1574885308'), 47 | Identifier('isbn', '9788374352390'), 48 | ] 49 | 50 | def test_extract(): 51 | ids = list(isbn.extract(INPUT_TEXT)) 52 | pprint.pprint(ids) 53 | pprint.pprint(EXPECTED) 54 | eq_(ids, EXPECTED) 55 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_issn.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from nose.tools import eq_ 3 | 4 | from .. import issn 5 | from ...identifier import Identifier 6 | 7 | INPUT_TEXT = """ 8 | {{cite book|work=Billboard|title=Sinatra FBI Files Opened|first=Bill|last=Holland|url=https://books.google.com/books?id=KQoEAAAAMBAJ&dq=Bill+Holland+1998+Billboard+page+10&q=walter+winchell#v=snippet&q=walter%20winchell&f=false|date=December 19, 1998|page=10|issn=0006-2510}} 9 | """ 10 | 11 | 12 | EXPECTED = [ 13 | Identifier('issn', '00062510'), 14 | ] 15 | 16 | def test_extract(): 17 | ids = list(issn.extract(INPUT_TEXT)) 18 | pprint.pprint(ids) 19 | pprint.pprint(EXPECTED) 20 | eq_(ids, EXPECTED) 21 | -------------------------------------------------------------------------------- /mwcites/extractors/tests/test_pubmed.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | 3 | from .. import pubmed 4 | from ...identifier import Identifier 5 | 6 | def test_extract(): 7 | 8 | text = """ 9 | This is some text with a template cite. {{cite|...|...|pmid=1}}. 10 | This is some text with a template cite. {{cite|...|...|pmid = 2|...}}. 11 | This is some text with a template cite. {{cite|...|...|pmc = 3|...}}. 12 | This is some text with a template cite. {{cite|...|...|pmc = pmc4|...}}. 
13 | This is some text with a link [http://www.ncbi.nlm.nih.gov/pubmed/5 ID] 14 | Another link [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6 ID] 15 | """ 16 | ids = list(pubmed.extract(text)) 17 | expected = [ 18 | Identifier('pmid', "1"), 19 | Identifier('pmid', "2"), 20 | Identifier('pmc', "3"), 21 | Identifier('pmc', "4"), 22 | Identifier('pmid', "5"), 23 | Identifier('pmc', "6") 24 | ] 25 | print(ids) 26 | print(expected) 27 | eq_(ids, expected) 28 | -------------------------------------------------------------------------------- /mwcites/identifier.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Identifier = namedtuple("Identifier", ['type', 'id']) 4 | -------------------------------------------------------------------------------- /mwcites/mwcites.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script provides access to a set of utilities for processing academic 3 | citations in Wikipedia. 4 | 5 | Right now, there's only one utility, but there will be more to come. 6 | 7 | * extract -- Extracts citations from an XML database dump 8 | 9 | Usage: 10 | mwcites (-h | --help) 11 | mwcites <utility> [-h | --help] 12 | 13 | Options: 14 | -h | --help Shows this documentation 15 | <utility> The name of the utility to run 16 | """ 17 | import sys 18 | import traceback 19 | from importlib import import_module 20 | 21 | import docopt 22 | 23 | 24 | USAGE = """Usage: 25 | mwcites (-h | --help) 26 | mwcites <utility> [-h | --help]\n""" 27 | 28 | 29 | def main(): 30 | 31 | if len(sys.argv) < 2: 32 | sys.stderr.write(USAGE) 33 | sys.exit(1) 34 | elif sys.argv[1] in ("-h", "--help"): 35 | sys.stderr.write(__doc__ + "\n") 36 | sys.exit(1) 37 | elif sys.argv[1][:1] == "-": 38 | sys.stderr.write(USAGE) 39 | sys.exit(1) 40 | 41 | module_name = sys.argv[1] 42 | try: 43 | module = import_module(".utilities." + module_name, package="mwcites") 44 | except ImportError: 45 | sys.stderr.write(traceback.format_exc()) 46 | sys.stderr.write("Could not find utility {0}.\n".format(module_name)) 47 | sys.exit(1) 48 | 49 | module.main(sys.argv[2:]) 50 | -------------------------------------------------------------------------------- /mwcites/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/utilities/__init__.py -------------------------------------------------------------------------------- /mwcites/utilities/extract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extracts academic citations from the history of Wikipedia articles 3 | by processing a pages-meta-history XML dump and matching regular expressions 4 | to revision content. 5 | 6 | Currently supported identifiers include: 7 | 8 | * PubMed 9 | * DOI 10 | * ISBN 11 | * arXiv 12 | * ISSN 13 | 14 | Outputs a TSV file with the following fields: 15 | 16 | * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 17 | * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell 18 | * rev_id: The Wikipedia revision where the citation was first added (int), 19 | e.g. 282470030 20 | * timestamp: The timestamp of the revision where the citation was first added. 21 | (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z 22 | * type: The type of identifier, e.g.
pmid, pmc, doi, isbn, arxiv or issn 23 | * id: The id of the cited scholarly article (utf-8), 24 | e.g. 10.1183/09031936.00213411 25 | 26 | Usage: 27 | extract -h | --help 28 | extract <dump_file>... [--extractor=<path>...] 29 | 30 | Options: 31 | -h --help Shows this documentation 32 | <dump_file> The path to a set of dump files to process. If no 33 | files are specified, <stdin> will be read. 34 | --extractor=<path> The class path to a set of extractors to apply 35 | [default: <all>] 36 | """ 37 | import sys 38 | from importlib import import_module 39 | 40 | import docopt 41 | import mwxml 42 | 43 | import mysqltsv 44 | 45 | from ..extractors import arxiv, doi, isbn, pubmed, issn 46 | 47 | ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv, issn] 48 | 49 | HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") 50 | 51 | def main(argv=None): 52 | args = docopt.docopt(__doc__, argv=argv) 53 | dump_files = args['<dump_file>'] 54 | 55 | if args['--extractor'] == ['<all>']: 56 | extractors = ALL_EXTRACTORS 57 | else: 58 | extractors = [import_from_path(path.lower()) 59 | for path in args['--extractor']] 60 | 61 | run(dump_files, extractors) 62 | 63 | def run(dump_files, extractors): 64 | writer = mysqltsv.Writer(sys.stdout, headers=HEADERS) 65 | 66 | cites = extract(dump_files, extractors=extractors) 67 | for page_id, title, rev_id, timestamp, type, id in cites: 68 | writer.write([page_id, title, rev_id, timestamp.long_format(), type, id]) 69 | 70 | def extract(dump_files, extractors=ALL_EXTRACTORS): 71 | """ 72 | Extracts cites from a set of `dump_files`. 73 | 74 | :Parameters: 75 | dump_files : str | `file` 76 | A set of MediaWiki XML dump files 77 | (expects: pages-meta-history) 78 | extractors : `list`(`extractor`) 79 | A list of extractors to apply to the text 80 | 81 | :Returns: 82 | `iterable` -- a generator of extracted cites 83 | 84 | """ 85 | # Dump processor function 86 | def process_dump(dump, path): 87 | for page in dump: 88 | if page.namespace != 0: continue 89 | else: 90 | for cite in extract_cite_history(page, extractors): 91 | yield cite 92 | 93 | # Map call 94 | return mwxml.map(process_dump, dump_files) 95 | 96 | def extract_cite_history(page, extractors): 97 | """ 98 | Extracts cites from the history of a `page` (`mwxml.Page`). 99 | 100 | :Parameters: 101 | page : `iterable`(`mwxml.Revision`) 102 | The page to extract cites from 103 | extractors : `list`(`extractor`) 104 | A list of extractors to apply to the text 105 | 106 | :Returns: 107 | `iterable` -- a generator of extracted cites 108 | 109 | """ 110 | appearances = {} # For tracking the first appearance of an ID 111 | ids = set() # For holding onto the ids in the last revision. 112 | for revision in page: 113 | ids = set(extract_ids(revision.text, extractors)) 114 | 115 | # For each ID, check to see if we have seen it before 116 | for id in ids: 117 | if id not in appearances: 118 | appearances[id] = (revision.id, revision.timestamp) 119 | 120 | for id in ids: #For the ids in the last version of the page 121 | rev_id, timestamp = appearances[id] 122 | yield (page.id, page.title, rev_id, timestamp, id.type, id.id) 123 | 124 | def extract_ids(text, extractors): 125 | """ 126 | Uses `extractors` to extract citation identifiers from a text.
127 | 128 | :Parameters: 129 | text : str 130 | The text to process 131 | extractors : `list`(`extractor`) 132 | A list of extractors to apply to the text 133 | 134 | :Returns: 135 | `iterable` -- a generator of extracted identifiers 136 | """ 137 | for extractor in extractors: 138 | for id in extractor.extract(text): 139 | yield id 140 | 141 | def import_from_path(path): 142 | """ 143 | Imports a specific attribute from a module based on a class path. 144 | 145 | :Parameters: 146 | path : str 147 | A dot delimited string representing the import path of the desired 148 | object. 149 | 150 | :Returns: 151 | object -- An imported object 152 | """ 153 | parts = path.split(".") 154 | module_path = ".".join(parts[:-1]) 155 | attribute_name = parts[-1] 156 | 157 | module = import_module(module_path) 158 | 159 | attribute = getattr(module, attribute_name) 160 | 161 | return attribute 162 | -------------------------------------------------------------------------------- /mwcites/utilities/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mediawiki-utilities/python-mwcites/2adf4b669cdbeef7d2a0ef168dd7fc26fadb6922/mwcites/utilities/tests/__init__.py -------------------------------------------------------------------------------- /mwcites/utilities/tests/test_extract.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from mwtypes import Timestamp 4 | from nose.tools import eq_ 5 | 6 | from ..extract import extract_cite_history 7 | from ...identifier import Identifier 8 | 9 | 10 | def test_extract_cite_history(): 11 | FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) 12 | 13 | FakeExtractor = namedtuple("Extractor", ['extract']) 14 | 15 | class FakePage: 16 | def __init__(self, id, title): 17 | self.id = id 18 | self.title = title 19 | def __iter__(self): 20 | return iter([ 21 | FakeRevision(1, Timestamp(1), "id1 id2"), 22 | FakeRevision(2, Timestamp(2), "id1 id3"), 23 | FakeRevision(3, Timestamp(3), "id1 id2 id3"), 24 | FakeRevision(4, Timestamp(4), "id1 id2 id4"), 25 | FakeRevision(5, Timestamp(5), "id1 id2 id4"), 26 | ]) 27 | 28 | fake_page = FakePage(1, "Title") 29 | 30 | def extract(text): 31 | return (Identifier('fake', id) for id in text.split(" ")) 32 | extractor = FakeExtractor(extract) 33 | 34 | expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), 35 | (1, "Title", 1, Timestamp(1), "fake", "id2"), 36 | (1, "Title", 4, Timestamp(4), "fake", "id4")] 37 | 38 | citations = list(extract_cite_history(fake_page, [extractor])) 39 | eq_(len(citations), len(expected)) 40 | for cite in extract_cite_history(fake_page, [extractor]): 41 | assert cite in expected 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt 2 | more-itertools 3 | mwparserfromhell 4 | mwxml 5 | mysqltsv 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | import mwcites 6 | 7 | 8 | def read(fname): 9 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 10 | 11 | def requirements(fname): 12 | return [line.strip() 13 | for line in open(os.path.join(os.path.dirname(__file__), fname))] 14 | 15 | setup( 16 |
name = "mwcites", 17 | version = mwcites.__version__, 18 | author = "Aaron Halfaker", 19 | author_email = "ahalfaker@wikimedia.org", 20 | description = "A collection of scripts and utilities for extracting " + 21 | "citations to academic literature from Wikipedia's XML " + 22 | "database dumps.", 23 | license = "MIT", 24 | url = "https://github.com/halfak/Extract-scholarly-article-citations-from-Wikipedia", 25 | packages=find_packages(), 26 | entry_points = { 27 | 'console_scripts': [ 28 | 'mwcitations=mwcites.mwcites:main', 29 | 'mwcites=mwcites.mwcites:main' 30 | ], 31 | }, 32 | long_description = read('README.rst'), 33 | install_requires = ['docopt', 'more-itertools', 'mwparserfromhell', 'mwxml', 'mysqltsv'],  # kept in sync with requirements.txt 34 | classifiers=[ 35 | "Programming Language :: Python :: 3", 36 | "Development Status :: 3 - Alpha", 37 | "License :: OSI Approved :: MIT License", 38 | "Intended Audience :: Science/Research", 39 | "Intended Audience :: System Administrators", 40 | "Intended Audience :: Developers", 41 | "Operating System :: OS Independent", 42 | "Topic :: Utilities", 43 | "Topic :: Scientific/Engineering" 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /sql/cites_enwiki_20150602.create.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE cites_enwiki_20150602 ( 2 | page_id INT, 3 | page_title VARBINARY(255), 4 | rev_id INT, 5 | timestamp VARBINARY(20), 6 | type VARCHAR(255), 7 | id VARCHAR(255) 8 | ); 9 | CREATE INDEX type_timestamp ON cites_enwiki_20150602 (type, timestamp); 10 | -------------------------------------------------------------------------------- /sql/month_type_citations.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LEFT(timestamp, 7) AS month, 3 | type, 4 | COUNT(*) AS citations 5 | FROM cites_enwiki_20150602 6 | GROUP BY 1,2; 7 | -------------------------------------------------------------------------------- /utility: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from mwcites import mwcites 3 | 4 | mwcites.main() 5 | --------------------------------------------------------------------------------
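For a sense of how the pieces fit together end to end (``mwxml`` streams the dump, each extractor scans revision text, and ``mysqltsv`` writes the TSV rows), here is a hedged sketch of driving ``extract()`` from Python rather than through the ``mwcitations`` command line; it mirrors ``run()`` in ``mwcites/utilities/extract.py``, and the dump filename is a placeholder::

    import sys

    import mysqltsv

    from mwcites.extractors import doi, pubmed
    from mwcites.utilities.extract import HEADERS, extract

    # Placeholder filename; substitute any pages-meta-history XML dump,
    # optionally bz2-compressed.
    dump_files = ["enwiki-sample-pages-meta-history.xml.bz2"]

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    for page_id, title, rev_id, timestamp, type, id in extract(
            dump_files, extractors=[doi, pubmed]):
        # One row per identifier, stamped with the revision that introduced it.
        writer.write([page_id, title, rev_id, timestamp.long_format(), type, id])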