├── screenshot_map.png
├── README.MD
├── layout_scanner.py
└── analyze_papers.py
/screenshot_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaks6/citation_map/HEAD/screenshot_map.png
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # Create a Citation Graph based on Simplistic Text Analysis
2 |
3 | _Inspired by A.R. Siders' R Script from this [ResearchGate question]( https://www.researchgate.net/post/Is_there_any_recommended_software_to_visualise_articles_papers_references_when_conducting_a_systematic_review_or_meta-analysis )_
4 |
5 | _Based on dpapathanasiou's [example script for pdfminer](https://github.com/dpapathanasiou/pdfminer-layout-scanner)_
6 |
7 | ## Takes Zotero .CSV article collections and creates Gephi-compatible Edges and Nodes files based on citations
8 |
9 |
10 | ![screenshot_map](screenshot_map.png)
11 |
12 | ### Principle:
13 | * Let A be a set of known articles
14 | * For any a in A, let _title_a_ be its title and _text_a_ be its text content
15 | * For any x, y in A with x != y:
16 |   * cites(x, y) is true if _title_y_ appears in _text_x_
17 |
18 | For the above to work, we apply some text normalization (lowercasing, removing punctuation, digits, whitespace and special characters) and assume that
19 | _title_y_ only appears in _text_x_ when x actually cites y, i.e. when the title shows up in the references section.
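
A rough sketch of that check in Python (illustrative only; `normalize` and `cites` are made-up names, the real implementation is `pre_process()` and `find_citations()` in `analyze_papers.py`):

```python
import re
import string

def normalize(text):
    # lowercase, drop punctuation and digits, keep single-spaced letter sequences
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(re.findall(r'[a-z]+', text))

def cites(text_x, title_y):
    # whitespace is stripped so a title broken across lines still matches
    return normalize(title_y).replace(' ', '') in normalize(text_x).replace(' ', '')
```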
20 |
21 | ### Usage:
22 | 1. Export your list of articles as .csv from Zotero (the articles should have file attachments)
23 | 2. Run `analyze_papers.py zotero_file.csv`
24 | 3. The script produces two files, Edges_titles.csv and Nodes_titles.csv, in the folder "gephi" (see the example layout below)
25 | 4. Load them into [Gephi](https://gephi.org) with "Load Spreadsheet"
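
Both output files use the `--delimiter` character (tab by default). `Nodes_titles.csv` carries the columns `Id`, `Label`, `Author` and `PrettyName`; `Edges_titles.csv` has one row per detected citation, with the normalized titles as ids. The headers come from the script, the rows below are made-up examples (tab-separated in the real file):

```
Source                      Target                      Weight
managed coastal retreat     sea level rise adaptation   1
sea level rise adaptation   coastal flood modelling     1
```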
26 |
27 |
28 | ## Notes
29 | * Tested with Python 3
30 | * Uses the library [pdfminer](https://pypi.org/project/pdfminer/)
31 | * You can set the number of worker processes used to parse the PDFs with the `--processes` parameter (default: 4), e.g. `python3 analyze_papers.py zotero_file.csv --processes 8`
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/layout_scanner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import os
5 | from binascii import b2a_hex
6 |
7 |
8 | ###
9 | ### pdf-miner requirements
10 | ###
11 |
12 | from pdfminer.pdfparser import PDFParser
13 | from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
14 | from pdfminer.pdfpage import PDFPage
15 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
16 | from pdfminer.converter import PDFPageAggregator
17 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar
18 |
19 | def with_pdf (pdf_doc, fn, pdf_pwd, *args):
20 | """Open the pdf document, and apply the function, returning the results"""
21 | result = None
22 | try:
23 | # open the pdf file
24 | fp = open(pdf_doc, 'rb')
25 | # create a parser object associated with the file object
26 | parser = PDFParser(fp)
27 |         # create a PDFDocument object that stores the document structure
28 |         doc = PDFDocument(parser, password=pdf_pwd)
29 |         # connect the parser and document objects
30 |         parser.set_document(doc)
31 |         # the password is passed to PDFDocument above; older pdfminer
32 |         # versions used a separate doc.initialize(pdf_pwd) call instead
33 |
34 |
35 | if doc.is_extractable:
36 | # apply the function and return the result
37 | result = fn(doc, *args)
38 |
39 | # close the pdf file
40 | fp.close()
41 | except IOError:
42 | print("[!] Error opening file in with_pdf()", file=sys.stderr)
43 | # the file doesn't exist or similar problem
44 | pass
45 | return result
46 |
47 |
48 | ###
49 | ### Table of Contents
50 | ###
51 |
52 | def _parse_toc (doc):
53 | """With an open PDFDocument object, get the table of contents (toc) data
54 | [this is a higher-order function to be passed to with_pdf()]"""
55 | toc = []
56 | try:
57 | outlines = doc.get_outlines()
58 | for (level,title,dest,a,se) in outlines:
59 | toc.append( (level, title) )
60 | except PDFNoOutlines:
61 | pass
62 | return toc
63 |
64 | def get_toc (pdf_doc, pdf_pwd=''):
65 | """Return the table of contents (toc), if any, for this pdf file"""
66 | return with_pdf(pdf_doc, _parse_toc, pdf_pwd)
67 |
68 |
69 | ###
70 | ### Extracting Images
71 | ###
72 |
73 | def write_file (folder, filename, filedata, flags='w'):
74 | """Write the file data to the folder and filename combination
75 | (flags: 'w' for write text, 'wb' for write binary, use 'a' instead of 'w' for append)"""
76 | result = False
77 | if os.path.isdir(folder):
78 | try:
79 | file_obj = open(os.path.join(folder, filename), flags)
80 | file_obj.write(filedata)
81 | file_obj.close()
82 | result = True
83 | except IOError:
84 | pass
85 | return result
86 |
87 | def determine_image_type (stream_first_4_bytes):
88 | """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes"""
89 | file_type = None
90 |     bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')  # decode so the string comparisons below work on Python 3
91 | if bytes_as_hex.startswith('ffd8'):
92 | file_type = '.jpeg'
93 | elif bytes_as_hex == '89504e47':
94 | file_type = '.png'
95 | elif bytes_as_hex == '47494638':
96 | file_type = '.gif'
97 | elif bytes_as_hex.startswith('424d'):
98 | file_type = '.bmp'
99 | return file_type
100 |
101 | def save_image (lt_image, page_number, images_folder):
102 | """Try to save the image data from this LTImage object, and return the file name, if successful"""
103 | result = None
104 | if lt_image.stream:
105 | file_stream = lt_image.stream.get_rawdata()
106 | if file_stream:
107 | file_ext = determine_image_type(file_stream[0:4])
108 | if file_ext:
109 | file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
110 | if write_file(images_folder, file_name, file_stream, flags='wb'):
111 | result = file_name
112 | return result
113 |
114 |
115 | ###
116 | ### Extracting Text
117 | ###
118 |
119 | def to_bytestring (s, enc='utf-8'):
120 |     """Return the given text as a str, decoding it with the standard encoding
121 |     if it arrives as bytes (the name is kept from the original Python 2 version)"""
122 |     if s:
123 |         if isinstance(s, str):
124 |             return s
125 |         else:
126 |             return s.decode(enc)
127 |
128 | def update_page_text_hash (h, lt_obj, pct=0.2):
129 | """Use the bbox x0,x1 values within pct% to produce lists of associated text within the hash"""
130 |
131 | x0 = lt_obj.bbox[0]
132 | x1 = lt_obj.bbox[2]
133 |
134 | key_found = False
135 | for k, v in h.items():
136 | hash_x0 = k[0]
137 | if x0 >= (hash_x0 * (1.0-pct)) and (hash_x0 * (1.0+pct)) >= x0:
138 | hash_x1 = k[1]
139 | if x1 >= (hash_x1 * (1.0-pct)) and (hash_x1 * (1.0+pct)) >= x1:
140 | # the text inside this LT* object was positioned at the same
141 | # width as a prior series of text, so it belongs together
142 | key_found = True
143 | v.append(to_bytestring(lt_obj.get_text()))
144 | h[k] = v
145 | if not key_found:
146 | # the text, based on width, is a new series,
147 | # so it gets its own series (entry in the hash)
148 | h[(x0,x1)] = [to_bytestring(lt_obj.get_text())]
149 |
150 | return h
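# Worked example for update_page_text_hash(): with the default pct=0.2, a text line whose
# bbox spans x0=72.0, x1=300.0 is appended to an existing (70.0, 305.0) entry, because both
# edges lie within 20% of that key's values; otherwise it starts a new (x0, x1) entry.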
151 |
152 | def parse_lt_objs (lt_objs, page_number, images_folder, text_content=None):
153 | """Iterate through the list of LT* objects and capture the text or image data contained in each"""
154 | if text_content is None:
155 | text_content = []
156 |
157 | skip_images = False
158 | if images_folder is None:
159 | skip_images = True
160 | page_text = {} # k=(x0, x1) of the bbox, v=list of text strings within that bbox width (physical column)
161 | for lt_obj in lt_objs:
162 | if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
163 |             # text, so arrange it logically based on its column width
164 | page_text = update_page_text_hash(page_text, lt_obj)
165 | elif isinstance(lt_obj, LTImage):
166 | if skip_images:
167 | continue
168 | # an image, so save it to the designated folder, and note its place in the text
169 | saved_file = save_image(lt_obj, page_number, images_folder)
170 | if saved_file:
171 |                 # use an html-style <img /> tag to mark the position of the image within the text
172 |                 text_content.append('<img src="' + os.path.join(images_folder, saved_file) + '" />')
173 | else:
174 |                 print("[!] error saving image on page", page_number, repr(lt_obj), file=sys.stderr)
175 | elif isinstance(lt_obj, LTFigure):
176 | if skip_images:
177 | continue
178 | # LTFigure objects are containers for other LT* objects, so recurse through the children
179 |             text_content.append(parse_lt_objs(lt_obj, page_number, images_folder))  # passing text_content here as well would duplicate the figure's text
180 |
181 |     for k, v in sorted(page_text.items()):
182 | # sort the page_text hash by the keys (x0,x1 values of the bbox),
183 | # which produces a top-down, left-to-right sequence of related columns
184 | text_content.append(''.join(v))
185 |
186 | return '\n'.join(text_content)
187 |
188 |
189 | ###
190 | ### Processing Pages
191 | ###
192 |
193 | def _parse_pages (doc, images_folder):
194 | """With an open PDFDocument object, get the pages and parse each one
195 | [this is a higher-order function to be passed to with_pdf()]"""
196 | rsrcmgr = PDFResourceManager()
197 | laparams = LAParams()
199 | device = PDFPageAggregator(rsrcmgr, laparams=laparams)
200 | interpreter = PDFPageInterpreter(rsrcmgr, device)
201 |
202 | text_content = []
203 | for i, page in enumerate(PDFPage.create_pages(doc)):
204 | interpreter.process_page(page)
205 | # receive the LTPage object for this page
206 | layout = device.get_result()
207 | # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc.
208 | text_content.append(parse_lt_objs(layout, (i+1), images_folder))
209 |
210 | return text_content
211 |
212 | def get_pages (pdf_doc, pdf_pwd='', images_folder='/tmp'):
213 | """Process each of the pages in this pdf file and return a list of strings representing the text found in each page"""
214 |     return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)
215 |
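if __name__ == '__main__':
    # Minimal standalone usage sketch (analyze_papers.py only imports this module):
    # print the outline, if any, then the extracted text of the PDF given on the command line.
    # Example: python3 layout_scanner.py paper.pdf
    target_pdf = sys.argv[1]
    for level, title in get_toc(target_pdf) or []:
        print('  ' * (level - 1) + str(title))
    for page_text in get_pages(target_pdf, images_folder=None) or []:
        print(page_text)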
--------------------------------------------------------------------------------
/analyze_papers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import csv
4 | import re
5 | import string
6 | import sys, os, time
7 |
8 | from functools import partial
9 | from multiprocessing import Pool
10 |
11 | import errno
12 | import layout_scanner
13 |
14 | # Zotero CSV Column indices
15 | YEAR_I = 2
16 | AUTHOR_I = 3
17 | TITLE_I = 4
18 | FILE_I = 37
19 |
20 | DEFAULT_OUTPUT_CSV_NAME = "titles.csv"
21 | DEFAULT_OUTPUT_DELIMITER = "\t"
22 |
23 | used_filenames = []
24 | graph = []
25 |
26 |
27 | def pdf_to_text_list(file_loc):
28 | """
29 | Extracts text (string) of PDF file contents. Images, figures are ignored.
30 | :param str file_loc: Path to .PDF document on local disk
31 |     :return: A tuple (page_count, pages), where page_count is the total page count (-1 on parse failure) and pages is a list of strings with the text of (at most) the last 10 pages
32 |     :rtype: tuple
33 | """
34 | # Read PDF pages as text
35 | pages = layout_scanner.get_pages(file_loc, images_folder=None) # you can try os.path.abspath("output/imgs")
36 | try:
37 | page_len = len(pages)
38 | except TypeError:
39 | print("[!] Issue parsing PDF file", file=sys.stderr)
40 | return (-1, [])
41 |
42 | # Take only last 10 pages (We assume references never take more) TODO:HARDCODE
43 | pages = pages[-10:]
44 |
45 | return (page_len, pages)
46 |
47 |
48 | def get_pretty_filename(metadata):
49 | fixed_title = re.sub('[^A-Za-z0-9]+', '', "_".join(metadata["title"].split(" ")[:10]))
50 | authors = metadata["author"].split(";")
51 | author_2nd = ""
52 | if len(authors) > 2:
53 | author_2nd = "et al."
54 | elif len(authors) == 2:
55 |         author_2nd = "& " + authors[1].split(",")[0].strip()
56 | author_1st = authors[0].split(",")[0]
57 | txt_filename = "%s %s %s" % (author_1st, author_2nd, metadata["year"])
58 | if txt_filename in used_filenames:
59 |         txt_filename = txt_filename + fixed_title[:20]  # disambiguate duplicates with the start of the title
60 | used_filenames.append(txt_filename)
61 | return txt_filename
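# Example (illustrative values): metadata {"author": "Siders, A.; Smith, B.", "year": "2017",
# "title": "Managed Coastal Retreat"} produces the label "Siders & Smith 2017".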
62 |
63 | def create_missing_dirs(filename):
64 | if not os.path.exists(os.path.dirname(filename)):
65 | try:
66 | os.makedirs(os.path.dirname(filename))
67 | except OSError as exc: # Guard against race condition
68 | if exc.errno != errno.EEXIST:
69 | raise
70 |
71 |
72 | def read_titles(zotero_csv):
73 | titles = {}
74 | with open(zotero_csv, 'rt') as csvfile:
75 | reader = csv.reader(csvfile, delimiter=',')
76 |         next(reader)  # skip the header row
77 | for r in reader:
78 | titles[pre_process(r[TITLE_I])] = \
79 | {'title': r[TITLE_I],
80 | 'author': r[AUTHOR_I],
81 | 'file': r[FILE_I],
82 | 'year': r[YEAR_I]}
83 | return titles
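# The resulting dict maps each normalized title to its metadata, e.g. (illustrative values):
#   {"managed coastal retreat": {"title": "Managed Coastal Retreat", "author": "Siders, A.",
#                                "file": "/path/to/paper.pdf", "year": "2017"}}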
84 |
85 |
86 | def process_pdf(metadata, write_to_disk=False):
87 | """
88 |     Reads the text of the PDF file referenced in the article metadata's 'file' field, optionally saving it to a .txt file on disk
89 |     :param dict metadata: the article's metadata (title, author, year, file) read from the Zotero CSV
90 |     :param bool write_to_disk: whether the extracted text should also be written to disk
91 |     :return: a tuple (success, text, log), where success indicates whether text was extracted,
92 |     text is the extracted text (or an error message on failure) and log is a list of log/debug messages
93 | :rtype: tuple
94 | """
95 |
96 | log = []
97 | log.append(" ".join(metadata['author'].split(";")[:3]) + metadata['year'] + metadata['title'][:32])
98 |
99 | if len(metadata['file']) < 1:
100 | return False, 'Missing Zotero file attachment', log
101 |
102 | all_files = metadata['file'].split(';')
103 | first_pdf = None
104 | for file in all_files:
105 | if file.lower().strip().endswith(".pdf"):
106 | first_pdf = file
107 | break
108 |
109 |     if first_pdf is None:
110 | return False, 'No PDF File attached to article entry', log
111 | else:
112 | log.append("\t-- Found %s attachments, using pdf: %s" % (len(all_files), first_pdf))
113 |
114 |
115 | original_page_count, pages = pdf_to_text_list(first_pdf)
116 | if original_page_count != -1:
117 | log.append("\t-- Checking last %s PDF pages out of %s total" % (len(pages), original_page_count))
118 |
119 | if write_to_disk: # Kind of deprecated, this was used by the R script of A.R. Siders
120 | output_filename = get_pretty_filename(metadata)
121 | paper_txt_filename = args.txts_dir + os.sep + output_filename + '.txt'
122 | create_missing_dirs(paper_txt_filename)
123 | with open(paper_txt_filename, 'w') as outfile:
124 | for p in pages:
125 |                 print(p, file=outfile)
126 |         # no explicit close needed; the with-statement closes outfile
127 |
128 | all_pages = "\n".join(pages)
129 |
130 | return len(all_pages) > 0, all_pages, log
131 |
132 |
133 | def find_citations(paper_text, all_titles, metadata):
134 | log = []
135 | cited_ids = []
136 | # Check which titles this paper cited:
137 | fixed_paper_title = pre_process(metadata["title"])
138 | fixed_text = pre_process(paper_text)
139 |
140 |     # compare with all whitespace stripped, so titles broken across lines in the PDF still match
141 | for title in all_titles:
142 | if (title != fixed_paper_title) and \
143 | (title.replace(' ', '') in fixed_text.replace(' ', '')): # Stripping whitespace!
144 | log.append("\t---- citation found:" + title)
145 | cited_ids.append(title)
146 | # graph.append([fixed_paper_title, title])
147 | return cited_ids, log
148 |
149 |
150 | def article_worker(dict_item, all_titles):
151 | t0 = time.time()
152 |
153 | print_log = []
154 |
155 | title, metadata = dict_item
156 | pdf_result, text, pdf_log = process_pdf(metadata)
157 |
158 | t1 = time.time()
159 | if pdf_result:
160 | print_log.append("Processed in %s seconds :" % (t1 - t0))
161 | else:
162 | print_log.append("Error processing:")
163 | print_log.append("\t-- " + text)
164 |
165 | print_log += pdf_log
166 |
167 | cited_papers = []
168 | if pdf_result:
169 | cited_papers, citations_log = find_citations(text, all_titles, metadata)
170 | print_log += citations_log
171 | t2 = time.time()
172 |         print_log.append("\t-- processed text cites in %s seconds" % (t2 - t1))
173 |
174 | print("\n".join(print_log) + "\n\n")
175 |
176 | return title, pdf_result, text, cited_papers
177 |
178 |
179 | def pre_process(text):
180 | # to lowercase
181 | text = text.lower()
182 | # remove punctuation
183 | text = text.translate(str.maketrans('', '', string.punctuation))
184 | # remove linebreaks
185 | # text = re.sub(r"(?<=[a-z])\r?\n", " ", text)
186 | text = text.replace('\r', '').replace('\n', '')
187 | # remove numbers
188 | text = re.sub(r'\d+', '', text)
189 |     # keep only letter sequences separated by single spaces (this also normalizes whitespace)
190 |     text = " ".join(re.findall(r'[a-z]+', text))
191 |
192 | return text
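# Example of the normalization: pre_process("Sea-Level Rise: 2020 Update!") -> "sealevel rise update"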
193 |
194 | def make_directory_if_missing(file_path):
195 |     if not os.path.exists(os.path.dirname(file_path)):
196 |         try:
197 |             os.makedirs(os.path.dirname(file_path))
198 | except OSError as exc: # Guard against race condition
199 | if exc.errno != errno.EEXIST:
200 | raise
201 |
202 |
203 | if __name__ == '__main__':
204 | parser = argparse.ArgumentParser(description=
205 | 'Extract text from PDF files whose locations are given by a Zotero CSV file')
206 | parser.add_argument('zotero_csv', type=str, help='the Zotero exported CSV file of papers')
207 | parser.add_argument('--gephi_dir', default="gephi", type=str,
208 | help='Output dir for gephi Edges and Nodes files (default: "gephi")')
209 | parser.add_argument('--processes', default=4, type=int,
210 | help='How many worker processes to create for the time-consuming PDF parsing (default: 4)')
211 | parser.add_argument('--txts_dir', default="papers", type=str,
212 | help='Output dir for article txt files (default: "papers")')
213 | parser.add_argument('--out_csv', default=DEFAULT_OUTPUT_CSV_NAME, type=str,
214 | help='Output csv filename (default: ' + DEFAULT_OUTPUT_CSV_NAME + ')')
215 | parser.add_argument('--delimiter', default=DEFAULT_OUTPUT_DELIMITER, type=str,
216 |                         help='Output csv delimiter (default: tab)')
217 |
218 | args = parser.parse_args()
219 | OUTPUT_CSV_NAME = args.out_csv
220 | OUTPUT_GEPHI_DIR = args.gephi_dir
221 | OUTPUT_DELIMITER = args.delimiter
222 | WORKER_PROCESSES = args.processes
223 |
224 | out_edges_filedir = OUTPUT_GEPHI_DIR + os.sep + "Edges_" + OUTPUT_CSV_NAME
225 | out_nodes_filedir = OUTPUT_GEPHI_DIR + os.sep + "Nodes_" + OUTPUT_CSV_NAME
226 |
227 | make_directory_if_missing(out_edges_filedir)
228 | make_directory_if_missing(out_nodes_filedir)
229 |
230 | error_documents = []
231 |
232 | # First, just get the titles in the csv
233 | titles_dict = read_titles(args.zotero_csv)
234 | title_ids = list(titles_dict.keys())
235 |
236 | # Now process the PDFs
237 | pool_start_time = time.time()
238 |
239 | pool = Pool(processes=WORKER_PROCESSES) # start n worker processes
240 |
241 | list_worker = partial(article_worker, all_titles=title_ids)
242 | result = pool.map(list_worker, list(titles_dict.items()), chunksize=5)
243 | for title, pdf_result, text, cited_papers in result:
244 | if pdf_result:
245 | for paper in cited_papers:
246 | graph.append([title, paper])
247 |
248 | else:
249 | error_documents.append([title, text])
250 | total_time = time.time() - pool_start_time
251 |
252 | # Print finish report, show failed documents
253 |     print("\n---- Finished -----\n"
254 |           "Processed %s papers in %.1f seconds" % (len(title_ids), total_time))
255 | print("%s documents were not extracted due to errors:" % len(error_documents))
256 | for i, (doc_id, reason) in enumerate(error_documents):
257 | doc = titles_dict[doc_id]
258 | print( "%s. %s %s %s %s" % (i, doc["author"], doc["year"], doc["title"], doc["file"]))
259 | print("\t--", reason)
260 |
261 | # Write Graph Edges to csv
262 |     with open(out_edges_filedir, "w") as graph_csv:
263 | # Header
264 | graph_csv.write(OUTPUT_DELIMITER.join(["Source", "Target", "Weight"]) + "\n")
265 | for (src, target) in graph:
266 | graph_csv.write(OUTPUT_DELIMITER.join([src, target, "1"]) + "\n")
267 |
268 | # Write Graph Nodes with Labels to csv
269 |     with open(out_nodes_filedir, "w") as nodes_csv:
270 | # Header
271 | nodes_csv.write(OUTPUT_DELIMITER.join(["Id", "Label", "Author", "PrettyName"]) + "\n")
272 | for title in title_ids:
273 | metadata = titles_dict[title]
274 | nodes_csv.write(OUTPUT_DELIMITER.join(
275 | [title, metadata["title"], metadata["author"], get_pretty_filename(metadata)]) + "\n")
276 |
--------------------------------------------------------------------------------