├── screenshot_map.png
├── README.MD
├── layout_scanner.py
└── analyze_papers.py
/screenshot_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaks6/citation_map/HEAD/screenshot_map.png
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # Create a Citation Graph based on Simplistic Text Analysis
2 |
3 | _Inspired by A.R. Siders' R Script from this [ResearchGate question]( https://www.researchgate.net/post/Is_there_any_recommended_software_to_visualise_articles_papers_references_when_conducting_a_systematic_review_or_meta-analysis )_
4 |
5 | _Based on dpapathanasiou's [example script for pdfminer](https://github.com/dpapathanasiou/pdfminer-layout-scanner)_
6 |
7 | ## Takes Zotero .CSV article collections and creates Gephi-compatible Edges and Nodes files based on citations
8 |
9 |
10 | ![screenshot_map](screenshot_map.png)
11 |
12 | ### Principle:
13 | * Let A be a set of known articles
14 | * For any a in A, let _title_a_ be its title and _text_a_ be its text content
15 | * For any x, y in A with x != y:
16 |   * cites(x, y) is true if _title_y_ appears in _text_x_
17 |
18 | For the above to work, we apply some text normalization (lowercasing, removing punctuation, digits, whitespace and special characters) and assume that
19 | _title_y_ only appears in _text_x_ when x actually cites y, i.e. when the title shows up in the references section.
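
A rough sketch of that check in Python (illustrative only; `normalize` and `cites` are made-up names, the real implementation is `pre_process()` and `find_citations()` in `analyze_papers.py`):

```python
import re
import string

def normalize(text):
    # lowercase, drop punctuation and digits, keep single-spaced letter sequences
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(re.findall(r'[a-z]+', text))

def cites(text_x, title_y):
    # whitespace is stripped so a title broken across lines still matches
    return normalize(title_y).replace(' ', '') in normalize(text_x).replace(' ', '')
```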
20 |
21 | ### Usage:
22 | 1. Export your list of articles as .csv from Zotero (the articles should have file attachments)
23 | 2. Run `analyze_papers.py zotero_file.csv`
24 | 3. The script produces two files, Edges_titles.csv and Nodes_titles.csv, in the folder "gephi" (see the example layout below)
25 | 4. Load them into [Gephi](https://gephi.org) with "Load Spreadsheet"
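
Both output files use the `--delimiter` character (tab by default). `Nodes_titles.csv` carries the columns `Id`, `Label`, `Author` and `PrettyName`; `Edges_titles.csv` has one row per detected citation, with the normalized titles as ids. The headers come from the script, the rows below are made-up examples (tab-separated in the real file):

```
Source                      Target                      Weight
managed coastal retreat     sea level rise adaptation   1
sea level rise adaptation   coastal flood modelling     1
```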
26 |
27 |
28 | ## Notes
29 | * Tested with Python 3
30 | * Uses the library [pdfminer](https://pypi.org/project/pdfminer/)
31 | * You can set the number of worker processes used to parse the PDFs with the `--processes` parameter (default: 4), e.g. `python3 analyze_papers.py zotero_file.csv --processes 8`
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/layout_scanner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import os
5 | from binascii import b2a_hex
6 |
7 |
8 | ###
9 | ### pdf-miner requirements
10 | ###
11 |
12 | from pdfminer.pdfparser import PDFParser
13 | from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
14 | from pdfminer.pdfpage import PDFPage
15 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
16 | from pdfminer.converter import PDFPageAggregator
17 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar
18 |
19 | def with_pdf (pdf_doc, fn, pdf_pwd, *args):
20 | """Open the pdf document, and apply the function, returning the results"""
21 | result = None
22 | try:
23 | # open the pdf file
24 | fp = open(pdf_doc, 'rb')
25 | # create a parser object associated with the file object
26 | parser = PDFParser(fp)
27 |         # create a PDFDocument object that stores the document structure
28 |         doc = PDFDocument(parser, password=pdf_pwd)
29 |         # connect the parser and document objects
30 |         parser.set_document(doc)
31 |         # the password is passed to PDFDocument above; older pdfminer
32 |         # versions used a separate doc.initialize(pdf_pwd) call instead
33 |
34 |
35 | if doc.is_extractable:
36 | # apply the function and return the result
37 | result = fn(doc, *args)
38 |
39 | # close the pdf file
40 | fp.close()
41 | except IOError:
42 | print("[!] Error opening file in with_pdf()", file=sys.stderr)
43 | # the file doesn't exist or similar problem
44 | pass
45 | return result
46 |
47 |
48 | ###
49 | ### Table of Contents
50 | ###
51 |
52 | def _parse_toc (doc):
53 | """With an open PDFDocument object, get the table of contents (toc) data
54 | [this is a higher-order function to be passed to with_pdf()]"""
55 | toc = []
56 | try:
57 | outlines = doc.get_outlines()
58 | for (level,title,dest,a,se) in outlines:
59 | toc.append( (level, title) )
60 | except PDFNoOutlines:
61 | pass
62 | return toc
63 |
64 | def get_toc (pdf_doc, pdf_pwd=''):
65 | """Return the table of contents (toc), if any, for this pdf file"""
66 | return with_pdf(pdf_doc, _parse_toc, pdf_pwd)
67 |
68 |
69 | ###
70 | ### Extracting Images
71 | ###
72 |
73 | def write_file (folder, filename, filedata, flags='w'):
74 | """Write the file data to the folder and filename combination
75 | (flags: 'w' for write text, 'wb' for write binary, use 'a' instead of 'w' for append)"""
76 | result = False
77 | if os.path.isdir(folder):
78 | try:
79 | file_obj = open(os.path.join(folder, filename), flags)
80 | file_obj.write(filedata)
81 | file_obj.close()
82 | result = True
83 | except IOError:
84 | pass
85 | return result
86 |
87 | def determine_image_type (stream_first_4_bytes):
88 | """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes"""
89 | file_type = None
90 |     bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')  # decode so the string comparisons below work on Python 3
91 | if bytes_as_hex.startswith('ffd8'):
92 | file_type = '.jpeg'
93 | elif bytes_as_hex == '89504e47':
94 | file_type = '.png'
95 | elif bytes_as_hex == '47494638':
96 | file_type = '.gif'
97 | elif bytes_as_hex.startswith('424d'):
98 | file_type = '.bmp'
99 | return file_type
100 |
101 | def save_image (lt_image, page_number, images_folder):
102 | """Try to save the image data from this LTImage object, and return the file name, if successful"""
103 | result = None
104 | if lt_image.stream:
105 | file_stream = lt_image.stream.get_rawdata()
106 | if file_stream:
107 | file_ext = determine_image_type(file_stream[0:4])
108 | if file_ext:
109 | file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
110 | if write_file(images_folder, file_name, file_stream, flags='wb'):
111 | result = file_name
112 | return result
113 |
114 |
115 | ###
116 | ### Extracting Text
117 | ###
118 |
119 | def to_bytestring (s, enc='utf-8'):
120 |     """Return the given text as a str, decoding it with the standard encoding
121 |     if it arrives as bytes (the name is kept from the original Python 2 version)"""
122 |     if s:
123 |         if isinstance(s, str):
124 |             return s
125 |         else:
126 |             return s.decode(enc)
127 |
128 | def update_page_text_hash (h, lt_obj, pct=0.2):
129 | """Use the bbox x0,x1 values within pct% to produce lists of associated text within the hash"""
130 |
131 | x0 = lt_obj.bbox[0]
132 | x1 = lt_obj.bbox[2]
133 |
134 | key_found = False
135 | for k, v in h.items():
136 | hash_x0 = k[0]
137 | if x0 >= (hash_x0 * (1.0-pct)) and (hash_x0 * (1.0+pct)) >= x0:
138 | hash_x1 = k[1]
139 | if x1 >= (hash_x1 * (1.0-pct)) and (hash_x1 * (1.0+pct)) >= x1:
140 | # the text inside this LT* object was positioned at the same
141 | # width as a prior series of text, so it belongs together
142 | key_found = True
143 | v.append(to_bytestring(lt_obj.get_text()))
144 | h[k] = v
145 | if not key_found:
146 | # the text, based on width, is a new series,
147 | # so it gets its own series (entry in the hash)
148 | h[(x0,x1)] = [to_bytestring(lt_obj.get_text())]
149 |
150 | return h
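# Worked example for update_page_text_hash(): with the default pct=0.2, a text line whose
# bbox spans x0=72.0, x1=300.0 is appended to an existing (70.0, 305.0) entry, because both
# edges lie within 20% of that key's values; otherwise it starts a new (x0, x1) entry.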
151 |
152 | def parse_lt_objs (lt_objs, page_number, images_folder, text_content=None):
153 | """Iterate through the list of LT* objects and capture the text or image data contained in each"""
154 | if text_content is None:
155 | text_content = []
156 |
157 | skip_images = False
158 | if images_folder is None:
159 | skip_images = True
160 | page_text = {} # k=(x0, x1) of the bbox, v=list of text strings within that bbox width (physical column)
161 | for lt_obj in lt_objs:
162 | if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
163 |             # text, so arrange it logically based on its column width
164 | page_text = update_page_text_hash(page_text, lt_obj)
165 | elif isinstance(lt_obj, LTImage):
166 | if skip_images:
167 | continue
168 | # an image, so save it to the designated folder, and note its place in the text
169 | saved_file = save_image(lt_obj, page_number, images_folder)
170 | if saved_file:
171 |                 # use an html-style <img /> tag to mark the position of the image within the text
172 |                 text_content.append('<img src="' + os.path.join(images_folder, saved_file) + '" />')
173 | else:
174 |                 print("[!] error saving image on page", page_number, repr(lt_obj), file=sys.stderr)
175 | elif isinstance(lt_obj, LTFigure):
176 | if skip_images:
177 | continue
178 | # LTFigure objects are containers for other LT* objects, so recurse through the children
179 |             text_content.append(parse_lt_objs(lt_obj, page_number, images_folder))  # passing text_content here as well would duplicate the figure's text
180 |
181 |     for k, v in sorted(page_text.items()):
182 | # sort the page_text hash by the keys (x0,x1 values of the bbox),
183 | # which produces a top-down, left-to-right sequence of related columns
184 | text_content.append(''.join(v))
185 |
186 | return '\n'.join(text_content)
187 |
188 |
189 | ###
190 | ### Processing Pages
191 | ###
192 |
193 | def _parse_pages (doc, images_folder):
194 | """With an open PDFDocument object, get the pages and parse each one
195 | [this is a higher-order function to be passed to with_pdf()]"""
196 | rsrcmgr = PDFResourceManager()
197 | laparams = LAParams()
199 | device = PDFPageAggregator(rsrcmgr, laparams=laparams)
200 | interpreter = PDFPageInterpreter(rsrcmgr, device)
201 |
202 | text_content = []
203 | for i, page in enumerate(PDFPage.create_pages(doc)):
204 | interpreter.process_page(page)
205 | # receive the LTPage object for this page
206 | layout = device.get_result()
207 | # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc.
208 | text_content.append(parse_lt_objs(layout, (i+1), images_folder))
209 |
210 | return text_content
211 |
212 | def get_pages (pdf_doc, pdf_pwd='', images_folder='/tmp'):
213 | """Process each of the pages in this pdf file and return a list of strings representing the text found in each page"""
214 |     return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)
215 |
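if __name__ == '__main__':
    # Minimal standalone usage sketch (analyze_papers.py only imports this module):
    # print the outline, if any, then the extracted text of the PDF given on the command line.
    # Example: python3 layout_scanner.py paper.pdf
    target_pdf = sys.argv[1]
    for level, title in get_toc(target_pdf) or []:
        print('  ' * (level - 1) + str(title))
    for page_text in get_pages(target_pdf, images_folder=None) or []:
        print(page_text)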
--------------------------------------------------------------------------------
/analyze_papers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import csv
4 | import re
5 | import string
6 | import sys, os, time
7 |
8 | from functools import partial
9 | from multiprocessing import Pool
10 |
11 | import errno
12 | import layout_scanner
13 |
14 | # Zotero CSV Column indices
15 | YEAR_I = 2
16 | AUTHOR_I = 3
17 | TITLE_I = 4
18 | FILE_I = 37
19 |
20 | DEFAULT_OUTPUT_CSV_NAME = "titles.csv"
21 | DEFAULT_OUTPUT_DELIMITER = "\t"
22 |
23 | used_filenames = []
24 | graph = []
25 |
26 |
27 | def pdf_to_text_list(file_loc):
28 | """
29 | Extracts text (string) of PDF file contents. Images, figures are ignored.
30 | :param str file_loc: Path to .PDF document on local disk
31 |     :return: A tuple (page_count, pages), where page_count is the total page count (-1 on parse failure) and pages is a list of strings with the text of (at most) the last 10 pages
32 |     :rtype: tuple
33 | """
34 | # Read PDF pages as text
35 | pages = layout_scanner.get_pages(file_loc, images_folder=None) # you can try os.path.abspath("output/imgs")
36 | try:
37 | page_len = len(pages)
38 | except TypeError:
39 | print("[!] Issue parsing PDF file", file=sys.stderr)
40 | return (-1, [])
41 |
42 | # Take only last 10 pages (We assume references never take more) TODO:HARDCODE
43 | pages = pages[-10:]
44 |
45 | return (page_len, pages)
46 |
47 |
48 | def get_pretty_filename(metadata):
49 | fixed_title = re.sub('[^A-Za-z0-9]+', '', "_".join(metadata["title"].split(" ")[:10]))
50 | authors = metadata["author"].split(";")
51 | author_2nd = ""
52 | if len(authors) > 2:
53 | author_2nd = "et al."
54 | elif len(authors) == 2:
55 |         author_2nd = "& " + authors[1].split(",")[0].strip()
56 | author_1st = authors[0].split(",")[0]
57 | txt_filename = "%s %s %s" % (author_1st, author_2nd, metadata["year"])
58 | if txt_filename in used_filenames:
59 |         txt_filename = txt_filename + fixed_title[:20]  # disambiguate duplicates with the start of the title
60 | used_filenames.append(txt_filename)
61 | return txt_filename
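# Example (illustrative values): metadata {"author": "Siders, A.; Smith, B.", "year": "2017",
# "title": "Managed Coastal Retreat"} produces the label "Siders & Smith 2017".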
62 |
63 | def create_missing_dirs(filename):
64 | if not os.path.exists(os.path.dirname(filename)):
65 | try:
66 | os.makedirs(os.path.dirname(filename))
67 | except OSError as exc: # Guard against race condition
68 | if exc.errno != errno.EEXIST:
69 | raise
70 |
71 |
72 | def read_titles(zotero_csv):
73 | titles = {}
74 | with open(zotero_csv, 'rt') as csvfile:
75 | reader = csv.reader(csvfile, delimiter=',')
76 |         next(reader)  # skip the header row
77 | for r in reader:
78 | titles[pre_process(r[TITLE_I])] = \
79 | {'title': r[TITLE_I],
80 | 'author': r[AUTHOR_I],
81 | 'file': r[FILE_I],
82 | 'year': r[YEAR_I]}
83 | return titles
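# The resulting dict maps each normalized title to its metadata, e.g. (illustrative values):
#   {"managed coastal retreat": {"title": "Managed Coastal Retreat", "author": "Siders, A.",
#                                "file": "/path/to/paper.pdf", "year": "2017"}}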
84 |
85 |
86 | def process_pdf(metadata, write_to_disk=False):
87 | """
88 |     Reads the text of the PDF file referenced in the article metadata's 'file' field, optionally saving it to a .txt file on disk
89 |     :param dict metadata: the article's metadata (title, author, year, file) read from the Zotero CSV
90 |     :param bool write_to_disk: whether the extracted text should also be written to disk
91 |     :return: a tuple (success, text, log), where success indicates whether text was extracted,
92 |     text is the extracted text (or an error message on failure) and log is a list of log/debug messages
93 | :rtype: tuple
94 | """
95 |
96 | log = []
97 | log.append(" ".join(metadata['author'].split(";")[:3]) + metadata['year'] + metadata['title'][:32])
98 |
99 | if len(metadata['file']) < 1:
100 | return False, 'Missing Zotero file attachment', log
101 |
102 | all_files = metadata['file'].split(';')
103 | first_pdf = None
104 | for file in all_files:
105 | if file.lower().strip().endswith(".pdf"):
106 | first_pdf = file
107 | break
108 |
109 |     if first_pdf is None:
110 | return False, 'No PDF File attached to article entry', log
111 | else:
112 | log.append("\t-- Found %s attachments, using pdf: %s" % (len(all_files), first_pdf))
113 |
114 |
115 | original_page_count, pages = pdf_to_text_list(first_pdf)
116 | if original_page_count != -1:
117 | log.append("\t-- Checking last %s PDF pages out of %s total" % (len(pages), original_page_count))
118 |
119 | if write_to_disk: # Kind of deprecated, this was used by the R script of A.R. Siders
120 | output_filename = get_pretty_filename(metadata)
121 | paper_txt_filename = args.txts_dir + os.sep + output_filename + '.txt'
122 | create_missing_dirs(paper_txt_filename)
123 | with open(paper_txt_filename, 'w') as outfile:
124 | for p in pages:
125 |                 print(p, file=outfile)
126 |         # no explicit close needed; the with-statement closes outfile
127 |
128 | all_pages = "\n".join(pages)
129 |
130 | return len(all_pages) > 0, all_pages, log
131 |
132 |
133 | def find_citations(paper_text, all_titles, metadata):
134 | log = []
135 | cited_ids = []
136 | # Check which titles this paper cited:
137 | fixed_paper_title = pre_process(metadata["title"])
138 | fixed_text = pre_process(paper_text)
139 |
140 |     # compare with all whitespace stripped, so titles broken across lines in the PDF still match
141 | for title in all_titles:
142 | if (title != fixed_paper_title) and \
143 | (title.replace(' ', '') in fixed_text.replace(' ', '')): # Stripping whitespace!
144 | log.append("\t---- citation found:" + title)
145 | cited_ids.append(title)
146 | # graph.append([fixed_paper_title, title])
147 | return cited_ids, log
148 |
149 |
150 | def article_worker(dict_item, all_titles):
151 | t0 = time.time()
152 |
153 | print_log = []
154 |
155 | title, metadata = dict_item
156 | pdf_result, text, pdf_log = process_pdf(metadata)
157 |
158 | t1 = time.time()
159 | if pdf_result:
160 | print_log.append("Processed in %s seconds :" % (t1 - t0))
161 | else:
162 | print_log.append("Error processing:")
163 | print_log.append("\t-- " + text)
164 |
165 | print_log += pdf_log
166 |
167 | cited_papers = []
168 | if pdf_result:
169 | cited_papers, citations_log = find_citations(text, all_titles, metadata)
170 | print_log += citations_log
171 | t2 = time.time()
172 |         print_log.append("\t-- processed text cites in %s seconds" % (t2 - t1))
173 |
174 | print("\n".join(print_log) + "\n\n")
175 |
176 | return title, pdf_result, text, cited_papers
177 |
178 |
179 | def pre_process(text):
180 | # to lowercase
181 | text = text.lower()
182 | # remove punctuation
183 | text = text.translate(str.maketrans('', '', string.punctuation))
184 | # remove linebreaks
185 | # text = re.sub(r"(?<=[a-z])\r?\n", " ", text)
186 | text = text.replace('\r', '').replace('\n', '')
187 | # remove numbers
188 | text = re.sub(r'\d+', '', text)
189 |     # keep only letter sequences separated by single spaces (this also normalizes whitespace)
190 |     text = " ".join(re.findall(r'[a-z]+', text))
191 |
192 | return text
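# Example of the normalization: pre_process("Sea-Level Rise: 2020 Update!") -> "sealevel rise update"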
193 |
194 | def make_directory_if_missing(file_path):
195 |     if not os.path.exists(os.path.dirname(file_path)):
196 |         try:
197 |             os.makedirs(os.path.dirname(file_path))
198 | except OSError as exc: # Guard against race condition
199 | if exc.errno != errno.EEXIST:
200 | raise
201 |
202 |
203 | if __name__ == '__main__':
204 | parser = argparse.ArgumentParser(description=
205 | 'Extract text from PDF files whose locations are given by a Zotero CSV file')
206 | parser.add_argument('zotero_csv', type=str, help='the Zotero exported CSV file of papers')
207 | parser.add_argument('--gephi_dir', default="gephi", type=str,
208 | help='Output dir for gephi Edges and Nodes files (default: "gephi")')
209 | parser.add_argument('--processes', default=4, type=int,
210 | help='How many worker processes to create for the time-consuming PDF parsing (default: 4)')
211 | parser.add_argument('--txts_dir', default="papers", type=str,
212 | help='Output dir for article txt files (default: "papers")')
213 | parser.add_argument('--out_csv', default=DEFAULT_OUTPUT_CSV_NAME, type=str,
214 | help='Output csv filename (default: ' + DEFAULT_OUTPUT_CSV_NAME + ')')
215 | parser.add_argument('--delimiter', default=DEFAULT_OUTPUT_DELIMITER, type=str,
216 |                         help='Output csv delimiter (default: tab)')
217 |
218 | args = parser.parse_args()
219 | OUTPUT_CSV_NAME = args.out_csv
220 | OUTPUT_GEPHI_DIR = args.gephi_dir
221 | OUTPUT_DELIMITER = args.delimiter
222 | WORKER_PROCESSES = args.processes
223 |
224 | out_edges_filedir = OUTPUT_GEPHI_DIR + os.sep + "Edges_" + OUTPUT_CSV_NAME
225 | out_nodes_filedir = OUTPUT_GEPHI_DIR + os.sep + "Nodes_" + OUTPUT_CSV_NAME
226 |
227 | make_directory_if_missing(out_edges_filedir)
228 | make_directory_if_missing(out_nodes_filedir)
229 |
230 | error_documents = []
231 |
232 | # First, just get the titles in the csv
233 | titles_dict = read_titles(args.zotero_csv)
234 | title_ids = list(titles_dict.keys())
235 |
236 | # Now process the PDFs
237 | pool_start_time = time.time()
238 |
239 | pool = Pool(processes=WORKER_PROCESSES) # start n worker processes
240 |
241 | list_worker = partial(article_worker, all_titles=title_ids)
242 | result = pool.map(list_worker, list(titles_dict.items()), chunksize=5)
243 | for title, pdf_result, text, cited_papers in result:
244 | if pdf_result:
245 | for paper in cited_papers:
246 | graph.append([title, paper])
247 |
248 | else:
249 | error_documents.append([title, text])
250 | total_time = time.time() - pool_start_time
251 |
252 | # Print finish report, show failed documents
253 |     print("\n---- Finished -----\n"
254 |           "Processed %s papers in %.1f seconds" % (len(title_ids), total_time))
255 | print("%s documents were not extracted due to errors:" % len(error_documents))
256 | for i, (doc_id, reason) in enumerate(error_documents):
257 | doc = titles_dict[doc_id]
258 | print( "%s. %s %s %s %s" % (i, doc["author"], doc["year"], doc["title"], doc["file"]))
259 | print("\t--", reason)
260 |
261 | # Write Graph Edges to csv
262 |     with open(out_edges_filedir, "w") as graph_csv:
263 | # Header
264 | graph_csv.write(OUTPUT_DELIMITER.join(["Source", "Target", "Weight"]) + "\n")
265 | for (src, target) in graph:
266 | graph_csv.write(OUTPUT_DELIMITER.join([src, target, "1"]) + "\n")
267 |
268 | # Write Graph Nodes with Labels to csv
269 |     with open(out_nodes_filedir, "w") as nodes_csv:
270 | # Header
271 | nodes_csv.write(OUTPUT_DELIMITER.join(["Id", "Label", "Author", "PrettyName"]) + "\n")
272 | for title in title_ids:
273 | metadata = titles_dict[title]
274 | nodes_csv.write(OUTPUT_DELIMITER.join(
275 | [title, metadata["title"], metadata["author"], get_pretty_filename(metadata)]) + "\n")
276 |
--------------------------------------------------------------------------------