├── screenshot_map.png
├── README.MD
├── layout_scanner.py
└── analyze_papers.py

/screenshot_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaks6/citation_map/HEAD/screenshot_map.png

--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # Create a Citation Graph based on Simplistic Text Analysis
2 | 
3 | _Inspired by A.R. Siders' R script from this [ResearchGate question](https://www.researchgate.net/post/Is_there_any_recommended_software_to_visualise_articles_papers_references_when_conducting_a_systematic_review_or_meta-analysis)_
4 | 
5 | _Based on dpapathanasiou's [example script for pdfminer](https://github.com/dpapathanasiou/pdfminer-layout-scanner)_
6 | 
7 | ## Takes Zotero .csv article collections and creates Gephi-compatible edge and node files based on citations
8 | 
9 | 
10 | ![screenshot](https://github.com/jaks6/citation_map/blob/master/screenshot_map.png)
11 | 
12 | ### Principle:
13 | * Let A be a set of known articles
14 | * For any a in A, let _title_a_ be its title, and _text_a_ be its text content
15 | * For some x in A and y in A, x != y:
16 |   * cites(x, y) is true if _title_y_ appears in _text_x_
17 | 
18 | For the above to work, we apply some text normalization (removing punctuation, whitespace, and special characters) and assume
19 | that _title_y_ only appears in _text_x_ if it appears in the references section.
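In code, the matching predicate boils down to something like this (a minimal sketch: `normalize` mirrors the `pre_process` function in `analyze_papers.py`, and collapsing everything to letter runs matches the whitespace stripping done in `find_citations`):

```python
import re
import string

def normalize(text):
    """Lowercase, drop punctuation and digits, keep only runs of letters."""
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return ''.join(re.findall(r'[a-z]+', text))

def cites(text_x, title_y):
    """True if the normalized title of y occurs in the normalized text of x."""
    return normalize(title_y) in normalize(text_x)
```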
20 | 
21 | ### Usage:
22 | 1. Export a list of articles as .csv from Zotero (the articles should have file attachments)
23 | 2. Run `analyze_papers.py zotero_file.csv`
24 | 3. The script produces two files, Edges_titles.csv and Nodes_titles.csv, in the folder "gephi" (the format is sketched below)
25 | 4. Load them into [Gephi](https://gephi.org) with "Load Spreadsheet"
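Both output files are plain delimiter-separated text (tab by default). Roughly what they look like — the rows here are hypothetical placeholders:

```
# Edges_titles.csv
Source	Target	Weight
a normalized citing title	a normalized cited title	1

# Nodes_titles.csv
Id	Label	Author	PrettyName
a normalized citing title	A Normalized Citing Title	Doe, Jane; Smith, John	Doe & Smith 2018
```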
26 | 
27 | 
28 | ## Notes
29 | * Tested with Python 3
30 | * Uses the [pdfminer](https://pypi.org/project/pdfminer/) library
31 | * You can specify the number of worker processes used to parse the PDFs with the `--processes` parameter (default: 4). The PDF-to-text step can also be reused on its own, as sketched below.
32 | 
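If you want the text extraction step by itself, `layout_scanner.get_pages` can be called directly. A minimal sketch, assuming a readable `paper.pdf` in the working directory:

```python
import layout_scanner

# Returns a list with one text string per page; images are skipped when
# images_folder is None (this is how analyze_papers.py calls it).
pages = layout_scanner.get_pages("paper.pdf", images_folder=None)
print("%d pages, %d characters" % (len(pages), sum(len(p) for p in pages)))
```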
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/layout_scanner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import sys
4 | import os
5 | from binascii import b2a_hex
6 | 
7 | 
8 | ###
9 | ### pdf-miner requirements
10 | ###
11 | 
12 | from pdfminer.pdfparser import PDFParser
13 | from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
14 | from pdfminer.pdfpage import PDFPage
15 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
16 | from pdfminer.converter import PDFPageAggregator
17 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar
18 | 
19 | def with_pdf (pdf_doc, fn, pdf_pwd, *args):
20 |     """Open the pdf document, apply the function, and return the results"""
21 |     result = None
22 |     try:
23 |         # open the pdf file
24 |         fp = open(pdf_doc, 'rb')
25 |         # create a parser object associated with the file object
26 |         parser = PDFParser(fp)
27 |         # create a PDFDocument object that stores the document structure,
28 |         # supplying the password (if any) for initialization
29 |         doc = PDFDocument(parser, password=pdf_pwd)
30 |         # connect the parser and document objects
31 |         parser.set_document(doc)
32 | 
33 | 
34 | 
35 |         if doc.is_extractable:
36 |             # apply the function and return the result
37 |             result = fn(doc, *args)
38 | 
39 |         # close the pdf file
40 |         fp.close()
41 |     except IOError:
42 |         # the file doesn't exist or could not be opened
43 |         print("[!] Error opening file in with_pdf()", file=sys.stderr)
44 | 
45 |     return result
46 | 
47 | 
48 | ###
49 | ### Table of Contents
50 | ###
51 | 
52 | def _parse_toc (doc):
53 |     """With an open PDFDocument object, get the table of contents (toc) data
54 |     [this is a higher-order function to be passed to with_pdf()]"""
55 |     toc = []
56 |     try:
57 |         outlines = doc.get_outlines()
58 |         for (level, title, dest, a, se) in outlines:
59 |             toc.append( (level, title) )
60 |     except PDFNoOutlines:
61 |         pass
62 |     return toc
63 | 
64 | def get_toc (pdf_doc, pdf_pwd=''):
65 |     """Return the table of contents (toc), if any, for this pdf file"""
66 |     return with_pdf(pdf_doc, _parse_toc, pdf_pwd)
67 | 
68 | 
69 | ###
70 | ### Extracting Images
71 | ###
72 | 
73 | def write_file (folder, filename, filedata, flags='w'):
74 |     """Write the file data to the folder and filename combination
75 |     (flags: 'w' for write text, 'wb' for write binary, use 'a' instead of 'w' for append)"""
76 |     result = False
77 |     if os.path.isdir(folder):
78 |         try:
79 |             file_obj = open(os.path.join(folder, filename), flags)
80 |             file_obj.write(filedata)
81 |             file_obj.close()
82 |             result = True
83 |         except IOError:
84 |             pass
85 |     return result
86 | 
87 | def determine_image_type (stream_first_4_bytes):
88 |     """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes"""
89 |     file_type = None
90 |     bytes_as_hex = b2a_hex(stream_first_4_bytes)  # b2a_hex returns bytes under Python 3, so compare against bytes literals
91 |     if bytes_as_hex.startswith(b'ffd8'):
92 |         file_type = '.jpeg'
93 |     elif bytes_as_hex == b'89504e47':
94 |         file_type = '.png'
95 |     elif bytes_as_hex == b'47494638':
96 |         file_type = '.gif'
97 |     elif bytes_as_hex.startswith(b'424d'):
98 |         file_type = '.bmp'
99 |     return file_type
100 | 
101 | def save_image (lt_image, page_number, images_folder):
102 |     """Try to save the image data from this LTImage object, and return the file name, if successful"""
103 |     result = None
104 |     if lt_image.stream:
105 |         file_stream = lt_image.stream.get_rawdata()
106 |         if file_stream:
107 |             file_ext = determine_image_type(file_stream[0:4])
108 |             if file_ext:
109 |                 file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
110 |                 if write_file(images_folder, file_name, file_stream, flags='wb'):
111 |                     result = file_name
112 |     return result
113 | 
114 | 
115 | ###
116 | ### Extracting Text
117 | ###
118 | 
119 | def to_bytestring (s, enc='utf-8'):
120 |     """Return the given string unchanged if it is already a (unicode) str -- the normal case
121 |     under Python 3 -- otherwise encode it to a bytestring with the standard encoding"""
122 |     if s:
123 |         if isinstance(s, str):
124 |             return s
125 |         else:
126 |             return s.encode(enc)
127 | 
128 | def update_page_text_hash (h, lt_obj, pct=0.2):
129 |     """Use the bbox x0,x1 values within pct% to produce lists of associated text within the hash
130 |     (e.g. with pct=0.2, a stored key with x0=100 collects new boxes whose x0 lies between 80 and 120)"""
131 |     x0 = lt_obj.bbox[0]
132 |     x1 = lt_obj.bbox[2]
133 | 
134 |     key_found = False
135 |     for k, v in h.items():
136 |         hash_x0 = k[0]
137 |         if x0 >= (hash_x0 * (1.0-pct)) and (hash_x0 * (1.0+pct)) >= x0:
138 |             hash_x1 = k[1]
139 |             if x1 >= (hash_x1 * (1.0-pct)) and (hash_x1 * (1.0+pct)) >= x1:
140 |                 # the text inside this LT* object was positioned at the same
141 |                 # width as a prior series of text, so it belongs together
142 |                 key_found = True
143 |                 v.append(to_bytestring(lt_obj.get_text()))
144 |                 h[k] = v
145 |     if not key_found:
146 |         # the text, based on width, is a new series,
147 |         # so it gets its own series (entry in the hash)
148 |         h[(x0,x1)] = [to_bytestring(lt_obj.get_text())]
149 | 
150 |     return h
151 | 
152 | def parse_lt_objs (lt_objs, page_number, images_folder, text_content=None):
153 |     """Iterate through the list of LT* objects and capture the text or image data contained in each"""
154 |     if text_content is None:
155 |         text_content = []
156 | 
157 |     skip_images = False
158 |     if images_folder is None:
159 |         skip_images = True
160 |     page_text = {}  # k=(x0, x1) of the bbox, v=list of text strings within that bbox width (physical column)
161 |     for lt_obj in lt_objs:
162 |         if isinstance(lt_obj, (LTTextBox, LTTextLine)):
163 |             # text, so arrange it logically based on its column width
164 |             page_text = update_page_text_hash(page_text, lt_obj)
165 |         elif isinstance(lt_obj, LTImage):
166 |             if skip_images:
167 |                 continue
168 |             # an image, so save it to the designated folder, and note its place in the text
169 |             saved_file = save_image(lt_obj, page_number, images_folder)
170 |             if saved_file:
171 |                 # use an html-style tag to mark the position of the image within the text
172 |                 text_content.append('<img src="%s" />' % os.path.join(images_folder, saved_file))
173 |             else:
174 |                 print("[!] error saving image on page", page_number, repr(lt_obj), file=sys.stderr)
175 |         elif isinstance(lt_obj, LTFigure):
176 |             if skip_images:
177 |                 continue
178 |             # LTFigure objects are containers for other LT* objects, so recurse through the children
179 |             text_content.append(parse_lt_objs(lt_obj, page_number, images_folder))  # fresh list here; passing text_content in would duplicate everything collected so far
180 | 
181 |     for k, v in sorted(page_text.items()):
182 |         # sort the page_text hash by the keys (x0,x1 values of the bbox),
183 |         # which produces a top-down, left-to-right sequence of related columns
184 |         text_content.append(''.join(v))
185 | 
186 |     return '\n'.join(text_content)
187 | 
188 | 
189 | ###
190 | ### Processing Pages
191 | ###
192 | 
193 | def _parse_pages (doc, images_folder):
194 |     """With an open PDFDocument object, get the pages and parse each one
195 |     [this is a higher-order function to be passed to with_pdf()]"""
196 |     rsrcmgr = PDFResourceManager()
197 |     laparams = LAParams()
198 | 
199 |     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
200 |     interpreter = PDFPageInterpreter(rsrcmgr, device)
201 | 
202 |     text_content = []
203 |     for i, page in enumerate(PDFPage.create_pages(doc)):
204 |         interpreter.process_page(page)
205 |         # receive the LTPage object for this page
206 |         layout = device.get_result()
207 |         # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc.
208 |         text_content.append(parse_lt_objs(layout, (i+1), images_folder))
209 | 
210 |     return text_content
211 | 
212 | def get_pages (pdf_doc, pdf_pwd='', images_folder='/tmp'):
213 |     """Process each of the pages in this pdf file and return a list of strings representing the text found in each page"""
214 |     return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)
215 | 
--------------------------------------------------------------------------------
/analyze_papers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import csv
4 | import re
5 | import string
6 | import sys, os, time
7 | 
8 | from functools import partial
9 | from multiprocessing import Pool
10 | 
11 | import errno
12 | import layout_scanner
13 | 
14 | # Zotero CSV Column indices
15 | YEAR_I = 2
16 | AUTHOR_I = 3
17 | TITLE_I = 4
18 | FILE_I = 37
19 | 
20 | DEFAULT_OUTPUT_CSV_NAME = "titles.csv"
21 | DEFAULT_OUTPUT_DELIMITER = "\t"
22 | 
23 | used_filenames = []
24 | graph = []
25 | 
26 | 
27 | def pdf_to_text_list(file_loc):
28 |     """
29 |     Extracts the text content of a PDF file. Images and figures are ignored.
30 |     :param str file_loc: Path to .PDF document on local disk
31 |     :return: a tuple (total page count, list of the last 10 pages as strings); the count is -1 if parsing failed
32 |     :rtype: tuple
33 |     """
34 |     # Read PDF pages as text
35 |     pages = layout_scanner.get_pages(file_loc, images_folder=None)  # you can try os.path.abspath("output/imgs")
36 |     try:
37 |         page_len = len(pages)
38 |     except TypeError:
39 |         print("[!] Issue parsing PDF file", file=sys.stderr)
40 |         return (-1, [])
41 | 
42 |     # Take only the last 10 pages (we assume references never take more) TODO: hardcoded
43 |     pages = pages[-10:]
44 | 
45 |     return (page_len, pages)
46 | 
47 | 
48 | def get_pretty_filename(metadata):
49 |     fixed_title = re.sub('[^A-Za-z0-9]+', '', "_".join(metadata["title"].split(" ")[:10]))
50 |     authors = metadata["author"].split(";")
51 |     author_2nd = ""
52 |     if len(authors) > 2:
53 |         author_2nd = "et al."
54 |     elif len(authors) == 2:
55 |         author_2nd = "& " + authors[1].split(",")[0]
56 |     author_1st = authors[0].split(",")[0]
57 |     txt_filename = "%s %s %s" % (author_1st, author_2nd, metadata["year"])
58 |     if txt_filename in used_filenames:
59 |         txt_filename = txt_filename + fixed_title[:20]  # disambiguate duplicate names with a short title fragment
60 |     used_filenames.append(txt_filename)
61 |     return txt_filename
62 | 
63 | def create_missing_dirs(filename):
64 |     if not os.path.exists(os.path.dirname(filename)):
65 |         try:
66 |             os.makedirs(os.path.dirname(filename))
67 |         except OSError as exc:  # Guard against race condition
68 |             if exc.errno != errno.EEXIST:
69 |                 raise
70 | 
71 | 
72 | def read_titles(zotero_csv):
73 |     titles = {}
74 |     with open(zotero_csv, 'rt') as csvfile:
75 |         reader = csv.reader(csvfile, delimiter=',')
76 |         next(reader)  # Skip header row
77 |         for r in reader:
78 |             titles[pre_process(r[TITLE_I])] = \
79 |                 {'title': r[TITLE_I],
80 |                  'author': r[AUTHOR_I],
81 |                  'file': r[FILE_I],
82 |                  'year': r[YEAR_I]}
83 |     return titles
84 | 
85 | 
86 | def process_pdf(metadata, write_to_disk=False):
87 |     """
88 |     Reads text from the PDF file given in the CSV line's file column, optionally saving it as .txt on disk
89 |     :param dict metadata: the article's metadata (title, author, file, year) parsed from the CSV line
90 |     :param bool write_to_disk: whether the text will also be written to disk
91 |     :return: a tuple (bool, text, log), where bool indicates whether text was extracted successfully,
92 |              text is the pdf text contents, and log is a list of log/debug messages
93 |     :rtype: tuple
94 |     """
95 | 
96 |     log = []
97 |     log.append(" ".join(metadata['author'].split(";")[:3]) + " " + metadata['year'] + " " + metadata['title'][:32])
98 | 
99 |     if len(metadata['file']) < 1:
100 |         return False, 'Missing Zotero file attachment', log
101 | 
102 |     all_files = metadata['file'].split(';')
103 |     first_pdf = None
104 |     for file in all_files:
105 |         if file.lower().strip().endswith(".pdf"):
106 |             first_pdf = file
107 |             break
108 | 
109 |     if first_pdf is None:
110 |         return False, 'No PDF file attached to article entry', log
111 |     else:
112 |         log.append("\t-- Found %s attachments, using pdf: %s" % (len(all_files), first_pdf))
113 | 
114 | 
115 |     original_page_count, pages = pdf_to_text_list(first_pdf)
116 |     if original_page_count != -1:
117 |         log.append("\t-- Checking last %s PDF pages out of %s total" % (len(pages), original_page_count))
118 | 
119 |     if write_to_disk:  # Kind of deprecated, this was used by the R script of A.R. Siders
120 |         output_filename = get_pretty_filename(metadata)
121 |         paper_txt_filename = args.txts_dir + os.sep + output_filename + '.txt'
122 |         create_missing_dirs(paper_txt_filename)
123 |         with open(paper_txt_filename, 'w') as outfile:
124 |             for p in pages:
125 |                 print(p, file=outfile)
126 | 
127 | 
128 |     all_pages = "\n".join(pages)
129 | 
130 |     return len(all_pages) > 0, all_pages, log
131 | 
132 | 
133 | def find_citations(paper_text, all_titles, metadata):
134 |     log = []
135 |     cited_ids = []
136 |     # Check which titles this paper cited:
137 |     fixed_paper_title = pre_process(metadata["title"])
138 |     fixed_text = pre_process(paper_text)
139 | 
140 |     # compare titles and text with all whitespace stripped
141 |     for title in all_titles:
142 |         if (title != fixed_paper_title) and \
143 |                 (title.replace(' ', '') in fixed_text.replace(' ', '')):  # Stripping whitespace!
144 |             log.append("\t---- citation found: " + title)
145 |             cited_ids.append(title)
146 |             # graph.append([fixed_paper_title, title])
147 |     return cited_ids, log
148 | 
149 | 
150 | def article_worker(dict_item, all_titles):
151 |     t0 = time.time()
152 | 
153 |     print_log = []
154 | 
155 |     title, metadata = dict_item
156 |     pdf_result, text, pdf_log = process_pdf(metadata)
157 | 
158 |     t1 = time.time()
159 |     if pdf_result:
160 |         print_log.append("Processed in %s seconds:" % (t1 - t0))
161 |     else:
162 |         print_log.append("Error processing:")
163 |         print_log.append("\t-- " + text)
164 | 
165 |     print_log += pdf_log
166 | 
167 |     cited_papers = []
168 |     if pdf_result:
169 |         cited_papers, citations_log = find_citations(text, all_titles, metadata)
170 |         print_log += citations_log
171 |         t2 = time.time()
172 |         print_log.append("\t-- processed text cites in %s seconds" % (t2 - t1))
173 | 
174 |     print("\n".join(print_log) + "\n\n")
175 | 
176 |     return title, pdf_result, text, cited_papers
177 | 
178 | 
179 | def pre_process(text):
180 |     # to lowercase
181 |     text = text.lower()
182 |     # remove punctuation
183 |     text = text.translate(str.maketrans('', '', string.punctuation))
184 |     # remove linebreaks
185 |     # text = re.sub(r"(?<=[a-z])\r?\n", " ", text)
186 |     text = text.replace('\r', '').replace('\n', '')
187 |     # remove numbers
188 |     text = re.sub(r'\d+', '', text)
189 |     # collapse whitespace, keeping only runs of letters
190 |     text = " ".join(re.findall(r'[a-z]+', text))
191 | 
192 |     return text
193 | 
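# Illustration (not part of the original script) of what pre_process() produces:
#   pre_process("The Quick-Start Guide, 2nd ed.")  ->  "the quickstart guide nd ed"
# i.e. lowercased, punctuation and digits stripped, whitespace collapsed, so that
# titles can be matched as contiguous word sequences.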
194 | def make_directory_if_missing(directory_path):
195 |     if not os.path.exists(os.path.dirname(directory_path)):
196 |         try:
197 |             os.makedirs(os.path.dirname(directory_path))
198 |         except OSError as exc:  # Guard against race condition
199 |             if exc.errno != errno.EEXIST:
200 |                 raise
201 | 
202 | 
203 | if __name__ == '__main__':
204 |     parser = argparse.ArgumentParser(description=
205 |         'Extract text from PDF files whose locations are given by a Zotero CSV file')
206 |     parser.add_argument('zotero_csv', type=str, help='the Zotero exported CSV file of papers')
207 |     parser.add_argument('--gephi_dir', default="gephi", type=str,
208 |                         help='Output dir for gephi Edges and Nodes files (default: "gephi")')
209 |     parser.add_argument('--processes', default=4, type=int,
210 |                         help='How many worker processes to create for the time-consuming PDF parsing (default: 4)')
211 |     parser.add_argument('--txts_dir', default="papers", type=str,
212 |                         help='Output dir for article txt files (default: "papers")')
213 |     parser.add_argument('--out_csv', default=DEFAULT_OUTPUT_CSV_NAME, type=str,
214 |                         help='Output csv filename (default: ' + DEFAULT_OUTPUT_CSV_NAME + ')')
215 |     parser.add_argument('--delimiter', default=DEFAULT_OUTPUT_DELIMITER, type=str,
216 |                         help='Output csv delimiter (default: ' + repr(DEFAULT_OUTPUT_DELIMITER) + ')')
217 | 
218 |     args = parser.parse_args()
219 |     OUTPUT_CSV_NAME = args.out_csv
220 |     OUTPUT_GEPHI_DIR = args.gephi_dir
221 |     OUTPUT_DELIMITER = args.delimiter
222 |     WORKER_PROCESSES = args.processes
223 | 
224 |     out_edges_filedir = OUTPUT_GEPHI_DIR + os.sep + "Edges_" + OUTPUT_CSV_NAME
225 |     out_nodes_filedir = OUTPUT_GEPHI_DIR + os.sep + "Nodes_" + OUTPUT_CSV_NAME
226 | 
227 |     make_directory_if_missing(out_edges_filedir)
228 |     make_directory_if_missing(out_nodes_filedir)
229 | 
230 |     error_documents = []
231 | 
232 |     # First, just get the titles in the csv
233 |     titles_dict = read_titles(args.zotero_csv)
234 |     title_ids = list(titles_dict.keys())
235 | 
236 |     # Now process the PDFs
237 |     pool_start_time = time.time()
238 | 
239 |     pool = Pool(processes=WORKER_PROCESSES)  # start n worker processes
240 | 
241 |     list_worker = partial(article_worker, all_titles=title_ids)
242 |     result = pool.map(list_worker, list(titles_dict.items()), chunksize=5)
243 |     for title, pdf_result, text, cited_papers in result:
244 |         if pdf_result:
245 |             for paper in cited_papers:
246 |                 graph.append([title, paper])
247 | 
248 |         else:
249 |             error_documents.append([title, text])
250 |     total_time = time.time() - pool_start_time
251 | 
252 |     # Print finish report, show failed documents
253 |     print("\n---- Finished -----\n"
254 |           "Processed", len(title_ids), "papers in", total_time, "seconds")
255 |     print("%s documents were not extracted due to errors:" % len(error_documents))
256 |     for i, (doc_id, reason) in enumerate(error_documents):
257 |         doc = titles_dict[doc_id]
258 |         print("%s. %s %s %s %s" % (i + 1, doc["author"], doc["year"], doc["title"], doc["file"]))
259 |         print("\t--", reason)
260 | 
261 |     # Write Graph Edges to csv (overwriting any previous run's output)
262 |     with open(out_edges_filedir, "w") as graph_csv:
263 |         # Header
264 |         graph_csv.write(OUTPUT_DELIMITER.join(["Source", "Target", "Weight"]) + "\n")
265 |         for (src, target) in graph:
266 |             graph_csv.write(OUTPUT_DELIMITER.join([src, target, "1"]) + "\n")
267 | 
268 |     # Write Graph Nodes with Labels to csv (overwriting any previous run's output)
269 |     with open(out_nodes_filedir, "w") as nodes_csv:
270 |         # Header
271 |         nodes_csv.write(OUTPUT_DELIMITER.join(["Id", "Label", "Author", "PrettyName"]) + "\n")
272 |         for title in title_ids:
273 |             metadata = titles_dict[title]
274 |             nodes_csv.write(OUTPUT_DELIMITER.join(
275 |                 [title, metadata["title"], metadata["author"], get_pretty_filename(metadata)]) + "\n")
276 | 
--------------------------------------------------------------------------------