├── news_extract.egg-info ├── dependency_links.txt ├── requires.txt ├── top_level.txt ├── SOURCES.txt └── PKG-INFO ├── news_extract ├── __init__.py ├── striprtf2.py └── news_extract.py ├── .gitattributes ├── dist ├── news_extract-1.0.2.tar.gz └── news_extract-1.0.2-py3-none-any.whl ├── setup.py ├── LICENSE └── README.md /news_extract.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /news_extract.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /news_extract.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | news_extract 2 | -------------------------------------------------------------------------------- /news_extract/__init__.py: -------------------------------------------------------------------------------- 1 | from .news_extract import * 2 | from .striprtf2 import * -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /dist/news_extract-1.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfreelon/news_extract/HEAD/dist/news_extract-1.0.2.tar.gz -------------------------------------------------------------------------------- /dist/news_extract-1.0.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfreelon/news_extract/HEAD/dist/news_extract-1.0.2-py3-none-any.whl -------------------------------------------------------------------------------- /news_extract.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | news_extract/__init__.py 3 | news_extract/news_extract.py 4 | news_extract/striprtf2.py 5 | news_extract.egg-info/PKG-INFO 6 | news_extract.egg-info/SOURCES.txt 7 | news_extract.egg-info/dependency_links.txt 8 | news_extract.egg-info/requires.txt 9 | news_extract.egg-info/top_level.txt -------------------------------------------------------------------------------- /news_extract.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: news-extract 3 | Version: 1.0.2 4 | Summary: news_extract 5 | Home-page: https://github.com/dfreelon/news_extract/ 6 | Author: Deen Freelon 7 | Author-email: dfreelon@gmail.com 8 | License: UNKNOWN 9 | Download-URL: https://github.com/dfreelon/news_extract/ 10 | Description: UNKNOWN 11 | Keywords: information retrieval,nexisuni,factiva,deduplication 12 | Platform: UNKNOWN 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | setup( 3 | name = 'news_extract', 4 | packages = ['news_extract'], # this must be the same as the name above 5 | version = '1.0.2', 6 | description = 'news_extract', 7 | author = 'Deen Freelon', 8 | author_email = 'dfreelon@gmail.com', 9 | 
url = 'https://github.com/dfreelon/news_extract/', # use the URL to the github repo 10 | download_url = 'https://github.com/dfreelon/news_extract/', 11 | install_requires = ['pandas'], 12 | keywords = ['information retrieval', 'nexisuni', 'factiva', 'deduplication'], # arbitrary keywords 13 | classifiers = [], 14 | include_package_data=True 15 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Deen Freelon 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # news_extract 2 | Python module to extract articles from NexisUni and Factiva. 3 | 4 | ## Requirements 5 | 6 | * Python 3 7 | * [pandas](https://pandas.pydata.org/) 8 | 9 | ## Installation 10 | 11 | ```pip install news_extract``` 12 | 13 | ## Overview 14 | 15 | ```news_extract``` allows the output of the NexisUni and Factiva databases to be imported into Python. **Note, you must export your documents manually first! This module does not scrape the databases directly; rather, it extracts articles and associated metadata from pre-exported output files.** To use it, you must subscribe to at least one of these databases and use the following instructions to export your articles from each database: 16 | 17 | ### NexisUni export instructions 18 | 19 | 1. Make sure you are exporting full documents with no attachments, not just the results list. 20 | 2. Export in RTF format. (Note: you can export up to 100 articles at a time if you create an individual NexisUni account and change your personal settings accordingly.) 21 | 3. Save documents in a single file. 22 | 4. Uncheck all options on the "Formatting Options" tab. 23 | 24 | ### Factiva export instructions 25 | 26 | 1. 
You must export your documents using the Firefox browser. 27 | 2. After conducting your search, click the "View Selected Articles" button that looks like an eye. 28 | 3. On the right, click the "Display Options" text and select "Full Article/Report plus Indexing." 29 | 4. Click the "Format for Saving" button that looks like a 3.5" floppy disk and select "Article Format." 30 | 5. On the resulting page, select "Save Page As..." from the Firefox menu. 31 | 6. In the "Save as type" dropdown, select "Text Files" and save your file. 32 | 7. [This animated gif](https://rtemis.hypotheses.org/files/2017/02/Factiva-animated-tutorial.gif) shows how to perform steps 2-4 (note: it is in French). 33 | 34 | Once you've exported your file(s), you can do the following: 35 | 36 | ```python 37 | import news_extract as ne 38 | nu_file = 'results1.rtf' #file exported from NexisUni 39 | fc_file = 'results2.txt' #file exported from Factiva 40 | nu_data = ne.nexis_rtf_extract(nu_file) 41 | fc_data = ne.factiva_extract(fc_file) 42 | 43 | print(nu_data[0].keys()) #view field names for first NexisUni article 44 | print(fc_data[0].keys()) #view field names for first Factiva article 45 | 46 | for i in nu_data: 47 | print(i['HEADLINE']) #show all NexisUni headlines 48 | for i in fc_data: 49 | print(i['HD']) #show all Factiva headlines 50 | ``` 51 | 52 | ## Output 53 | 54 | Both ```nexis_rtf_extract``` and ```factiva_extract``` return lists of dicts in which each dict corresponds to an article. The dict keys are field names, while the dict values contain the corresponding metadata. One major difference between the two functions is that ```nexis_rtf_extract``` outputs the same set of metadata fields for all articles, while ```factiva_extract``` auto-extracts the specific field names and values attached to each article. This is due to differences in how the two types of export files are formatted. 55 | 56 | ## Combining Factiva and NexisUni output 57 | 58 | ### Converting field names 59 | 60 | You can use the function ```fix_fac_fieldnames``` to convert Factiva field names to their longer and more descriptive NexisUni equivalents like so: 61 | 62 | ```python 63 | #note that this will only convert eight common field names, leaving the rest intact 64 | fc_converted = ne.fix_fac_fieldnames(fc_data) 65 | ``` 66 | 67 | ### Merging Factiva and NexisUni data into a single Pandas DataFrame 68 | 69 | If you want to analyze data from NexisUni and Factiva in the same project, here's how to do it: 70 | 71 | ```python 72 | nu_plus_fc = nu_data + fc_converted 73 | combined = ne.news_export(nu_plus_fc) 74 | ``` 75 | 76 | The ```news_export``` function performs several operations, including removing duplicates (using a custom algorithm based on the Jaccard coefficient and time of publication) and resolving conflicts between articles with different metadata fields. For the latter, the function by default attempts to export all fields included in at least half the articles. This proportion can be adjusted using the ```field_threshold``` parameter, which accepts proportions between 0 and 1: a value of 0 will attempt to include every metadata field present in at least one article, while a value of 1 will include only those fields present in all articles. 77 | 78 | By default, ```news_export``` returns a Pandas DataFrame containing the output data. You can instead save individual JSON files to disk (i.e., one article per file) by setting the ```to_pandas``` parameter to ```False```.
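For reference, here is a sketch of how the remaining ```news_export``` keyword arguments can be tuned. The parameter names and defaults below come from the function signature in ```news_extract.py```; the specific values and the ```my_corpus``` filename prefix are purely illustrative, and ```nu_plus_fc``` reuses the variable from the example above:

```python
#stricter duplicate matching, shorter comparison window, more permissive field selection
combined = ne.news_export(nu_plus_fc,
                          jacc_threshold=0.8,   #default 0.75; minimum Jaccard similarity for two articles to count as duplicates
                          dup_days=7,           #default 14; only compare articles published within this many days of each other
                          field_threshold=0.25, #default 0.5; keep fields present in at least 25% of articles
                          show_dup_rows=False)  #default True; don't print the indices of removed duplicates

#write one JSON file per article instead of returning a DataFrame;
#files are saved to the working directory as <fn_template>_<number>.json
ne.news_export(nu_plus_fc,
               to_pandas=False,
               fn_template='my_corpus')
```

When ```to_pandas``` is ```False```, ```news_export``` writes the JSON files and returns nothing, so there is no DataFrame to capture.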
79 | -------------------------------------------------------------------------------- /news_extract/striprtf2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Extract text in RTF Files. Refactored to use with Python 3.x 5 | Source: 6 | http://stackoverflow.com/a/188877 7 | Code created by Markus Jarderot: http://mizardx.blogspot.com 8 | """ 9 | 10 | import re 11 | 12 | def striprtf2(text,exclude_dest=['datafield', 13 | 'field', 14 | 'fldrslt']): 15 | pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) 16 | # control words which specify a "destionation". 17 | destinations = set([ 18 | 'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', 19 | 'atnparent','atnref','atntime','atrfend','atrfstart','author','background', 20 | 'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping', 21 | 'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap', 22 | 'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt', 23 | 'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl', 24 | 'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype', 25 | 'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr', 26 | 'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl', 27 | 'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc', 28 | 'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers', 29 | 'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride', 30 | 'listoverridetable','listpicture','liststylename','listtable','listtext', 31 | 'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr', 32 | 'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr', 33 | 'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me', 34 | 'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr', 35 | 'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag', 36 | 'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname', 37 | 'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr', 38 | 'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject', 39 | 'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname', 40 | 'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl', 41 | 'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr', 42 | 'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu', 43 | 'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr', 44 | 'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup', 45 | 'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide', 46 | 'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol', 47 | 'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables', 48 | 'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops', 49 | 'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password', 50 | 'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta', 51 | 
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe', 52 | 'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst', 53 | 'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv', 54 | 'svb','tc','template','themedata','title','txe','ud','upr','userprops', 55 | 'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform', 56 | 'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl', 57 | 'xmlopen', 58 | ]) 59 | if len(exclude_dest) > 0: 60 | for i in exclude_dest: 61 | destinations.remove(i) 62 | 63 | # Translation of some special characters. 64 | specialchars = { 65 | 'par': '\n', 66 | 'sect': '\n\n', 67 | 'page': '\n\n', 68 | 'line': '\n', 69 | 'tab': '\t', 70 | 'emdash': '\u2014', 71 | 'endash': '\u2013', 72 | 'emspace': '\u2003', 73 | 'enspace': '\u2002', 74 | 'qmspace': '\u2005', 75 | 'bullet': '\u2022', 76 | 'lquote': '\u2018', 77 | 'rquote': '\u2019', 78 | 'ldblquote': '\201C', 79 | 'rdblquote': '\u201D', 80 | } 81 | stack = [] 82 | ignorable = False # Whether this group (and all inside it) are "ignorable". 83 | ucskip = 1 # Number of ASCII characters to skip after a unicode character. 84 | curskip = 0 # Number of ASCII characters left to skip 85 | out = [] # Output buffer. 86 | for match in pattern.finditer(text): 87 | word,arg,hex,char,brace,tchar = match.groups() 88 | if brace: 89 | curskip = 0 90 | if brace == '{': 91 | # Push state 92 | stack.append((ucskip,ignorable)) 93 | elif brace == '}': 94 | # Pop state 95 | ucskip,ignorable = stack.pop() 96 | elif char: # \x (not a letter) 97 | curskip = 0 98 | if char == '~': 99 | if not ignorable: 100 | out.append('\xA0') 101 | elif char in '{}\\': 102 | if not ignorable: 103 | out.append(char) 104 | elif char == '*': 105 | ignorable = True 106 | elif word: # \foo 107 | curskip = 0 108 | if word in destinations: 109 | ignorable = True 110 | elif ignorable: 111 | pass 112 | elif word in specialchars: 113 | out.append(specialchars[word]) 114 | elif word == 'uc': 115 | ucskip = int(arg) 116 | elif word == 'u': 117 | c = int(arg) 118 | if c < 0: c += 0x10000 119 | if c > 127: out.append(chr(c)) #NOQA 120 | else: out.append(chr(c)) 121 | curskip = ucskip 122 | elif hex: # \'xx 123 | if curskip > 0: 124 | curskip -= 1 125 | elif not ignorable: 126 | c = int(hex,16) 127 | if c > 127: out.append(chr(c)) #NOQA 128 | else: out.append(chr(c)) 129 | elif tchar: 130 | if curskip > 0: 131 | curskip -= 1 132 | elif not ignorable: 133 | out.append(tchar) 134 | return ''.join(out) -------------------------------------------------------------------------------- /news_extract/news_extract.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 8 11:23:28 2018 4 | 5 | @author: freelon 6 | """ 7 | 8 | import collections 9 | from datetime import datetime 10 | import json 11 | import pandas as pd 12 | import re 13 | from .striprtf2 import striprtf2 14 | import unicodedata 15 | 16 | ### OLD NEXIS REGEXES ### 17 | 18 | re_split_ln = re.compile('[0-9]+\sof\s[0-9]+\sDOCUMENTS') 19 | re_get_field_names = re.compile('(?<=\n\n)([A-Z]+?\:\s)(?!\s)') 20 | re_get_fields = re.compile('(?<=\n\n)(?:[A-Z]+?\:\s)(?!\s)(.+?)(?=\n{2,}|$)',re.DOTALL) 21 | re_field_sub = re.compile('[^A-Z]') 22 | re_get_body_field = re.compile('(?:LENGTH\:\s[0-9]+?\s+?words\n+?)(.+?)(?:\n\n[A-Z]+\:\s[A-Z]{3,})',re.DOTALL) 23 | re_get_date_field = re.compile('[A-Za-z]+?\s[0-9]{1,2}\,\s[0-9]{4}') 24 | re_get_pub_field = 
re.compile('(?:\s+)(.+?)(?=\n)') 25 | re_get_headline_field = re.compile('(?:\d{4}.*?\n\n)(.+?)(?=\n\n)',re.DOTALL) 26 | 27 | ### NEXISUNI REGEXES ### 28 | 29 | re_date_nu = re.compile("\w+\s\d{1,2},\s\d{4}") 30 | re_section_nu = re.compile("(?<=Section: )(.+)") 31 | re_length_nu = re.compile("(?<=Length: )(\d+)(?= words)") 32 | re_byline_nu = re.compile("(?<=Byline: )(.+)") 33 | re_byline2_nu = re.compile("([A-Z\-\s]+)(?=[^A-Z\-\s])") 34 | re_anchors_nu = re.compile("(?<=Anchors: )(.+)") 35 | re_geo_nu = re.compile("(?<=Geographic: )(.+)") 36 | 37 | ### FACTIVA REGEXES ### 38 | 39 | re_split_factiva = re.compile('\nDocument .+\n\n+') 40 | re_get_field_names2 = re.compile('\*[A-Z]{2}\*') 41 | re_get_field_names3 = re.compile('\*[A-Z]{3}\*') 42 | re_get_fields2 = re.compile('(?<=\*[A-Z]{2}\*)(.+?)(?=\s+\*[A-Z]{2,}\*)',re.DOTALL) 43 | re_get_fields3 = re.compile('(?<=\*[A-Z]{3}\*)(.+?)(?=\s+\*[A-Z]{2,}\*)',re.DOTALL) 44 | 45 | ### SHARED REGEXES ### 46 | 47 | re_fix_whitespace = re.compile('\s+') 48 | 49 | ### CURRENTLY UNUSED REGEXES ### 50 | 51 | re_get_housereps = re.compile('(?:Rep\. )([A-Za-z\.\s]+)(?: \(|\,| of)') 52 | re_get_senators = re.compile('(?:Sen\. )([A-Za-z\.\s]+)(?: \(|\,| of)') 53 | 54 | ### FUNCTIONS ### 55 | 56 | def nexis_rtf_extract(nex_rtf): 57 | is_filename = nex_rtf[-4:].lower() == '.rtf' 58 | if is_filename: 59 | rtf_str = open(nex_rtf,"r").read() 60 | nex_str = striprtf2(rtf_str) 61 | else: 62 | nex_str = nex_rtf 63 | nex_str = unicodedata.normalize('NFKD',nex_str) 64 | nex_articles = nex_str.split("End of Document")[:-1] 65 | nex_list = [] 66 | 67 | for x,nex in enumerate(nex_articles): 68 | nex_dict = {} 69 | nex_split = nex.split("\n") 70 | nex_split = [i for i in nex_split if i != ''] 71 | #fields 72 | headline = nex_split[0] 73 | outlet = nex_split[1] 74 | date_str = re_date_nu.findall(nex_split[2])[0] 75 | pub_date = datetime.strptime(date_str, 76 | "%B %d, %Y").isoformat()[:10] 77 | try: 78 | sec_index = [n for n,i 79 | in enumerate(nex_split) 80 | if i[:8] == "Section:"][0] 81 | section = re_section_nu.findall( 82 | nex_split[sec_index])[0] 83 | except IndexError: 84 | section = "" 85 | try: 86 | ct_index = [n for n,i 87 | in enumerate(nex_split) 88 | if i[:7] == "Length:"][0] 89 | word_ct = int( 90 | re_length_nu.findall( 91 | nex_split[ct_index])[0]) 92 | except (IndexError,ValueError): 93 | word_ct = 0 94 | #bylines are a little tricky 95 | try: 96 | by_index = [n for n,i 97 | in enumerate(nex_split) 98 | if i[:7] == "Byline:"][0] 99 | byline = re_byline_nu.findall( 100 | nex_split[by_index])[0] 101 | except IndexError: 102 | by_index = None 103 | if by_index is None: 104 | try: 105 | by_index = [n for n,i 106 | in enumerate(nex_split) 107 | if i[:8] == "Anchors:"][0] 108 | byline = re_anchors_nu.findall( 109 | nex_split[by_index])[0] 110 | except IndexError: 111 | by_index = None 112 | 113 | try: 114 | loc_index = [n for n,i 115 | in enumerate(nex_split) 116 | if i[:11] == "Geographic:"][0] 117 | loc = re_geo_nu.findall(nex_split[loc_index])[0] 118 | except IndexError: 119 | loc = "" 120 | body_start = [n for n,i 121 | in enumerate(nex_split) 122 | if i == "Body"][0] + 1 123 | body_end = [n for n,i 124 | in enumerate(nex_split) 125 | if i == "Classification"][0] 126 | body_text = " ".join(nex_split[body_start:body_end]) 127 | 128 | if by_index is None: 129 | try: 130 | byline = re_byline2_nu.findall(body_text)[0] 131 | except IndexError: 132 | byline = "" 133 | if len(byline) == 1: 134 | byline = "" 135 | 136 | nex_dict['HEADLINE'] = headline 137 | 
nex_dict['OUTLET'] = outlet 138 | nex_dict['DATE'] = pub_date 139 | nex_dict['SECTION'] = section 140 | nex_dict['LENGTH'] = word_ct 141 | nex_dict['BYLINE'] = byline 142 | nex_dict['LOCATION'] = loc 143 | nex_dict['BODY'] = body_text 144 | if is_filename: 145 | nex_dict['FILENAME'] = nex_rtf 146 | else: 147 | nex_dict['FILENAME'] = "" 148 | nex_list.append(nex_dict) 149 | 150 | return nex_list 151 | 152 | def factiva_extract(article_fn): 153 | factiva_list = [] 154 | txt = open(article_fn,encoding='utf8').read() 155 | txt = re_split_factiva.split(txt)[:-1] 156 | for t in txt: 157 | field_names2 = [fn.replace('*','') 158 | for fn 159 | in re_get_field_names2.findall(t)] 160 | field_names3 = [fn.replace('*','') 161 | for fn 162 | in re_get_field_names3.findall(t)] 163 | field_names3.append('TXT') 164 | fields2 = [f.strip() 165 | for f 166 | in re_get_fields2.findall(t)] 167 | fields3 = [f.strip() 168 | for f 169 | in re_get_fields3.findall(t)] 170 | fields3.append('') 171 | 172 | article_dict2 = dict(zip(field_names2,fields2)) 173 | article_dict3 = dict(zip(field_names3,fields3)) 174 | 175 | if 'LP' in article_dict2: 176 | article_dict3['TXT'] += article_dict2['LP'] 177 | del article_dict2['LP'] 178 | if 'TD' in article_dict2: 179 | article_dict3['TXT'] += " " + article_dict2['TD'] 180 | del article_dict2['TD'] 181 | article_dict3['TXT'] = re_fix_whitespace.sub( 182 | ' ',article_dict3['TXT']) 183 | 184 | article_dict = {} 185 | article_dict.update(article_dict2) 186 | article_dict.update(article_dict3) 187 | article_dict['FILENAME'] = article_fn 188 | article_dict['PD'] = datetime.strptime( 189 | article_dict['PD'], 190 | '%d %B %Y').isoformat()[:10] 191 | article_dict['WC'] = int(article_dict['WC'].replace( 192 | " words","")) 193 | factiva_list.append(article_dict) 194 | 195 | return factiva_list 196 | 197 | def fix_fac_fieldnames(factiva_list): 198 | fff_dict = {"SE":"SECTION", 199 | "HD":"HEADLINE", 200 | "PD":"DATE", 201 | "WC":"LENGTH", 202 | "TXT":"BODY", 203 | "SN":"OUTLET", 204 | "RE":"LOCATION", 205 | "BY":"BYLINE"} 206 | 207 | for n,i in enumerate(factiva_list): 208 | for j in fff_dict: 209 | if j in factiva_list[n]: 210 | factiva_list[n][fff_dict[j]] = factiva_list[n][j] 211 | del factiva_list[n][j] 212 | return factiva_list 213 | 214 | def news_export(news_list, 215 | to_pandas=True, 216 | fn_template='nexis', 217 | jacc_threshold=0.75, 218 | show_dup_rows=True, 219 | master_fields=[], 220 | field_threshold=0.5, 221 | dup_days=14): 222 | news_dates = [] 223 | remove_rows = [] 224 | for n,i in enumerate(news_list): 225 | try: 226 | news_dates.append(datetime.strptime(i['DATE'], 227 | "%Y-%m-%d")) 228 | except ValueError: 229 | remove_rows.append(n) 230 | 231 | news_list = [i for n,i 232 | in enumerate(news_list) 233 | if n not in remove_rows] 234 | 235 | print("Removed",len(remove_rows),"articles with bad dates.") 236 | dup_split = [set(i['BODY'].split()) for i in news_list] 237 | dup_rows = [] 238 | news_len = len(news_list) 239 | 240 | for n,i in enumerate(dup_split): 241 | for x,j in enumerate(dup_split): 242 | day_diff = abs(news_dates[x] - news_dates[n]).days 243 | if x > n and len(j) > 0 and day_diff <= dup_days: 244 | jacc = len(dup_split[n].intersection(dup_split[x]))/ \ 245 | len(dup_split[n].union(dup_split[x])) 246 | if jacc >= jacc_threshold: 247 | dup_rows.append(x) 248 | if n % 100 == 0: 249 | print(100*n/news_len,"% done.") 250 | 251 | remove_list = set(dup_rows) 252 | n_dups = len(remove_list) 253 | print(n_dups,'duplicates removed.') 254 | if show_dup_rows == 
True: 255 | print(dup_rows) 256 | news_list = [i for n,i 257 | in enumerate(news_list) 258 | if n not in remove_list] 259 | 260 | if master_fields != []: 261 | master_fields = sorted(master_fields) 262 | else: 263 | for a in news_list: 264 | master_fields.extend(list(a.keys())) 265 | master_top = collections.Counter(master_fields).most_common() 266 | n_articles = len(news_list) 267 | master_fields = [i[0] 268 | for i 269 | in master_top 270 | if i[1]/n_articles >= field_threshold] 271 | master_fields = sorted(list(set(master_fields))) 272 | 273 | if to_pandas == True: 274 | to_df_list = [] 275 | for a in news_list: 276 | article_list = [] 277 | for f in master_fields: 278 | try: 279 | article_list.append(a[f]) 280 | except KeyError: 281 | article_list.append('') 282 | to_df_list.append(article_list) 283 | 284 | news_df = pd.DataFrame(to_df_list,columns=master_fields) 285 | return news_df 286 | 287 | else: 288 | nlen = len(str(len(news_list))) 289 | for n,a in enumerate(news_list): 290 | for f in master_fields: 291 | if f not in a: 292 | a[f] = '' 293 | with open(fn_template + '_' + 294 | str(n+1).zfill(nlen) + 295 | '.json','w') as f: 296 | f.write(json.dumps(a)) 297 | 298 | #old nexis extraction code 299 | def ln_extract(article_fn): 300 | ln_list = [] 301 | txt = open(article_fn,encoding='utf8').read() 302 | txt = re_split_ln.split(txt)[1:] 303 | for n,t in enumerate(txt): 304 | field_names = [] 305 | fields = [] 306 | field_names.append('BODY') 307 | try: 308 | fields.append(re_get_body_field.findall(t)[0] + '.') 309 | except IndexError: 310 | print("Fulltext not found for article at index",n,"in file",article_fn) 311 | continue 312 | field_names.append('DATE') 313 | try: 314 | iso_date = datetime.strptime( 315 | re_get_date_field.findall(t)[0], 316 | "%B %d, %Y").isoformat()[:10] 317 | except ValueError: 318 | print("No date found for article", 319 | n,"in file",article_fn) 320 | continue 321 | fields.append(iso_date) 322 | field_names.append('PUBLICATION') 323 | fields.append(re_get_pub_field.findall(t)[0].strip()) 324 | field_names.append('HEADLINE') 325 | try: 326 | fields.append(re_get_headline_field.findall(t)[0].strip()) 327 | except IndexError: 328 | print("Year not found for article",n,"in file",article_fn) 329 | continue 330 | field_names.append('FILENAME') 331 | fields.append(article_fn) 332 | 333 | field_names.extend([re_field_sub.sub('',fn) 334 | for fn 335 | in re_get_field_names.findall(t)]) 336 | fields.extend([f.strip() 337 | for f 338 | in re_get_fields.findall(t)]) 339 | 340 | article_dict = dict(zip(field_names,fields)) 341 | article_dict['LENGTH'] = article_dict['LENGTH'].replace( 342 | " words","") 343 | if len(fields) != len(field_names): 344 | article_dict['BROKEN'] = True 345 | else: 346 | article_dict['BROKEN'] = False 347 | ln_list.append(article_dict) 348 | 349 | for n,i in enumerate(ln_list): 350 | for j in i: 351 | if j != "BROKEN": 352 | ln_list[n][j] = re_fix_whitespace.sub(' ', 353 | ln_list[n][j]) 354 | return ln_list --------------------------------------------------------------------------------