├── news_extract.egg-info ├── dependency_links.txt ├── requires.txt ├── top_level.txt ├── SOURCES.txt └── PKG-INFO ├── news_extract ├── __init__.py ├── striprtf2.py └── news_extract.py ├── .gitattributes ├── dist ├── news_extract-1.0.2.tar.gz └── news_extract-1.0.2-py3-none-any.whl ├── setup.py ├── LICENSE └── README.md /news_extract.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /news_extract.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /news_extract.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | news_extract 2 | -------------------------------------------------------------------------------- /news_extract/__init__.py: -------------------------------------------------------------------------------- 1 | from .news_extract import * 2 | from .striprtf2 import * -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /dist/news_extract-1.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfreelon/news_extract/HEAD/dist/news_extract-1.0.2.tar.gz -------------------------------------------------------------------------------- /dist/news_extract-1.0.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfreelon/news_extract/HEAD/dist/news_extract-1.0.2-py3-none-any.whl -------------------------------------------------------------------------------- /news_extract.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | news_extract/__init__.py 3 | news_extract/news_extract.py 4 | news_extract/striprtf2.py 5 | news_extract.egg-info/PKG-INFO 6 | news_extract.egg-info/SOURCES.txt 7 | news_extract.egg-info/dependency_links.txt 8 | news_extract.egg-info/requires.txt 9 | news_extract.egg-info/top_level.txt -------------------------------------------------------------------------------- /news_extract.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: news-extract 3 | Version: 1.0.2 4 | Summary: news_extract 5 | Home-page: https://github.com/dfreelon/news_extract/ 6 | Author: Deen Freelon 7 | Author-email: dfreelon@gmail.com 8 | License: UNKNOWN 9 | Download-URL: https://github.com/dfreelon/news_extract/ 10 | Description: UNKNOWN 11 | Keywords: information retrieval,nexisuni,factiva,deduplication 12 | Platform: UNKNOWN 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | setup( 3 | name = 'news_extract', 4 | packages = ['news_extract'], # this must be the same as the name above 5 | version = '1.0.2', 6 | description = 'news_extract', 7 | author = 'Deen Freelon', 8 | author_email = 'dfreelon@gmail.com', 9 | 
url = 'https://github.com/dfreelon/news_extract/', # use the URL to the github repo 10 | download_url = 'https://github.com/dfreelon/news_extract/', 11 | install_requires = ['pandas'], 12 | keywords = ['information retrieval', 'nexisuni', 'factiva', 'deduplication'], # arbitrary keywords 13 | classifiers = [], 14 | include_package_data=True 15 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Deen Freelon 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # news_extract 2 | Python module to extract articles from NexisUni and Factiva. 3 | 4 | ## Requirements 5 | 6 | * Python 3 7 | * [pandas](https://pandas.pydata.org/) 8 | 9 | ## Installation 10 | 11 | ```pip install news_extract``` 12 | 13 | ## Overview 14 | 15 | ```news_extract``` allows the output of the NexisUni and Factiva databases to be imported into Python. **Note, you must export your documents manually first! This module does not scrape the databases directly; rather, it extracts articles and associated metadata from pre-exported output files.** To use it, you must subscribe to at least one of these databases and use the following instructions to export your articles from each database: 16 | 17 | ### NexisUni export instructions 18 | 19 | 1. Make sure you are exporting full documents with no attachments, not just the results list. 20 | 2. Export in RTF format. (Note: you can export up to 100 articles at a time if you create an individual NexisUni account and change your personal settings accordingly.) 21 | 3. Save documents in a single file. 22 | 4. Uncheck all options on the "Formatting Options" tab. 23 | 24 | ### Factiva export instructions 25 | 26 | 1. 
You must export your documents using the Firefox browser. 27 | 2. After conducting your search, click the "View Selected Articles" button that looks like an eye. 28 | 3. On the right, click the "Display Options" text and select "Full Article/Report plus Indexing." 29 | 4. Click the "Format for Saving" button that looks like a 3.5" floppy disk and select "Article Format." 30 | 5. On the resulting page, select "Save Page As..." from the Firefox menu. 31 | 6. In the "Save as type" dropdown, select "Text Files" and save your file. 32 | 7. [This animated gif](https://rtemis.hypotheses.org/files/2017/02/Factiva-animated-tutorial.gif) shows how to perform steps 2-4 (note: it is in French). 33 | 34 | Once you've exported your file(s), you can do the following: 35 | 36 | ```python 37 | import news_extract as ne 38 | nu_file = 'results1.rtf' #file exported from NexisUni 39 | fc_file = 'results2.txt' #file exported from Factiva 40 | nu_data = ne.nexis_rtf_extract(nu_file) 41 | fc_data = ne.factiva_extract(fc_file) 42 | 43 | print(nu_data[0].keys()) #view field names for first NexisUni article 44 | print(fc_data[0].keys()) #view field names for first Factiva article 45 | 46 | for i in nu_data: 47 | print(i['HEADLINE']) #show all NexisUni headlines 48 | for i in fc_data: 49 | print(i['HD']) #show all Factiva headlines 50 | ``` 51 | 52 | ## Output 53 | 54 | Both ```nexis_rtf_extract``` and ```factiva_extract``` return lists of dicts in which each dict corresponds to an article. The dict keys are field names, while the dict values contain the corresponding metadata. One major difference between the two functions is that ```nexis_rtf_extract``` outputs the same set of metadata fields for all articles, while ```factiva_extract``` auto-extracts the specific field names and values attached to each article. This is due to differences in how the two types of export files are formatted. 55 | 56 | ## Combining Factiva and NexisUni output 57 | 58 | ### Converting field names 59 | 60 | You can use the function ```fix_fac_fieldnames``` to convert Factiva field names to their longer and more descriptive NexisUni equivalents like so: 61 | 62 | ```python 63 | #note that this will only convert eight common field names, leaving the rest intact 64 | fc_converted = ne.fix_fac_fieldnames(fc_data) 65 | ``` 66 | 67 | ### Merging Factiva and NexisUni data into a single Pandas DataFrame 68 | 69 | If you want to analyze data from NexisUni and Factiva in the same project, here's how to do it: 70 | 71 | ```python 72 | nu_plus_fc = nu_data + fc_converted 73 | combined = ne.news_export(nu_plus_fc) 74 | ``` 75 | 76 | The ```news_export``` function performs several operations, including removing duplicates (using a custom algorithm based on the Jaccard coefficient and time of publication) and resolving conflicts between articles with different metadata fields. For the latter, the function by default attempts to export all fields included in at least half the articles. This proportion can be adjusted using the ```field_threshold``` parameter, which accepts proportions between 0 and 1: a value of 0 will attempt to include every metadata field present in at least one article, while a value of 1 will include only those fields present in all articles. 77 | 78 | By default, ```news_export``` returns a Pandas DataFrame containing the output data. You can instead save individual JSON files to disk (i.e., one article per file) by setting the ```to_pandas``` parameter to ```False```.
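For reference, here is a sketch of how the remaining ```news_export``` keyword arguments can be tuned. The parameter names and defaults below come from the function signature in ```news_extract.py```; the specific values and the ```my_corpus``` filename prefix are purely illustrative, and ```nu_plus_fc``` reuses the variable from the example above:

```python
#stricter duplicate matching, shorter comparison window, more permissive field selection
combined = ne.news_export(nu_plus_fc,
                          jacc_threshold=0.8,   #default 0.75; minimum Jaccard similarity for two articles to count as duplicates
                          dup_days=7,           #default 14; only compare articles published within this many days of each other
                          field_threshold=0.25, #default 0.5; keep fields present in at least 25% of articles
                          show_dup_rows=False)  #default True; don't print the indices of removed duplicates

#write one JSON file per article instead of returning a DataFrame;
#files are saved to the working directory as <fn_template>_<number>.json
ne.news_export(nu_plus_fc,
               to_pandas=False,
               fn_template='my_corpus')
```

When ```to_pandas``` is ```False```, ```news_export``` writes the JSON files and returns nothing, so there is no DataFrame to capture.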
79 | -------------------------------------------------------------------------------- /news_extract/striprtf2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Extract text in RTF Files. Refactored to use with Python 3.x 5 | Source: 6 | http://stackoverflow.com/a/188877 7 | Code created by Markus Jarderot: http://mizardx.blogspot.com 8 | """ 9 | 10 | import re 11 | 12 | def striprtf2(text,exclude_dest=['datafield', 13 | 'field', 14 | 'fldrslt']): 15 | pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) 16 | # control words which specify a "destionation". 17 | destinations = set([ 18 | 'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', 19 | 'atnparent','atnref','atntime','atrfend','atrfstart','author','background', 20 | 'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping', 21 | 'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap', 22 | 'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt', 23 | 'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl', 24 | 'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype', 25 | 'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr', 26 | 'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl', 27 | 'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc', 28 | 'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers', 29 | 'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride', 30 | 'listoverridetable','listpicture','liststylename','listtable','listtext', 31 | 'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr', 32 | 'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr', 33 | 'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me', 34 | 'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr', 35 | 'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag', 36 | 'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname', 37 | 'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr', 38 | 'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject', 39 | 'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname', 40 | 'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl', 41 | 'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr', 42 | 'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu', 43 | 'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr', 44 | 'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup', 45 | 'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide', 46 | 'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol', 47 | 'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables', 48 | 'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops', 49 | 'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password', 50 | 'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta', 51 | 
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe', 52 | 'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst', 53 | 'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv', 54 | 'svb','tc','template','themedata','title','txe','ud','upr','userprops', 55 | 'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform', 56 | 'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl', 57 | 'xmlopen', 58 | ]) 59 | if len(exclude_dest) > 0: 60 | for i in exclude_dest: 61 | destinations.remove(i) 62 | 63 | # Translation of some special characters. 64 | specialchars = { 65 | 'par': '\n', 66 | 'sect': '\n\n', 67 | 'page': '\n\n', 68 | 'line': '\n', 69 | 'tab': '\t', 70 | 'emdash': '\u2014', 71 | 'endash': '\u2013', 72 | 'emspace': '\u2003', 73 | 'enspace': '\u2002', 74 | 'qmspace': '\u2005', 75 | 'bullet': '\u2022', 76 | 'lquote': '\u2018', 77 | 'rquote': '\u2019', 78 | 'ldblquote': '\201C', 79 | 'rdblquote': '\u201D', 80 | } 81 | stack = [] 82 | ignorable = False # Whether this group (and all inside it) are "ignorable". 83 | ucskip = 1 # Number of ASCII characters to skip after a unicode character. 84 | curskip = 0 # Number of ASCII characters left to skip 85 | out = [] # Output buffer. 86 | for match in pattern.finditer(text): 87 | word,arg,hex,char,brace,tchar = match.groups() 88 | if brace: 89 | curskip = 0 90 | if brace == '{': 91 | # Push state 92 | stack.append((ucskip,ignorable)) 93 | elif brace == '}': 94 | # Pop state 95 | ucskip,ignorable = stack.pop() 96 | elif char: # \x (not a letter) 97 | curskip = 0 98 | if char == '~': 99 | if not ignorable: 100 | out.append('\xA0') 101 | elif char in '{}\\': 102 | if not ignorable: 103 | out.append(char) 104 | elif char == '*': 105 | ignorable = True 106 | elif word: # \foo 107 | curskip = 0 108 | if word in destinations: 109 | ignorable = True 110 | elif ignorable: 111 | pass 112 | elif word in specialchars: 113 | out.append(specialchars[word]) 114 | elif word == 'uc': 115 | ucskip = int(arg) 116 | elif word == 'u': 117 | c = int(arg) 118 | if c < 0: c += 0x10000 119 | if c > 127: out.append(chr(c)) #NOQA 120 | else: out.append(chr(c)) 121 | curskip = ucskip 122 | elif hex: # \'xx 123 | if curskip > 0: 124 | curskip -= 1 125 | elif not ignorable: 126 | c = int(hex,16) 127 | if c > 127: out.append(chr(c)) #NOQA 128 | else: out.append(chr(c)) 129 | elif tchar: 130 | if curskip > 0: 131 | curskip -= 1 132 | elif not ignorable: 133 | out.append(tchar) 134 | return ''.join(out) -------------------------------------------------------------------------------- /news_extract/news_extract.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 8 11:23:28 2018 4 | 5 | @author: freelon 6 | """ 7 | 8 | import collections 9 | from datetime import datetime 10 | import json 11 | import pandas as pd 12 | import re 13 | from .striprtf2 import striprtf2 14 | import unicodedata 15 | 16 | ### OLD NEXIS REGEXES ### 17 | 18 | re_split_ln = re.compile('[0-9]+\sof\s[0-9]+\sDOCUMENTS') 19 | re_get_field_names = re.compile('(?<=\n\n)([A-Z]+?\:\s)(?!\s)') 20 | re_get_fields = re.compile('(?<=\n\n)(?:[A-Z]+?\:\s)(?!\s)(.+?)(?=\n{2,}|$)',re.DOTALL) 21 | re_field_sub = re.compile('[^A-Z]') 22 | re_get_body_field = re.compile('(?:LENGTH\:\s[0-9]+?\s+?words\n+?)(.+?)(?:\n\n[A-Z]+\:\s[A-Z]{3,})',re.DOTALL) 23 | re_get_date_field = re.compile('[A-Za-z]+?\s[0-9]{1,2}\,\s[0-9]{4}') 24 | re_get_pub_field = 
re.compile('(?:\s+)(.+?)(?=\n)') 25 | re_get_headline_field = re.compile('(?:\d{4}.*?\n\n)(.+?)(?=\n\n)',re.DOTALL) 26 | 27 | ### NEXISUNI REGEXES ### 28 | 29 | re_date_nu = re.compile("\w+\s\d{1,2},\s\d{4}") 30 | re_section_nu = re.compile("(?<=Section: )(.+)") 31 | re_length_nu = re.compile("(?<=Length: )(\d+)(?= words)") 32 | re_byline_nu = re.compile("(?<=Byline: )(.+)") 33 | re_byline2_nu = re.compile("([A-Z\-\s]+)(?=[^A-Z\-\s])") 34 | re_anchors_nu = re.compile("(?<=Anchors: )(.+)") 35 | re_geo_nu = re.compile("(?<=Geographic: )(.+)") 36 | 37 | ### FACTIVA REGEXES ### 38 | 39 | re_split_factiva = re.compile('\nDocument .+\n\n+') 40 | re_get_field_names2 = re.compile('\*[A-Z]{2}\*') 41 | re_get_field_names3 = re.compile('\*[A-Z]{3}\*') 42 | re_get_fields2 = re.compile('(?<=\*[A-Z]{2}\*)(.+?)(?=\s+\*[A-Z]{2,}\*)',re.DOTALL) 43 | re_get_fields3 = re.compile('(?<=\*[A-Z]{3}\*)(.+?)(?=\s+\*[A-Z]{2,}\*)',re.DOTALL) 44 | 45 | ### SHARED REGEXES ### 46 | 47 | re_fix_whitespace = re.compile('\s+') 48 | 49 | ### CURRENTLY UNUSED REGEXES ### 50 | 51 | re_get_housereps = re.compile('(?:Rep\. )([A-Za-z\.\s]+)(?: \(|\,| of)') 52 | re_get_senators = re.compile('(?:Sen\. )([A-Za-z\.\s]+)(?: \(|\,| of)') 53 | 54 | ### FUNCTIONS ### 55 | 56 | def nexis_rtf_extract(nex_rtf): 57 | is_filename = nex_rtf[-4:].lower() == '.rtf' 58 | if is_filename: 59 | rtf_str = open(nex_rtf,"r").read() 60 | nex_str = striprtf2(rtf_str) 61 | else: 62 | nex_str = nex_rtf 63 | nex_str = unicodedata.normalize('NFKD',nex_str) 64 | nex_articles = nex_str.split("End of Document")[:-1] 65 | nex_list = [] 66 | 67 | for x,nex in enumerate(nex_articles): 68 | nex_dict = {} 69 | nex_split = nex.split("\n") 70 | nex_split = [i for i in nex_split if i != ''] 71 | #fields 72 | headline = nex_split[0] 73 | outlet = nex_split[1] 74 | date_str = re_date_nu.findall(nex_split[2])[0] 75 | pub_date = datetime.strptime(date_str, 76 | "%B %d, %Y").isoformat()[:10] 77 | try: 78 | sec_index = [n for n,i 79 | in enumerate(nex_split) 80 | if i[:8] == "Section:"][0] 81 | section = re_section_nu.findall( 82 | nex_split[sec_index])[0] 83 | except IndexError: 84 | section = "" 85 | try: 86 | ct_index = [n for n,i 87 | in enumerate(nex_split) 88 | if i[:7] == "Length:"][0] 89 | word_ct = int( 90 | re_length_nu.findall( 91 | nex_split[ct_index])[0]) 92 | except (IndexError,ValueError): 93 | word_ct = 0 94 | #bylines are a little tricky 95 | try: 96 | by_index = [n for n,i 97 | in enumerate(nex_split) 98 | if i[:7] == "Byline:"][0] 99 | byline = re_byline_nu.findall( 100 | nex_split[by_index])[0] 101 | except IndexError: 102 | by_index = None 103 | if by_index is None: 104 | try: 105 | by_index = [n for n,i 106 | in enumerate(nex_split) 107 | if i[:8] == "Anchors:"][0] 108 | byline = re_anchors_nu.findall( 109 | nex_split[by_index])[0] 110 | except IndexError: 111 | by_index = None 112 | 113 | try: 114 | loc_index = [n for n,i 115 | in enumerate(nex_split) 116 | if i[:11] == "Geographic:"][0] 117 | loc = re_geo_nu.findall(nex_split[loc_index])[0] 118 | except IndexError: 119 | loc = "" 120 | body_start = [n for n,i 121 | in enumerate(nex_split) 122 | if i == "Body"][0] + 1 123 | body_end = [n for n,i 124 | in enumerate(nex_split) 125 | if i == "Classification"][0] 126 | body_text = " ".join(nex_split[body_start:body_end]) 127 | 128 | if by_index is None: 129 | try: 130 | byline = re_byline2_nu.findall(body_text)[0] 131 | except IndexError: 132 | byline = "" 133 | if len(byline) == 1: 134 | byline = "" 135 | 136 | nex_dict['HEADLINE'] = headline 137 | 
nex_dict['OUTLET'] = outlet 138 | nex_dict['DATE'] = pub_date 139 | nex_dict['SECTION'] = section 140 | nex_dict['LENGTH'] = word_ct 141 | nex_dict['BYLINE'] = byline 142 | nex_dict['LOCATION'] = loc 143 | nex_dict['BODY'] = body_text 144 | if is_filename: 145 | nex_dict['FILENAME'] = nex_rtf 146 | else: 147 | nex_dict['FILENAME'] = "" 148 | nex_list.append(nex_dict) 149 | 150 | return nex_list 151 | 152 | def factiva_extract(article_fn): 153 | factiva_list = [] 154 | txt = open(article_fn,encoding='utf8').read() 155 | txt = re_split_factiva.split(txt)[:-1] 156 | for t in txt: 157 | field_names2 = [fn.replace('*','') 158 | for fn 159 | in re_get_field_names2.findall(t)] 160 | field_names3 = [fn.replace('*','') 161 | for fn 162 | in re_get_field_names3.findall(t)] 163 | field_names3.append('TXT') 164 | fields2 = [f.strip() 165 | for f 166 | in re_get_fields2.findall(t)] 167 | fields3 = [f.strip() 168 | for f 169 | in re_get_fields3.findall(t)] 170 | fields3.append('') 171 | 172 | article_dict2 = dict(zip(field_names2,fields2)) 173 | article_dict3 = dict(zip(field_names3,fields3)) 174 | 175 | if 'LP' in article_dict2: 176 | article_dict3['TXT'] += article_dict2['LP'] 177 | del article_dict2['LP'] 178 | if 'TD' in article_dict2: 179 | article_dict3['TXT'] += " " + article_dict2['TD'] 180 | del article_dict2['TD'] 181 | article_dict3['TXT'] = re_fix_whitespace.sub( 182 | ' ',article_dict3['TXT']) 183 | 184 | article_dict = {} 185 | article_dict.update(article_dict2) 186 | article_dict.update(article_dict3) 187 | article_dict['FILENAME'] = article_fn 188 | article_dict['PD'] = datetime.strptime( 189 | article_dict['PD'], 190 | '%d %B %Y').isoformat()[:10] 191 | article_dict['WC'] = int(article_dict['WC'].replace( 192 | " words","")) 193 | factiva_list.append(article_dict) 194 | 195 | return factiva_list 196 | 197 | def fix_fac_fieldnames(factiva_list): 198 | fff_dict = {"SE":"SECTION", 199 | "HD":"HEADLINE", 200 | "PD":"DATE", 201 | "WC":"LENGTH", 202 | "TXT":"BODY", 203 | "SN":"OUTLET", 204 | "RE":"LOCATION", 205 | "BY":"BYLINE"} 206 | 207 | for n,i in enumerate(factiva_list): 208 | for j in fff_dict: 209 | if j in factiva_list[n]: 210 | factiva_list[n][fff_dict[j]] = factiva_list[n][j] 211 | del factiva_list[n][j] 212 | return factiva_list 213 | 214 | def news_export(news_list, 215 | to_pandas=True, 216 | fn_template='nexis', 217 | jacc_threshold=0.75, 218 | show_dup_rows=True, 219 | master_fields=[], 220 | field_threshold=0.5, 221 | dup_days=14): 222 | news_dates = [] 223 | remove_rows = [] 224 | for n,i in enumerate(news_list): 225 | try: 226 | news_dates.append(datetime.strptime(i['DATE'], 227 | "%Y-%m-%d")) 228 | except ValueError: 229 | remove_rows.append(n) 230 | 231 | news_list = [i for n,i 232 | in enumerate(news_list) 233 | if n not in remove_rows] 234 | 235 | print("Removed",len(remove_rows),"articles with bad dates.") 236 | dup_split = [set(i['BODY'].split()) for i in news_list] 237 | dup_rows = [] 238 | news_len = len(news_list) 239 | 240 | for n,i in enumerate(dup_split): 241 | for x,j in enumerate(dup_split): 242 | day_diff = abs(news_dates[x] - news_dates[n]).days 243 | if x > n and len(j) > 0 and day_diff <= dup_days: 244 | jacc = len(dup_split[n].intersection(dup_split[x]))/ \ 245 | len(dup_split[n].union(dup_split[x])) 246 | if jacc >= jacc_threshold: 247 | dup_rows.append(x) 248 | if n % 100 == 0: 249 | print(100*n/news_len,"% done.") 250 | 251 | remove_list = set(dup_rows) 252 | n_dups = len(remove_list) 253 | print(n_dups,'duplicates removed.') 254 | if show_dup_rows == 
True: 255 | print(dup_rows) 256 | news_list = [i for n,i 257 | in enumerate(news_list) 258 | if n not in remove_list] 259 | 260 | if master_fields != []: 261 | master_fields = sorted(master_fields) 262 | else: 263 | for a in news_list: 264 | master_fields.extend(list(a.keys())) 265 | master_top = collections.Counter(master_fields).most_common() 266 | n_articles = len(news_list) 267 | master_fields = [i[0] 268 | for i 269 | in master_top 270 | if i[1]/n_articles >= field_threshold] 271 | master_fields = sorted(list(set(master_fields))) 272 | 273 | if to_pandas == True: 274 | to_df_list = [] 275 | for a in news_list: 276 | article_list = [] 277 | for f in master_fields: 278 | try: 279 | article_list.append(a[f]) 280 | except KeyError: 281 | article_list.append('') 282 | to_df_list.append(article_list) 283 | 284 | news_df = pd.DataFrame(to_df_list,columns=master_fields) 285 | return news_df 286 | 287 | else: 288 | nlen = len(str(len(news_list))) 289 | for n,a in enumerate(news_list): 290 | for f in master_fields: 291 | if f not in a: 292 | a[f] = '' 293 | with open(fn_template + '_' + 294 | str(n+1).zfill(nlen) + 295 | '.json','w') as f: 296 | f.write(json.dumps(a)) 297 | 298 | #old nexis extraction code 299 | def ln_extract(article_fn): 300 | ln_list = [] 301 | txt = open(article_fn,encoding='utf8').read() 302 | txt = re_split_ln.split(txt)[1:] 303 | for n,t in enumerate(txt): 304 | field_names = [] 305 | fields = [] 306 | field_names.append('BODY') 307 | try: 308 | fields.append(re_get_body_field.findall(t)[0] + '.') 309 | except IndexError: 310 | print("Fulltext not found for article at index",n,"in file",article_fn) 311 | continue 312 | field_names.append('DATE') 313 | try: 314 | iso_date = datetime.strptime( 315 | re_get_date_field.findall(t)[0], 316 | "%B %d, %Y").isoformat()[:10] 317 | except ValueError: 318 | print("No date found for article", 319 | n,"in file",article_fn) 320 | continue 321 | fields.append(iso_date) 322 | field_names.append('PUBLICATION') 323 | fields.append(re_get_pub_field.findall(t)[0].strip()) 324 | field_names.append('HEADLINE') 325 | try: 326 | fields.append(re_get_headline_field.findall(t)[0].strip()) 327 | except IndexError: 328 | print("Year not found for article",n,"in file",article_fn) 329 | continue 330 | field_names.append('FILENAME') 331 | fields.append(article_fn) 332 | 333 | field_names.extend([re_field_sub.sub('',fn) 334 | for fn 335 | in re_get_field_names.findall(t)]) 336 | fields.extend([f.strip() 337 | for f 338 | in re_get_fields.findall(t)]) 339 | 340 | article_dict = dict(zip(field_names,fields)) 341 | article_dict['LENGTH'] = article_dict['LENGTH'].replace( 342 | " words","") 343 | if len(fields) != len(field_names): 344 | article_dict['BROKEN'] = True 345 | else: 346 | article_dict['BROKEN'] = False 347 | ln_list.append(article_dict) 348 | 349 | for n,i in enumerate(ln_list): 350 | for j in i: 351 | if j != "BROKEN": 352 | ln_list[n][j] = re_fix_whitespace.sub(' ', 353 | ln_list[n][j]) 354 | return ln_list --------------------------------------------------------------------------------