├── .python-version ├── lib ├── tagnews │ ├── geoloc │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── lstm │ │ │ │ ├── __init__.py │ │ │ │ ├── saved │ │ │ │ └── .gitignore │ │ │ │ └── save_model.py │ │ ├── __init__.py │ │ └── tag.py │ ├── crimetype │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── binary_stemmed_logistic │ │ │ │ ├── __init__.py │ │ │ │ └── save_model.py │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── benchmark.py │ │ └── tag.py │ ├── data │ │ ├── ci-data │ │ │ ├── .gitignore │ │ │ ├── newsarticles_category.csv │ │ │ ├── newsarticles_usercoding.csv │ │ │ ├── newsarticles_usercoding_categories.csv │ │ │ ├── newsarticles_trainedcoding.csv │ │ │ └── newsarticles_trainedlocation.csv │ │ ├── .gitignore │ │ └── column_names.txt │ ├── senteval │ │ ├── __init__.py │ │ ├── police_words.py │ │ └── eval.py │ ├── utils │ │ ├── __init__.py │ │ ├── model_helpers.py │ │ ├── quick_map.py │ │ ├── neighborhoods.py │ │ ├── load_vectorizer.py │ │ └── utils.py │ ├── __init__.py │ └── tests │ │ ├── test_crimetype_tag.py │ │ ├── test_load_data.py │ │ └── test_geocoder.py └── notebooks │ ├── extract-geostring-example.ipynb │ ├── keras-glove-testing-api-example.ipynb │ ├── keras-glove-with-street-names-better.ipynb │ └── geo-string-result-explorations.ipynb ├── .pylintrc ├── CODEOWNERS ├── r_models ├── .DS_Store └── qj_models_explore.R ├── .gitignore ├── LICENSE ├── pyproject.toml ├── .travis.yml ├── .github └── workflows │ └── publish.yml ├── README.md └── CONTRIBUTING.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.9 2 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | generated-members=pandas.* 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jherzberg @mchladek @RJWorth -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/.gitignore: -------------------------------------------------------------------------------- 1 | !*.csv 2 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/saved/.gitignore: -------------------------------------------------------------------------------- 1 | *.hdf5 2 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/binary_stemmed_logistic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.tgz 3 | *.bin 4 | glove* 5 | 
-------------------------------------------------------------------------------- /lib/tagnews/geoloc/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tag 2 | 3 | __all__ = [tag] 4 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | from . import eval, police_words 2 | 3 | __all__ = [eval, police_words] 4 | -------------------------------------------------------------------------------- /r_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chicago-justice-project/article-tagging/HEAD/r_models/.DS_Store -------------------------------------------------------------------------------- /lib/tagnews/crimetype/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tag 2 | from . import benchmark 3 | 4 | __all__ = [tag, benchmark] 5 | -------------------------------------------------------------------------------- /lib/tagnews/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import load_data 2 | from . import load_vectorizer 3 | 4 | __all__ = [load_data, load_vectorizer] 5 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/police_words.py: -------------------------------------------------------------------------------- 1 | police_words_list = ["police", "officer", "cop", "officers", "pigs"] 2 | 3 | bins = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1] 4 | num_bins = len(bins) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.ipynb_checkpoints* 3 | *.pkl 4 | lib/tagnews.egg-info 5 | build/ 6 | dist/ 7 | .eggs/ 8 | .cache/ 9 | .DS_Store 10 | .coverage 11 | .pytest_cache* 12 | *.gz 13 | lib/tagnews/data/*.geojson 14 | .vscode* 15 | .idea* 16 | -------------------------------------------------------------------------------- /lib/tagnews/utils/model_helpers.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | 4 | 5 | class LemmaTokenizer(object): 6 | def __init__(self): 7 | self.wnl = WordNetLemmatizer() 8 | 9 | def __call__(self, doc): 10 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 11 | -------------------------------------------------------------------------------- /lib/tagnews/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from . 
import crimetype 3 | 4 | from .crimetype.tag import CrimeTags 5 | from .senteval.eval import SentimentGoogler 6 | from .geoloc.tag import GeoCoder, get_lat_longs_from_geostrings 7 | from .utils.load_data import load_data 8 | from .utils.load_vectorizer import load_glove 9 | 10 | __all__ = [utils, crimetype, CrimeTags, GeoCoder, SentimentGoogler, 11 | get_lat_longs_from_geostrings, load_data, load_glove] 12 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_crimetype_tag.py: -------------------------------------------------------------------------------- 1 | import tagnews 2 | 3 | 4 | class Test_Crimetype(): 5 | @classmethod 6 | def setup_method(cls): 7 | cls.model = tagnews.CrimeTags() 8 | 9 | def test_tagtext(self): 10 | self.model.tagtext('This is example article text') 11 | 12 | def test_tagtext_proba(self): 13 | article = 'Murder afoul, someone has been shot!' 14 | probs = self.model.tagtext_proba(article) 15 | max_prob = probs.max() 16 | max_type = probs.idxmax() 17 | tags = self.model.tagtext(article, 18 | prob_thresh=max_prob-0.001) 19 | assert max_type in tags 20 | -------------------------------------------------------------------------------- /lib/tagnews/utils/quick_map.py: -------------------------------------------------------------------------------- 1 | # EXAMPLE 2 | # https://maps.googleapis.com/maps/api/staticmap?size=400x400&markers=41.8850800,-87.6241350|41.880633,-87.629656&key=KEY 3 | 4 | import webbrowser 5 | 6 | 7 | def generate_api_string(lats_lons, key, size=400): 8 | print('Found {} addresses.'.format(len(lats_lons))) 9 | markers = [] 10 | for addr in lats_lons: 11 | loc = '{},{}'.format(addr[0], addr[1]) 12 | markers.append(loc) 13 | url_markers = '|'.join(markers) 14 | full_str = ('https://maps.googleapis.com/maps/api/staticmap' 15 | '?size={}x{}&markers={}&key={}').format( 16 | size, size, url_markers, key 17 | ) 18 | return full_str 19 | 20 | 21 | def url_open(url): 22 | webbrowser.open_new_tab(url) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Article Tagging Development Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_load_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | import tagnews 7 | 8 | 9 | class Test_LoadData(): 10 | @staticmethod 11 | def setup_method(): 12 | os.makedirs('./tmp/', exist_ok=True) 13 | 14 | @staticmethod 15 | def teardown_method(): 16 | shutil.rmtree('./tmp/', ignore_errors=True) 17 | 18 | def test_load_data(self): 19 | df = tagnews.load_data() 20 | assert df.size 21 | 22 | def test_load_data_nrows(self): 23 | df = tagnews.load_data(nrows=2) 24 | assert df.size 25 | 26 | def test_subsample_and_resave(self): 27 | tagnews.utils.load_data.subsample_and_resave('./tmp/', n=1) 28 | 29 | def test_subsample_and_resave_raises_on_matching_folders(self): 30 | with pytest.raises(RuntimeError): 31 | tagnews.utils.load_data.subsample_and_resave( 32 | './tmp/', input_folder='./tmp/' 33 | ) 34 | 35 | 36 | class Test_LoadGlove(): 37 | def test_load_glove(self): 38 | glove_path = os.path.join( 39 | os.path.dirname(__file__), '..', 'data', 'glove.6B.50d.txt') 40 | glove = tagnews.load_glove(glove_path) 41 | glove.loc['murder'] 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "tagnews" 7 | version = "1.5.0" 8 | description = "automatically tag news articles with justice-related categories and extract location information" 9 | authors = [ 10 | {name = "Kevin Rose"}, 11 | {name = "Josh Herzberg"}, 12 | {name = "Matt Sweeney"}, 13 | ] 14 | readme = "README.md" 15 | requires-python = ">=3.9" 16 | dependencies = [ 17 | "google-cloud-language>=2.17.2", 18 | "h5py>=3.14.0", 19 | "keras>=2.15.0", 20 | "nltk>=3.9.1", 21 | "numpy>=1.26.4", 22 | "pandas>=2.3.2", 23 | "requests>=2.32.5", 24 | "scikit-learn>=1.6.1", 25 | "scipy>=1.13.1", 26 | "shapely>=2.0.7", 27 | "tensorflow==2.15.1", 28 | ] 29 | 30 | [dependency-groups] 31 | dev = [ 32 | "pytest>=8.4.2", 33 | "pytest-cov>=6.2.1", 34 | ] 35 | 36 | [project.urls] 37 | Repository = "https://github.com/chicago-justice-project/article-tagging" 38 | 39 | [tool.setuptools] 40 | package-dir = {"" = "lib"} 41 | packages = [ 42 | "tagnews", 43 | "tagnews.utils", 44 | "tagnews.crimetype", 45 | "tagnews.crimetype.models.binary_stemmed_logistic", 46 | "tagnews.geoloc", 47 | "tagnews.geoloc.models.lstm", 48 | "tagnews.senteval", 49 | "tagnews.tests", 50 | ] 51 | 52 | [tool.setuptools.package-data] 53 | tagnews = [ 54 | "crimetype/models/binary_stemmed_logistic/*.pkl", 55 | "geoloc/models/lstm/saved/*.hdf5", 56 | "data/glove.6B.50d.txt", 57 | "data/Boundaries - Community Areas (current).geojson", 58 | ] 59 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from .tag import CrimeTags 3 | 4 | """ 5 | A command line interface to the automatic article crime taging. 6 | Run with `python -m tagnews.crimetype.cli` 7 | """ 8 | 9 | if __name__ == '__main__': 10 | crimetags = CrimeTags() 11 | 12 | if len(sys.argv) == 1: 13 | print(('Go ahead and start typing.' 14 | '\nIf you are on a UNIX machine, hit ctrl-d when done.' 
15 | '\nIf you are on a Windows machine, hit ctrl-Z and' 16 | ' then Enter when done.')) 17 | s = sys.stdin.read() 18 | preds = crimetags.tagtext_proba(s) 19 | preds = preds.sort_values(ascending=False) 20 | for tag, prob in zip(preds.index, preds.values): 21 | print('{: >5}, {:.9f}'.format(tag, prob)) 22 | else: 23 | if sys.argv[1] in ['-h', '--help']: 24 | h = 'python -m tagnews.crimetype.tag [filename [filename [...]]]\n' 25 | h += '\n' 26 | h += 'If no filenames are provided, read and tag from stdin.\n' 27 | h += '(Use ctrl-d to stop inputting to stdin.)\n' 28 | h += '\n' 29 | h += 'Otherwise, tag all filenames, outputting the tags as a CSV\n' 30 | h += 'to the file .tagged.' 31 | print(h) 32 | quit() 33 | for filename in sys.argv[1:]: 34 | with open(filename) as f_in: 35 | preds = crimetags.tagtext_proba(f_in.read()) 36 | preds = preds.sort_values(ascending=False) 37 | with open(filename + '.tagged', 'w') as f_out: 38 | for tag, prob in zip(preds.index, preds.values): 39 | f_out.write('{: >5}, {:.9f}\n'.format(tag, prob)) 40 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | os: 4 | - linux 5 | 6 | dist: focal 7 | 8 | python: 9 | - "3.8" 10 | - "3.9" 11 | - "3.10" 12 | 13 | env: 14 | - FLAKE8= 15 | - FLAKE8=yes 16 | 17 | jobs: 18 | allow_failures: 19 | - python: "3.8" 20 | env: FLAKE8=yes 21 | 22 | sudo: false 23 | 24 | install: 25 | - pip install numpy 26 | - pip install nltk 27 | - pip install scikit-learn 28 | - pip install pandas 29 | - pip install scipy 30 | - pip install tensorflow 31 | - pip install h5py 32 | - pip install keras 33 | - pip install shapely 34 | - pip install pytest 35 | - pip install pytest-cov 36 | - pip install requests 37 | - pip install google-cloud-language 38 | - | 39 | if [[ $FLAKE8 ]]; then 40 | pip install flake8 41 | else 42 | python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet')" 43 | python -c "import requests;\ 44 | r = requests.get('https://data.cityofchicago.org/api/geospatial/cauq-8yn6?method=export&format=GeoJSON');\ 45 | f = open('Boundaries - Community Areas (current).geojson', 'w');\ 46 | f.write(r.text)" 47 | mv "Boundaries - Community Areas (current).geojson" lib/tagnews/data/ 48 | wget http://nlp.stanford.edu/data/glove.6B.zip --no-check-certificate 49 | python -c "import zipfile; myzip = zipfile.ZipFile('glove.6B.zip'); myzip.extract('glove.6B.50d.txt')" 50 | mv glove.6B.50d.txt lib/tagnews/data/ 51 | rm glove.6B.zip 52 | mv lib/tagnews/data/ci-data/*.csv lib/tagnews/data/ 53 | fi 54 | 55 | before_script: 56 | - cd lib 57 | 58 | script: 59 | - | 60 | if [[ $FLAKE8 ]]; then 61 | flake8 --ignore=E261,E226,E402,W503 62 | else 63 | python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 64 | python -m tagnews.geoloc.models.lstm.save_model 2 65 | python -m pytest --cov-report term-missing --cov=tagnews 66 | fi 67 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | 5 | from ....utils import load_data as ld 6 | from ....utils.model_helpers import LemmaTokenizer 7 | import numpy as np 8 | import sklearn 9 | import sklearn.feature_extraction.text 10 | import sklearn.multiclass 11 | import sklearn.linear_model 12 | import pandas as pd 13 
| 14 | # needed to make pickle-ing work 15 | from nltk import word_tokenize # noqa 16 | from nltk.stem import WordNetLemmatizer # noqa 17 | 18 | np.random.seed(1029384756) 19 | 20 | if len(sys.argv) == 2: 21 | df = ld.load_data(nrows=int(sys.argv[1])) 22 | elif len(sys.argv) == 1: 23 | df = ld.load_data() 24 | else: 25 | raise Exception('BAD ARGUMENTS') 26 | 27 | crime_df = df.loc[df.loc[:, 'OEMC':'TASR'].any(axis=1), :] 28 | crime_df = pd.concat([ 29 | df.loc[df.loc[:, 'OEMC':'TASR'].any(axis=1), :], 30 | df.loc[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()), axis=0) 31 | ], ignore_index=True) 32 | 33 | vectorizer = sklearn.feature_extraction.text.CountVectorizer( 34 | tokenizer=LemmaTokenizer(), 35 | binary=True, 36 | max_features=40000 37 | ) 38 | 39 | clf = sklearn.multiclass.OneVsRestClassifier( 40 | sklearn.linear_model.LogisticRegression(verbose=0) 41 | ) 42 | 43 | X = vectorizer.fit_transform(crime_df['bodytext'].values) 44 | Y = crime_df.loc[:, 'OEMC':'TASR'].values 45 | 46 | clf.fit(X, Y) 47 | 48 | from ...tag import CrimeTags 49 | 50 | crimetags = CrimeTags(clf=clf, vectorizer=vectorizer) 51 | 52 | print(crimetags.tagtext_proba(('This is an article about drugs and' 53 | ' gangs.'))) 54 | 55 | import pickle 56 | 57 | curr_time = time.strftime("%Y%m%d-%H%M%S") 58 | 59 | with open(os.path.join(os.path.split(__file__)[0], 60 | 'model-' + curr_time + '.pkl'), 'wb') as f: 61 | pickle.dump(clf, f) 62 | with open(os.path.join(os.path.split(__file__)[0], 63 | 'vectorizer-' + curr_time + '.pkl'), 'wb') as f: 64 | pickle.dump(vectorizer, f) 65 | -------------------------------------------------------------------------------- /lib/tagnews/utils/neighborhoods.py: -------------------------------------------------------------------------------- 1 | neighborhoods = [ 2 | "Andersonville", 3 | "Archer Heights", 4 | "Ashburn", 5 | "Ashburn Estates", 6 | "Austin", 7 | "Avaondale", 8 | "Belmont Central", 9 | "Beverly", 10 | "Beverly Woods", 11 | "Brainerd", 12 | "Bridgeport", 13 | "Brighton Park", 14 | "Bronceville", 15 | "Bucktown", 16 | "Burnside", 17 | "Calumet Heights", 18 | "Canaryville", 19 | "Clearing", 20 | "Chatham", 21 | "Chinatown", 22 | "Cottage Grove Heights", 23 | "Cragin", 24 | "Dunning", 25 | "East Chicago", 26 | "Edison Park", 27 | "Edgebrook", 28 | "Edgewater", 29 | "Englewood", 30 | "Ford City", 31 | "Gage Park", 32 | "Galewood", 33 | "Garfield Park", 34 | "Garfield Ridge", 35 | "Gold Coast", 36 | "Grand Crossing", 37 | "Gresham", 38 | "Hamilton Park", 39 | "Humboldt Park", 40 | "Hyde Park", 41 | "Jefferson Park", 42 | "Kelvyn Park", 43 | "Kenwood", 44 | "Kilbourn Park", 45 | "Lake Meadows", 46 | "Lakeview", 47 | "Lawndale", 48 | "Lincoln Park", 49 | "Lincoln Square", 50 | "Little Village", 51 | "Logan Square", 52 | "Longwood Manor", 53 | "Loop", 54 | "Marquette Park", 55 | "McKinley Park", 56 | "Midway", 57 | "Morgan Park", 58 | "Montclare", 59 | "Mount Greenwood", 60 | "North Center", 61 | "Norwood Park", 62 | "Old Irving Park", 63 | "Old Town", 64 | "Park Manor", 65 | "Pilsen", 66 | "Princeton Park", 67 | "Portage Park", 68 | "Pullman", 69 | "Ravenswood", 70 | "River North", 71 | "River West", 72 | "Rodgers Park", 73 | "Roscoe VIllage", 74 | "Roseland", 75 | "Sauganash", 76 | "Schorsch Village", 77 | "Scottsdale", 78 | "South Chicago", 79 | "South Deering", 80 | "South Loop", 81 | "South Shore", 82 | "Streeterville", 83 | "Tri-Taylor", 84 | "Ukrainian Village", 85 | "United Center", 86 | "Uptown", 87 | "Vittum Park", 88 | "Washington Heights", 89 | "West 
Elsdon", 90 | "West Loop", 91 | "West Pullman", 92 | "Westlawn", 93 | "Wicker Park", 94 | "Woodlawn", 95 | "Wrigleyville", 96 | "Wrigtwood", 97 | ] 98 | -------------------------------------------------------------------------------- /r_models/qj_models_explore.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | "Models explore" 4 | # Explore results from 5 | # Load result files from loop in quant_justice_models.R 6 | algorithm_summaries <- read.csv("algorith_summaries_031117.csv", stringsAsFactors = F) 7 | ensemble_summaries <- read.csv("ensemble_summaries_031107.csv", stringsAsFactors = F) 8 | algorithm_summaries_detailed <- read.csv("model_performance_measures_031107_cleaned.csv", stringsAsFactors = F) #NOTE: total_positive and total_negative values off for RANDOM FOREST ENSEMBLE 9 | ensemble_summaries_detailed <- read.csv("ensemble_summaries_more_detail_031117_cleaned.csv", stringsAsFactors = F) 10 | 11 | 12 | # Clean 13 | ensemble_summaries_detailed[ensemble_summaries_detailed == 99999] <- NA 14 | algorithm_summaries_detailed[colnames(select(algorithm_summaries_detailed, num_articles_predicted:Recall_for_max_Matt_coef))] <- sapply(algorithm_summaries_detailed[colnames(select(algorithm_summaries_detailed, num_articles_predicted:Recall_for_max_Matt_coef))], as.numeric) 15 | algorithm_summaries_detailed[algorithm_summaries_detailed == 99999.000] <- NA 16 | 17 | 18 | # Example explorations 19 | View(ensemble_summaries_detailed %>% group_by(crime_category) %>% summarise(mean_F_score = mean(F_Score, na.rm = T), 20 | max_F_score = max(F_Score, na.rm = T), 21 | mean_accuracy = mean(Accuracy, na.rm = T), 22 | max_accuracy = max(Accuracy, na.rm = T))) 23 | 24 | View(algorithm_summaries_detailed %>% group_by(model) %>% summarise(mean(AUC, na.rm = T), max(AUC, na.rm = T), 25 | mean(Max_F_score, na.rm = T), max(Max_F_score, na.rm = T), 26 | mean(Max_Accuracy, na.rm = T), max(Max_Accuracy, na.rm = T), 27 | mean(Max_Matt_coef, na.rm = T), max(Max_Matt_coef, na.rm = T))) 28 | 29 | View(algorithm_summaries_detailed %>% group_by(crime_category) %>% summarise(mean(AUC, na.rm = T), max(AUC, na.rm = T), 30 | mean(Max_F_score, na.rm = T), max(Max_F_score, na.rm = T), 31 | mean(Max_Accuracy, na.rm = T), max(Max_Accuracy, na.rm = T), 32 | mean(Max_Matt_coef, na.rm = T), max(Max_Matt_coef, na.rm = T))) -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_category.csv: -------------------------------------------------------------------------------- 1 | 15,Juvenile,JUVE,2011-08-26 20:22:06.828537+00,t,other 2 | 16,Re-Entry,REEN,2011-08-26 20:22:20.557875+00,t,other 3 | 17,Violence,VIOL,2011-08-26 20:22:33.659219+00,t,other 4 | 19,Probation,PROB,2011-08-26 20:23:14.245505+00,t,other 5 | 20,Parole,PARL,2011-08-26 20:23:20.964542+00,t,other 6 | 21,Criminal Justice Policy,CPLY,2011-08-26 20:23:35.241574+00,t,other 7 | 29,Immigration,IMMG,2011-09-06 20:55:25.596941+00,t,other 8 | 31,Unspecified Crime,UNSPC,2011-10-07 18:55:52.911149+00,t,other 9 | 33,Arson,ARSN,2012-03-27 21:22:12.636112+00,t,crimes 10 | 34,Burlgary,BURG,2012-03-27 21:22:24.563428+00,t,crimes 11 | 4,Cook County Circuit Court,CCCC,2011-08-26 20:17:32.200297+00,t,orgs 12 | 5,Cook County Jail,CCJ,2011-08-26 20:17:45.062532+00,t,orgs 13 | 9,Domestic Violence,DOMV,2011-08-26 20:19:47.881876+00,t,crimes 14 | 35,Driving Under the Influence,DUI,2012-03-27 21:22:44.588387+00,t,crimes 15 | 22,Drugs,DRUG,2011-08-26 
20:23:42.081659+00,t,crimes 16 | 30,Environmental Crimes,ENVI,2011-09-07 22:06:48.627019+00,t,crimes 17 | 36,Fraud,FRUD,2012-03-27 21:22:56.963232+00,t,crimes 18 | 24,Gangs,GANG,2011-08-26 20:23:59.845203+00,t,crimes 19 | 14,GLBTQ,GLBTQ,2011-08-26 20:21:54.769447+00,t,crimes 20 | 13,Gun Violence,GUNV,2011-08-26 20:21:24.513693+00,t,crimes 21 | 26,Homicides,HOMI,2011-08-26 20:24:23.339118+00,t,crimes 22 | 37,Robbery,ROBB,2012-03-27 21:23:04.531403+00,t,crimes 23 | 10,Sexual Assault,SEXA,2011-08-26 20:20:38.071264+00,t,crimes 24 | 28,Chicago Police Board,CPBD,2011-09-06 20:54:59.505925+00,t,orgs 25 | 2,Chicago Police Department,CPD,2011-08-26 20:16:27.480709+00,t,orgs 26 | 23,Chicago Public Schools,CPS,2011-08-26 20:23:52.5828+00,t,orgs 27 | 6,Cook County Sheriff's Police,CCSP,2011-08-26 20:18:03.825616+00,t,orgs 28 | 7,Cook County Public Defender's Office,CPUB,2011-08-26 20:18:52.323487+00,t,orgs 29 | 8,Illinois Department of Corrections,IDOC,2011-08-26 20:19:31.308972+00,t,orgs 30 | 3,Cook County State's Attorney's Office,SAO,2011-08-26 20:16:53.005243+00,t,orgs 31 | 11,Police Brutality,POLB,2011-08-26 20:20:53.565396+00,t,policing 32 | 32,Illinois State Court,ILSC,2011-10-07 18:56:14.66409+00,t,orgs 33 | 25,Illinois State Police,ILSP,2011-08-26 20:24:11.965487+00,t,orgs 34 | 27,Independent Police Review Authority,IPRA,2011-09-06 20:53:58.289631+00,t,orgs 35 | 1,Office of Emergency Management & Communications,OEMC,2011-08-25 15:49:06.569879+00,t,orgs 36 | 12,Police Misconduct,POLM,2011-08-26 20:21:04.201743+00,t,policing 37 | 38,Police Use of Taser,TASR,2012-03-27 21:23:26.656998+00,t,policing 38 | 18,Beat Realignment / Police Resouce Allocation,BEAT,2011-08-26 20:22:51.626161+00,t,policing 39 | 39,Civilian Office for Police Accountability,COPA,2017-05-12 19:02:36.800007+00,t,orgs 40 | 40,Deputy Inspector General for Police,DIGP,2017-05-12 19:02:54.114808+00,t,orgs 41 | -------------------------------------------------------------------------------- /lib/tagnews/utils/load_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sklearn.preprocessing 4 | 5 | 6 | def load_glove(vectors_file, normalize=False): 7 | """ 8 | Load a GloVe formatted file, which is simply of the format 9 | 10 | ... 11 | ... 12 | ... 13 | See https://github.com/stanfordnlp/GloVe for more information. 14 | That link also has information on how to download the pre-trained 15 | word vectorizer models. If the file you download is compressed, 16 | you will need to uncompress it before using this function. 17 | 18 | Note that the loading speed and memory usage is highly depdendent 19 | on what model you use. The downloadable model "glove.840B.300d.txt" 20 | will take a few minutes to load and use 2.8 GB of memory, whereas the 21 | model "glove.6B.50d.txt" will take a few seconds and use < 200 MB 22 | of memory. 23 | 24 | Sample usage: 25 | 26 | >>> vectors = load_glove('tagnews/geoloc/glove.6B.50d.txt') 27 | >>> text = 'This is a sentence and stuff.' 28 | >>> # you should use an actual tokenizer for this step. 29 | >>> vectorized_text = vectors.loc[[word.lower() 30 | ... for word in text.split()]] 31 | >>> print(vectorized_text.shape) 32 | (6, 300) 33 | >>> k = 5 34 | >>> import numpy as np 35 | >>> def euc(word): 36 | ... return np.sum((vectors.values-vectors.loc[word].values)**2.0, 1) 37 | ... 
38 | >>> vectors.index[np.argpartition(euc('murder'), range(k))[:k]] 39 | 40 | Inputs: 41 | vectors_file: path to file that contains GloVe formatted word 42 | vectors. 43 | normalize: Should the word vectors be normalized? See 44 | https://stats.stackexchange.com/questions/177905/ for 45 | a good discussion on the topic. 46 | 47 | Retuns: 48 | vectors: NxM pandas dataframe whose rows are indexed by the word. 49 | """ 50 | 51 | with open(vectors_file, 'r', encoding='utf-8') as f: 52 | for vocab_size, line in enumerate(f): 53 | pass 54 | vocab_size += 1 55 | 56 | vec_size = len(line.split(' ')) - 1 57 | vectors = np.zeros((vocab_size, vec_size), dtype=np.float32) 58 | words = np.empty(shape=(vocab_size), dtype=np.dtype('object')) 59 | 60 | with open(vectors_file, 'r', encoding='utf-8') as f: 61 | for i, line in enumerate(f): 62 | line = line.split(' ') 63 | words[i] = line[0] 64 | vectors[i] = [float(x) for x in line[1:]] 65 | 66 | vectors = pd.DataFrame(vectors, index=words, copy=False) 67 | vectors = vectors.loc[~vectors.index.duplicated()] 68 | 69 | if normalize: 70 | sklearn.preprocessing.normalize(vectors, copy=False) 71 | 72 | return vectors 73 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/eval.py: -------------------------------------------------------------------------------- 1 | from google.cloud import language_v2 2 | 3 | from tagnews.senteval.police_words import police_words_list, bins 4 | 5 | 6 | # def process_google_result(text): 7 | # document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT) 8 | # sentiment = client.analyze_entity_sentiment(document=document) 9 | # 10 | # for entity in sentiment.entities: 11 | # clean_entity = "".join(filter(str.isalpha, entity)).lower() 12 | # 13 | # if clean_entity in police_words_list: 14 | # 15 | # for mention in entity.mentions: 16 | # return mention.sentiment.score 17 | 18 | 19 | class SentimentGoogler: 20 | def __init__(self): 21 | self.client = self.connect_to_client() 22 | self.police_words = police_words_list 23 | self.bins = bins[::-1] # reversed because we start with lower numbered bins 24 | self.num_bins = len(bins) 25 | 26 | def run(self, doc_text): 27 | sentiment_ = self.call_api(doc_text) 28 | for entity in sentiment_.entities: 29 | police_entity = self.is_police_entity(entity) 30 | if police_entity: 31 | return self.sentiment_from_entity(police_entity) 32 | 33 | def connect_to_client(self): 34 | return language_v2.LanguageServiceClient() 35 | 36 | def sentiment_from_entity(self, entity): 37 | return entity.sentiment.score 38 | 39 | def call_api(self, doc_text): 40 | """ 41 | Parameters 42 | ---------- 43 | doc_text : str 44 | article text 45 | 46 | Returns 47 | ------- 48 | sentiment : json 49 | google response call 50 | """ 51 | document = language_v2.Document(content=doc_text, type_=language_v2.Document.Type.PLAIN_TEXT) 52 | sentiment = self.client.analyze_entity_sentiment(document=document) 53 | 54 | return sentiment 55 | 56 | def is_police_entity(self, entity): 57 | if entity in self.police_words: 58 | return entity 59 | for mention in entity.mentions: 60 | if pre_process_text(mention.text.content) in self.police_words: 61 | return entity 62 | return False 63 | 64 | def extract_google_priority_bin(self, article:str, cpd_model_val=1, cpd_val=1): 65 | cop_word_counts = sum([article.count(substr) for substr in self.police_words]) 66 | score = 0.5 * cpd_val + 0.25 * cpd_model_val + 0.25 * min(cop_word_counts / (2 * len(self.police_words)), 1.) 
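# self.bins is stored in descending order, so this list comprehension keeps the smallest bin edge that is still >= score; higher bin indices therefore correspond to lower combined scores.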
67 | bin = [bin for bin, bin_max_val in enumerate(self.bins) if bin_max_val >= score][-1] 68 | return bin 69 | 70 | 71 | def pre_process_text(html_text): 72 | """ 73 | Parameters 74 | ---------- 75 | html_text : str 76 | Article text. 77 | 78 | Returns 79 | ------- 80 | words: str 81 | lower case, just letters 82 | """ 83 | words = "".join(filter(str.isalpha, html_text)).lower() 84 | return words 85 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_geocoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import tagnews 5 | 6 | 7 | class Test_GeoCoder: 8 | @classmethod 9 | def setup_class(cls): 10 | cls.model = tagnews.GeoCoder() 11 | 12 | def test_extract_geostrings(self): 13 | self.model.extract_geostrings( 14 | ( 15 | "This is example article text with a location of" 16 | " 55th and Woodlawn where something happened." 17 | ) 18 | ) 19 | 20 | def test_extract_geostring_probs(self): 21 | article = ( 22 | "This is example article text with a location of" 23 | " 55th and Woodlawn where something happened." 24 | ) 25 | words, probs = self.model.extract_geostring_probs(article) 26 | max_prob = probs.max() 27 | max_word = words[np.argmax(probs)] 28 | geostrings = self.model.extract_geostrings( 29 | article, prob_thresh=max_prob - 0.001 30 | ) 31 | assert max_word in [word for geostring in geostrings for word in geostring][0] 32 | 33 | def test_extract_geostring_probs_word_not_in_glove(self): 34 | """ 35 | Regression test for issue #105. 36 | """ 37 | article = "___1234567890nonexistent0987654321___" 38 | words, probs = self.model.extract_geostring_probs(article) 39 | 40 | def test_lat_longs_from_geostring_lists(self): 41 | geostring_lists = [ 42 | ["5500", "S", "Woodlawn"], 43 | ["100", "N.", "Wacker"], 44 | ["thigh"], 45 | ] 46 | coords, scores = self.model.lat_longs_from_geostring_lists( 47 | geostring_lists, sleep_secs=0.0 48 | ) 49 | 50 | assert coords.shape[0] == len(geostring_lists) == len(scores) 51 | 52 | def test_community_areas(self): 53 | # Approximately 55th and Woodlawn, which is in Hyde Park. 
54 | coords = pd.DataFrame([[41.793465, -87.596930]], columns=["lat", "long"]) 55 | com_area = self.model.community_area_from_coords(coords) 56 | assert com_area == ["HYDE PARK"] 57 | 58 | def test_best_geostring(self): 59 | """Verify that the best_geostring function returns expected values""" 60 | # Example from the readme 61 | input1 = ( 62 | [ 63 | ["1700", "block", "of", "S.", "Halsted", "Ave."], 64 | ["55th", "and", "Woodlawn,"], 65 | ], 66 | [ 67 | np.array( 68 | [ 69 | 0.71738559, 70 | 0.81395197, 71 | 0.82227415, 72 | 0.79400611, 73 | 0.70529455, 74 | 0.60538059, 75 | ] 76 | ), 77 | np.array( 78 | [ 79 | 0.79358339, 80 | 0.69696939, 81 | 0.68011874 82 | ] 83 | ), 84 | ], 85 | ) 86 | output1 = ["1700", "block", "of", "S.", "Halsted", "Ave."] 87 | # Empty geostring example 88 | input2, output2 = [(), ()], '' 89 | for inpt, expected_output in zip([input1, input2], [output1, output2]): 90 | actual_output = self.model.best_geostring(inpt) 91 | assert ( 92 | actual_output == expected_output 93 | ), "ERROR: expected output != actual output for input {}/n {} != {}".format( 94 | inpt, actual_output, expected_output 95 | ) 96 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | # Allow manual triggers from any branch 5 | workflow_dispatch: 6 | inputs: 7 | environment: 8 | description: 'Choose environment to deploy to' 9 | required: true 10 | default: 'testpypi' 11 | type: choice 12 | options: 13 | - testpypi 14 | - pypi 15 | 16 | # Automatically trigger on new releases 17 | release: 18 | types: [published] 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | environment: ${{ github.event.inputs.environment || 'pypi' }} 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version-file: ".python-version" 32 | 33 | - name: Install uv 34 | uses: astral-sh/setup-uv@v6 35 | with: 36 | version: "0.8.9" 37 | 38 | - name: Install build/package dependencies 39 | run: uv sync --locked --all-extras --dev 40 | 41 | - name: Download required data files 42 | run: | 43 | # Download NLTK data 44 | uv run python -c "import nltk; nltk.download('punkt_tab', '.venv/nltk_data'); nltk.download('wordnet', '.venv/nltk_data')" 45 | 46 | # Download geographic data 47 | curl "https://data.cityofchicago.org/api/geospatial/igwz-8jzy?method=export&format=GeoJSON" -o "lib/tagnews/data/Boundaries - Community Areas (current).geojson" 48 | 49 | # Download and extract GloVe 50 | curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip 51 | unzip glove.6B.zip glove.6B.50d.txt -d lib/tagnews/data 52 | rm glove.6B.zip 53 | 54 | # Move test data 55 | mv lib/tagnews/data/ci-data/*.csv lib/tagnews/data/ 56 | 57 | - name: Train and save models 58 | run: | 59 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 60 | uv run python -m tagnews.geoloc.models.lstm.save_model 2 61 | 62 | - name: Build package 63 | run: uv build 64 | 65 | - name: Store the distribution packages 66 | uses: actions/upload-artifact@v4 67 | with: 68 | name: python-package-distributions 69 | path: dist/ 70 | 71 | publish-to-pypi: 72 | name: Publish to PyPI 73 | if: ${{ github.event.inputs.environment == 'pypi' }} 74 | needs: 75 | - build 76 | runs-on: ubuntu-latest 77 | 78 | environment: 79 | name: pypi 80 | url: https://pypi.org/p/tagnews 81 | 82 | 
permissions: 83 | id-token: write 84 | 85 | steps: 86 | - name: Download all the dists 87 | uses: actions/download-artifact@v4 88 | with: 89 | name: python-package-distributions 90 | path: dist/ 91 | - name: Publish distribution to PyPI 92 | uses: pypa/gh-action-pypi-publish@release/v1 93 | 94 | publish-to-testpypi: 95 | name: Publish to Test PyPI 96 | if: ${{ github.event.inputs.environment == 'testpypi' }} 97 | needs: 98 | - build 99 | runs-on: ubuntu-latest 100 | 101 | environment: 102 | name: testpypi 103 | url: https://test.pypi.org/p/tagnews 104 | 105 | permissions: 106 | id-token: write 107 | 108 | steps: 109 | - name: Download all the dists 110 | uses: actions/download-artifact@v4 111 | with: 112 | name: python-package-distributions 113 | path: dist/ 114 | - name: Publish distribution to TestPyPI 115 | uses: pypa/gh-action-pypi-publish@release/v1 116 | with: 117 | repository-url: https://test.pypi.org/legacy/ 118 | verbose: true 119 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/benchmark.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def get_kfold_split(N, k=4): 8 | """ 9 | Create groups used for k-fold cross validation. 10 | 11 | Parameters 12 | ---------- 13 | N : number of samples to split 14 | k : number of groups used for cross validation 15 | 16 | Returns 17 | ------- 18 | List of (index_train, index_test) pairs 19 | """ 20 | np.random.seed(2017) 21 | idx = np.random.permutation(N) 22 | index_pairs = [(np.ones(N).astype(bool), 23 | np.zeros(N).astype(bool)) 24 | for _ in range(k)] 25 | 26 | for i, fold_idx in enumerate(np.array_split(idx, k)): 27 | index_pairs[i][0][fold_idx] = 0 28 | index_pairs[i][1][fold_idx] = 1 29 | 30 | return index_pairs 31 | 32 | 33 | def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4, verbose=False): 34 | """ 35 | Benchmark a classifier on preprocessed data. 36 | 37 | Parameters 38 | ---------- 39 | clf_factory : 40 | Function which returns a classifier. Classifiers implement 41 | a `fit` method and a `predict_proba` method. The parameters 42 | in clf_params_dict will be passed to clf_factory. 43 | X : NxM matrix of features 44 | Y : NxL matrix of binary values. Y[i,j] indicates whether or 45 | not the j'th tag applies to the i'th article. 46 | clf_params_dict : 47 | dictionary of parameters passed to the classifier factory. 48 | If None, no parameters are passed. 49 | k : how many folds to use for cross validation 50 | verbose : Should status be printed?
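Returns ------- dict with keys 'acc' (per-fold accuracy), 'tpr', 'fpr', and 'ppv' (per-fold, per-tag true positive rate, false positive rate, and positive predictive value), and 'clfs' (the classifier fitted on each fold).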
51 | """ 52 | if clf_params_dict is None: 53 | clf_params_dict = {} 54 | 55 | L = Y.shape[1] 56 | 57 | fold_indexes = get_kfold_split(X.shape[0], k) 58 | acc = np.zeros(k) 59 | tpr = np.zeros((k, L)) 60 | fpr = np.zeros((k, L)) 61 | ppv = np.zeros((k, L)) 62 | 63 | clfs = [] 64 | for i, (idx_trn, idx_tst) in enumerate(fold_indexes): 65 | if verbose: 66 | print('step {} of {}...'.format(i, k), end='') 67 | 68 | clf = clf_factory(**clf_params_dict) 69 | 70 | x_trn = X[idx_trn, :] 71 | y_trn = Y[idx_trn, :] 72 | 73 | x_tst = X[idx_tst, :] 74 | y_tst = Y[idx_tst, :] 75 | 76 | clf.fit(x_trn, y_trn) 77 | y_hat = clf.predict_proba(x_tst) 78 | y_hat = y_hat > 0.5 79 | 80 | y_hat.dtype = np.int8 81 | y_tst.dtype = np.int8 82 | 83 | acc[i] = (np.sum(y_tst == y_hat)) / float(y_tst.size) 84 | for j in range(L): 85 | tpr[i, j] = np.sum(y_tst[:, j] & y_hat[:, j]) / np.sum(y_tst[:, j]) 86 | fpr[i, j] = (np.sum(np.logical_not(y_tst[:, j]) & y_hat[:, j]) 87 | / np.sum(np.logical_not(y_tst[:, j]))) 88 | ppv[i, j] = np.sum(y_tst[:, j] & y_hat[:, j]) / np.sum(y_hat[:, j]) 89 | 90 | clfs.append(clf) 91 | 92 | if verbose: 93 | print('done') 94 | 95 | return {'acc': acc, 'tpr': tpr, 'fpr': fpr, 'ppv': ppv, 'clfs': clfs} 96 | 97 | 98 | def predict_articles(clf, vectorizer, df, n=100, seed=1029384756): 99 | np.random.seed(seed) 100 | 101 | pd.set_option('display.max_columns', 100) 102 | pd.set_option('display.float_format', lambda x: '%.6f' % x) 103 | 104 | random_subset = np.random.choice(np.arange(df.shape[0]), 105 | size=n, 106 | replace=False) 107 | 108 | preds = clf.predict_proba(vectorizer.transform( 109 | df.iloc[random_subset, 3].values 110 | )) 111 | preds = pd.DataFrame(preds) 112 | preds.columns = df.loc[:, 'OEMC':'TASR'].columns 113 | 114 | for i, rand_i in enumerate(random_subset): 115 | s = 'Article ID: ' + str(df.index[rand_i]) 116 | s += '\n' + df.iloc[rand_i, 3] 117 | s += '\n Predicted Tags: ' 118 | s += str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values) 119 | s += '\n' + str(preds.iloc[i, :]) 120 | s += '\n' 121 | filename = 'test-tag-' + str(df.index[rand_i]) + '.txt' 122 | with open(filename, 'w', encoding='utf-8') as f: 123 | f.write(s) 124 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/save_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | os.chdir(os.path.split(__file__)[0]) 5 | 6 | import glob 7 | saved_files = glob.glob('saved/weights*.hdf5') 8 | if saved_files: 9 | delete = input(('This will delete existing saved weight' 10 | ' files, proceed? [y/n] ')) 11 | while delete not in ['y', 'n']: 12 | delete = input(('This will delete existing saved weight' 13 | ' files, proceed? [y/n] ')) 14 | if delete == 'y': 15 | for f in saved_files: 16 | os.remove(f) 17 | else: 18 | print('Exiting.') 19 | exit() 20 | 21 | from .... 
import utils 22 | import pandas as pd 23 | from keras.models import Sequential 24 | from keras.layers import LSTM, Dense, TimeDistributed 25 | from keras.utils import to_categorical 26 | from keras.callbacks import ModelCheckpoint 27 | import numpy as np 28 | import json 29 | import requests 30 | import keras 31 | 32 | if len(sys.argv) == 1: 33 | num_epochs = 20 34 | else: 35 | num_epochs = int(sys.argv[1]) 36 | 37 | glove = utils.load_vectorizer.load_glove('../../../data/glove.6B.50d.txt') 38 | # ner = utils.load_data.load_ner_data('../../../data/') 39 | 40 | with open('training.txt', encoding='utf-8') as f: 41 | training_data = f.read() 42 | 43 | training_df = pd.DataFrame([x.split() for x in training_data.split('\n') if x], 44 | columns=['word', 'tag']) 45 | training_df.iloc[:, 1] = training_df.iloc[:, 1].apply(int) 46 | training_df['all_tags'] = 'NA' 47 | 48 | ner = training_df # pd.concat([training_df, ner]).reset_index(drop=True) 49 | ner = ner[['word', 'all_tags', 'tag']] 50 | 51 | ner = pd.concat([ner, 52 | pd.DataFrame(ner['word'].str[0].str.isupper().values), 53 | pd.DataFrame(glove.reindex(ner['word'].str.lower()).values)], 54 | axis='columns') 55 | ner.fillna(value=0.0, inplace=True) 56 | 57 | data_dim = 51 58 | timesteps = 25 # only during training, testing can take arbitrary length. 59 | num_classes = 2 60 | 61 | train_val_split = int(19 * ner.shape[0] / 20.) 62 | 63 | ner_train_idxs = range(0, train_val_split - timesteps, timesteps) 64 | x_train = np.asarray([ner.iloc[i:i+timesteps, 3:].values 65 | for i in ner_train_idxs]).astype(np.float32) 66 | y_train = np.asarray([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2) 67 | for i in ner_train_idxs]).astype(np.float32) 68 | 69 | ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps) 70 | x_val = np.asarray([ner.iloc[i:i+timesteps, 3:].values 71 | for i in ner_val_idxs]).astype(np.float32) 72 | y_val = np.asarray([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2) 73 | for i in ner_val_idxs]).astype(np.float32) 74 | 75 | model = Sequential() 76 | model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim))) 77 | model.add(LSTM(8, return_sequences=True)) 78 | model.add(TimeDistributed(Dense(2, activation='softmax'))) 79 | model.compile(loss='categorical_crossentropy', 80 | optimizer='adam', 81 | metrics=['categorical_accuracy']) 82 | print(model.summary(100)) 83 | 84 | checkpointer = ModelCheckpoint(filepath='./saved/weights-{epoch:02d}.hdf5', 85 | monitor='val_categorical_accuracy', 86 | mode='max', 87 | verbose=1, 88 | save_best_only=True) 89 | 90 | with open('validation.txt', encoding='utf-8') as f: 91 | s = f.read() 92 | val_words = [w for w in s.split('\n') if w] 93 | 94 | gloved_data = pd.concat( 95 | [pd.DataFrame([[w[0].isupper()] for w in val_words]), 96 | glove.reindex([w for w in val_words]).fillna(0).reset_index(drop=True)], 97 | axis='columns' 98 | ) 99 | 100 | 101 | class OurAUC(keras.callbacks.Callback): 102 | def on_epoch_end(self, epoch, logs={}): 103 | # Go to https://geo-extract-tester.herokuapp.com/ and download 104 | # the validation data (validation.txt). 
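# Predict geostring probabilities for the validation words in chunks of glove_time_size rows, write the per-word probabilities to a guesses file, and upload it to the geo-extract-tester scoring API to get an AUC for this epoch.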
105 | 106 | glove_time_size = 100 107 | preds_batched = [] 108 | i = 0 109 | while gloved_data[i:i+glove_time_size].size: 110 | preds_batched.append( 111 | model.predict(np.asarray(np.expand_dims(gloved_data[i:i+glove_time_size], 112 | axis=0)).astype(np.float32))[0][:, 1] 113 | ) 114 | i += glove_time_size 115 | 116 | with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f: 117 | for prob in [p for pred in preds_batched for p in pred]: 118 | f.write(str(prob) + '\n') 119 | 120 | with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f: 121 | url = 'https://geo-extract-tester.herokuapp.com/api/score' 122 | r = requests.post(url, files={'file': f}) 123 | r = json.loads(r.text) 124 | auc = r['auc'] 125 | print('AUC: {:.5f}, high score? {}'.format(auc, r['high_score'])) 126 | 127 | os.remove('guesses-{epoch:02d}.txt'.format(epoch=epoch)) 128 | logs['val_auc'] = auc 129 | 130 | 131 | #our_auc = OurAUC() 132 | 133 | model.fit(x_train, y_train, 134 | epochs=num_epochs, 135 | validation_data=(x_val, y_val), 136 | callbacks=[checkpointer], 137 | verbose=2) 138 | 139 | idx = slice(501, 550) 140 | pd.set_option('display.width', 200) 141 | df_to_print = pd.DataFrame( 142 | model.predict(np.asarray(np.expand_dims(ner.iloc[idx, 3:].values, axis=0)).astype(np.float32))[0][:, 1:], 143 | columns=['prob_geloc'] 144 | ) 145 | print(pd.concat([ner.iloc[idx, :3].reset_index(drop=True), df_to_print], 146 | axis='columns')) 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/chicago-justice-project/article-tagging.svg?branch=master)](https://travis-ci.org/chicago-justice-project/article-tagging) 2 | 3 | # tagnews 4 | 5 | `tagnews` is a Python library that can 6 | 7 | * Automatically categorize the text from news articles with type-of-crime tags, e.g. homicide, arson, gun violence, etc. 8 | * Automatically extract the locations discussed in the news article text, e.g. "55th and Woodlawn" and "1700 block of S. Halsted". 9 | * Retrieve the latitude/longitude pairs for said locations using an instance of the Pelias geocoder hosted by CJP. 10 | * Get the community areas those lat/long pairs belong to using a shapefile downloaded from the city data portal and parsed by the `shapely` Python library. 11 | 12 | Sound interesting? There's example usage below! 13 | 14 | You can find the source code on [GitHub](https://github.com/chicago-justice-project/article-tagging). 15 | 16 | ## Installation 17 | 18 | You can install `tagnews` with pip: 19 | 20 | ``` 21 | pip install tagnews 22 | ``` 23 | 24 | **NOTE:** You will need to install some [NLTK](http://www.nltk.org/) packages as well: 25 | 26 | ```python 27 | >>> import nltk 28 | >>> nltk.download('punkt_tab') 29 | >>> nltk.download('wordnet') 30 | ``` 31 | 32 | Beware: `tagnews` requires Python >= 3.9. 33 | 34 | ## Example 35 | 36 | The main classes are `tagnews.CrimeTags` and `tagnews.GeoCoder`. 37 | 38 | ```python 39 | >>> import tagnews 40 | >>> crimetags = tagnews.CrimeTags() 41 | >>> article_text = ('The homicide occurred at the 1700 block of S. Halsted Ave.' 42 | ... ' It happened just after midnight. Another person was killed at the' 43 | ... ' intersection of 55th and Woodlawn, where a lone gunman') 44 | >>> crimetags.tagtext_proba(article_text) 45 | HOMI 0.739159 46 | VIOL 0.146943 47 | GUNV 0.134798 48 | ...
49 | >>> crimetags.tagtext(article_text, prob_thresh=0.5) 50 | ['HOMI'] 51 | >>> geoextractor = tagnews.GeoCoder() 52 | >>> prob_out = geoextractor.extract_geostring_probs(article_text) 53 | >>> list(zip(*prob_out)) 54 | [..., ('at', 0.0044685714), ('the', 0.005466637), ('1700', 0.7173856), 55 | ('block', 0.81395197), ('of', 0.82227415), ('S.', 0.7940061), 56 | ('Halsted', 0.70529455), ('Ave.', 0.60538065), ...] 57 | >>> geostrings = geoextractor.extract_geostrings(article_text, prob_thresh=0.5) 58 | >>> geostrings 59 | [['1700', 'block', 'of', 'S.', 'Halsted', 'Ave.'], ['55th', 'and', 'Woodlawn,']] 60 | >>> coords, scores = geoextractor.lat_longs_from_geostring_lists(geostrings) 61 | >>> coords 62 | lat long 63 | 0 41.859021 -87.646934 64 | 1 41.794816 -87.597422 65 | >>> scores # confidence in the lat/longs as returned by Pelias, higher is better 66 | array([0.878, 1. ]) 67 | >>> geoextractor.community_area_from_coords(coords) 68 | ['LOWER WEST SIDE', 'HYDE PARK'] 69 | ``` 70 | 71 | ## Limitations 72 | 73 | This project uses Machine Learning to automate data cleaning/preparation tasks that would be cost- and time-prohibitive to perform manually. Like all Machine Learning projects, *the results are not perfect, and in some cases may look just plain bad*. 74 | 75 | We strove to build the best models we could, but perfect accuracy is rarely attainable. If you have thoughts on how to do better, please consider [reporting an issue](https://github.com/chicago-justice-project/article-tagging/issues/new), or better yet [contributing](https://github.com/chicago-justice-project/article-tagging/blob/master/CONTRIBUTING.md). 76 | 77 | ## How can I contribute? 78 | 79 | Great question! Please see [CONTRIBUTING.md](https://github.com/chicago-justice-project/article-tagging/blob/master/CONTRIBUTING.md). 80 | 81 | ## Problems? 82 | 83 | If you have problems, please [report an issue](https://github.com/chicago-justice-project/article-tagging/issues/new). Anything that is behaving unexpectedly is an issue and should be reported. If you are getting bad or unexpected results, that is also an issue and should be reported. We may not be able to do anything about it, but more data rarely degrades performance. 84 | 85 | ## Background 86 | 87 | We want to compare how often different types of crime are reported in certain areas versus how often those crimes actually occur in those areas. In essence, *are some crimes under-represented in certain areas but over-represented in others?* This is the main question driving the analysis. 88 | 89 | This question came from the [Chicago Justice Project](http://chicagojustice.org/). They have been interested in answering this question for quite a while, and have been collecting the data necessary to have a data-backed answer. Their efforts include 90 | 91 | 1. Scraping RSS feeds of articles written by Chicago area news outlets for several years, allowing them to collect almost half a million articles. 92 | 2. Organizing an amazing group of [volunteers](http://chicagojustice.org/volunteer-for-cjp/) who have helped them tag these articles with crime categories like "Gun Violence" and "Drugs", as well as organizations such as "Cook County State's Attorney's Office", "Illinois State Police", "Chicago Police Department", and other miscellaneous categories such as "LGBTQ" and "Immigration". 93 | 3. The web UI used to do this tagging was also recently updated to allow highlighting of geographic information, resulting in several hundred articles with labeled location sub-strings.
94 | 95 | Most of the code for those components can be found [here](https://github.com/chicago-justice-project/chicago-justice). 96 | 97 | A group actively working on this project meets every Tuesday at [Chi Hack Night](https://chihacknight.org/). 98 | 99 | ## See Also 100 | 101 | * [Chicago Justice Project](http://chicagojustice.org/) 102 | * [Source code of other CJP projects](https://github.com/chicago-justice-project) 103 | * [... including the database/web scraping side of things](https://github.com/chicago-justice-project/chicago-justice) 104 | * [What is Chi Hack Night?](https://chihacknight.org/about.html) 105 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/tag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import glob 4 | import time 5 | import pandas as pd 6 | 7 | # not used explicitly, but this needs to be imported like this 8 | # for unpickling to work. 9 | from ..utils.model_helpers import LemmaTokenizer # noqa 10 | 11 | """ 12 | Contains the CrimeTags class that allows tagging of articles. 13 | """ 14 | 15 | MODEL_LOCATION = os.path.join(os.path.split(__file__)[0], 16 | 'models', 17 | 'binary_stemmed_logistic') 18 | 19 | TAGS = ['OEMC', 'CPD', 'SAO', 'CCCC', 'CCJ', 'CCSP', 20 | 'CPUB', 'IDOC', 'DOMV', 'SEXA', 'POLB', 'POLM', 21 | 'GUNV', 'GLBTQ', 'JUVE', 'REEN', 'VIOL', 'BEAT', 22 | 'PROB', 'PARL', 'CPLY', 'DRUG', 'CPS', 'GANG', 'ILSP', 23 | 'HOMI', 'IPRA', 'CPBD', 'IMMG', 'ENVI', 'UNSPC', 24 | 'ILSC', 'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR'] 25 | 26 | 27 | def load_model(location=MODEL_LOCATION): 28 | """ 29 | Load a model from the given folder `location`. 30 | There should be at least one file named model-TIME.pkl and 31 | a file named vectorizer-TIME.pkl inside the folder. 32 | 33 | The files with the most recent timestamp are loaded. 34 | """ 35 | models = glob.glob(os.path.join(location, 'model*.pkl')) 36 | if not models: 37 | raise RuntimeError(('No models to load. Run' 38 | ' "python -m tagnews.crimetype.models.' 39 | 'binary_stemmed_logistic.save_model"')) 40 | model = models.pop() 41 | while models: 42 | model_time = time.strptime(model[-19:-4], '%Y%m%d-%H%M%S') 43 | new_model_time = time.strptime(models[0][-19:-4], '%Y%m%d-%H%M%S') 44 | if model_time < new_model_time: 45 | model = models[0] 46 | models = models[1:] 47 | 48 | with open(model, 'rb') as f: 49 | clf = pickle.load(f) 50 | 51 | with open(os.path.join(location, 'vectorizer-' + model[-19:-4] + '.pkl'), 52 | 'rb') as f: 53 | vectorizer = pickle.load(f) 54 | 55 | return clf, vectorizer 56 | 57 | 58 | class CrimeTags(): 59 | """ 60 | CrimeTags let you tag articles. Neat! 61 | """ 62 | def __init__(self, 63 | model_directory=MODEL_LOCATION, 64 | clf=None, 65 | vectorizer=None): 66 | """ 67 | Load a model from the given `model_directory`. 68 | See `load_model` for more information. 69 | 70 | Alternatively, the classifier and vectorizer can be 71 | provided. If one is provided, then both must be provided. 72 | """ 73 | if clf is None and vectorizer is None: 74 | self.clf, self.vectorizer = load_model(model_directory) 75 | elif clf is None or vectorizer is None: 76 | raise ValueError(('clf and vectorizer must both be None,' 77 | ' or both be not None')) 78 | else: 79 | self.clf, self.vectorizer = clf, vectorizer 80 | 81 | def tagtext_proba(self, text): 82 | """ 83 | Compute the probability each tag applies to the given text. 84 | 85 | inputs: 86 | text: A python string. 
87 | returns: 88 | pred_proba: A pandas series indexed by the tag name. 89 | """ 90 | x = self.vectorizer.transform([text]) 91 | y_hat = self.clf.predict_proba(x) 92 | preds = pd.DataFrame(y_hat) 93 | preds.columns = TAGS 94 | preds = preds.T.iloc[:, 0].sort_values(ascending=False) 95 | return preds 96 | 97 | def tagtext(self, text, prob_thresh=0.5): 98 | """ 99 | Tag a string with labels. 100 | 101 | inputs: 102 | text: A python string. 103 | prob_thresh: The threshold on probability at which point 104 | the tag will be applied. 105 | returns: 106 | preds: A list of tags that have > prob_thresh probability 107 | according to the model. 108 | """ 109 | preds = self.tagtext_proba(text) 110 | return preds[preds > prob_thresh].index.values.tolist() 111 | 112 | def relevant_proba(self, text): 113 | """ 114 | Outputs the probability that the given text is relevant. 115 | This probability is computed naively as the maximum of 116 | the probabilities each tag applies to the text. 117 | 118 | A more nuanced method would compute a joint probability. 119 | 120 | inputs: 121 | text: A python string. 122 | 123 | returns: 124 | relevant_proba: Probability the text is relevant. 125 | """ 126 | return max(self.tagtext_proba(text)) 127 | 128 | def relevant(self, text, prob_thresh=0.05): 129 | """ 130 | Determines whether given text is relevant or not. Relevance 131 | is defined as whether any tag has more than prob_thresh 132 | chance of applying to the text according to the model. 133 | 134 | inputs: 135 | text: A python string. 136 | prob_thresh: The threshold on probability that 137 | determines relevance. If no tags have >= 138 | prob_thresh of applying to the text, then 139 | the text is not relevant. 140 | returns: 141 | relevant: Boolean. Is the text "relevant"? 142 | """ 143 | return len(self.tagtext(text, prob_thresh)) > 0 144 | 145 | def get_contributions(self, text): 146 | """ 147 | Rank the words in the text by their contribution to each 148 | category. This function assumes that clf has an attribute 149 | `coef_` and that vectorizer has an attribute 150 | `inverse_transform`. 151 | 152 | inputs: 153 | text: A python string. 154 | returns: 155 | contributions: Pandas panel keyed off [category, word]. 156 | 157 | Example: 158 | >>> s = 'This is an article about drugs and gangs.' 159 | >>> s += ' Written by the amazing Kevin Rose.' 160 | >>> p = tagger.get_contributions(s) 161 | >>> p['DRUG'].sort_values('weight', ascending=False) 162 | weight 163 | drug 5.549870 164 | copyright 0.366905 165 | gang 0.194773 166 | this 0.124590 167 | an -0.004484 168 | article -0.052026 169 | is -0.085534 170 | about -0.154800 171 | kevin -0.219028 172 | rose -0.238296 173 | and -0.316201 174 | . 
-0.853208 175 | """ 176 | p = {} 177 | vec = self.vectorizer.transform([text]) 178 | vec_inv = self.vectorizer.inverse_transform(vec) 179 | for i, tag in enumerate(TAGS): 180 | p[tag] = pd.DataFrame( 181 | index=vec_inv, 182 | data={'weight': self.clf.coef_[i, vec.nonzero()[1]]} 183 | ) 184 | return pd.Panel(p) 185 | -------------------------------------------------------------------------------- /lib/notebooks/extract-geostring-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\kevin.rose\\Documents\\GitHub\\cjp-article-tagging\\lib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "cd .." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import tagnews\n", 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Download (and extract if needed) a saved glove data from\n", 41 | "# https://github.com/stanfordnlp/GloVe\n", 42 | "# and save it to tagnews/data/\n", 43 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "b'Skipping line 281837: expected 25 fields, saw 34\\n'\n", 56 | "C:\\Users\\kevin.rose\\AppData\\Local\\Continuum\\Anaconda2\\envs\\cjp\\lib\\site-packages\\numpy\\lib\\arraysetops.py:463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 57 | " mask |= (ar1 == a)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "# Download (and extract if needed) the NER data from\n", 63 | "# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data\n", 64 | "# and save it to tagnews/data/\n", 65 | "ner = tagnews.load_ner_data('tagnews/data/')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ner = pd.concat([ner, pd.DataFrame(glove.loc[ner['word'].str.lower()].values)], axis='columns')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Asserted correct vectorizations 998 times.\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "num_asserts = 0\n", 94 | "for i, row in ner.sample(1000).iterrows():\n", 95 | " if not any(row.iloc[2:].isnull()):\n", 96 | " assert (glove.loc[row['word'].lower()].values == row.iloc[3:].values).all()\n", 97 | " num_asserts += 1\n", 98 | "print('Asserted correct vectorizations', num_asserts, 'times.')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "import sklearn.ensemble" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "clf = sklearn.ensemble.RandomForestClassifier()" 121 
| ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# be careful doing this if you are relying on sequential-ness!\n", 132 | "ner.fillna(value=0.0, inplace=True)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 144 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 145 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 146 | " min_samples_leaf=1, min_samples_split=2,\n", 147 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 148 | " oob_score=False, random_state=None, verbose=0,\n", 149 | " warm_start=False)" 150 | ] 151 | }, 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "clf.fit(ner.iloc[:200000, 3:], ner['tag'].iloc[:200000].values)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "array([[ 0.04864864, 0.95135136],\n", 170 | " [ 0.2663006 , 0.7336994 ],\n", 171 | " [ 1. , 0. ]])" 172 | ] 173 | }, 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "clf.predict_proba(glove.loc[['london', 'france', 'napkins']])" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 12, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "# Go to https://geo-extract-tester.herokuapp.com/ and download\n", 192 | "# the validation data (validation.txt).\n", 193 | "with open('validation.txt', encoding='utf-8') as f:\n", 194 | " s = f.read()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 13, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "with open('guesses.txt', 'w') as f:\n", 206 | " for prob in clf.predict_proba(glove.loc[[w for w in s.split('\\n') if w]].fillna(0))[:,1]:\n", 207 | " f.write(str(prob) + '\\n')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "source": [ 216 | "Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!" 
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": true 224 | }, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.6.1" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /lib/tagnews/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import shutil 4 | import pandas as pd 5 | import logging 6 | 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | 10 | def load_all_data(): 11 | # fmt: off 12 | """ articles_df, categories_df, trainedcategoryrelevance_df, trainedcoding_df, usercoding_df, usercoding_categories_df, trainedlocation""" 13 | # fmt: on 14 | newssource = load_newssource() 15 | articles = load_articles() 16 | categories = load_categories() 17 | trainedcategoryrelevance = load_trainedcategoryrelevance() 18 | trainedlocation = load_trainedlocation() 19 | trainedcoding = load_trainedcoding() 20 | usercoding = load_usercoding() 21 | usercoding_categories = load_usercoding_categories() 22 | # trainedsentiment = load_trainedsentiment() 23 | # trainedsentimententities = load_trainedsentimententities() 24 | return ( 25 | newssource, 26 | articles, 27 | categories, 28 | trainedcategoryrelevance, 29 | trainedcoding, 30 | usercoding, 31 | usercoding_categories, 32 | trainedlocation, 33 | # trainedsentiment, 34 | # trainedsentimententities 35 | ) 36 | 37 | def load_data_subset(): 38 | # fmt: off 39 | """ articles_df, categories_df, trainedcategoryrelevance_df, trainedcoding_df, usercoding_df, usercoding_categories_df, trainedlocation""" 40 | # fmt: on 41 | newssource = load_newssource() 42 | articles = load_articles_nohtml() 43 | categories = load_categories() 44 | trainedcategoryrelevance = load_trainedcategoryrelevance() 45 | trainedlocation = load_trainedlocation() 46 | trainedcoding = load_trainedcoding() 47 | usercoding = load_usercoding() 48 | usercoding_categories = load_usercoding_categories() 49 | 50 | return ( 51 | newssource, 52 | articles, 53 | categories, 54 | trainedcategoryrelevance, 55 | trainedcoding, 56 | usercoding, 57 | usercoding_categories, 58 | trainedlocation 59 | ) 60 | 61 | def load_newssource(): 62 | newsource = pd.read_csv( 63 | "./cjp_tables/newsarticles_newssource.csv.gz", header=None, compression="gzip", low_memory=False 64 | ) 65 | newsource.columns = [ 66 | "source_id", 67 | "source_name", 68 | "short_name", 69 | "legacy_feed_id", 70 | ] 71 | print(f"news sources loaded. 
size: {newsource.shape}") 72 | return newsource 73 | 74 | 75 | def load_articles(): 76 | # Read CSV file of articles but exclude the original html (orig_html) column 77 | article = pd.read_csv( 78 | "./cjp_tables/newsarticles_article.csv.gz", header=None, usecols=[0,1,2,4,5,6,7,8,9,10], compression="gzip", low_memory=False 79 | ) 80 | article.columns = [ 81 | "id", 82 | "feedname", 83 | "url", 84 | "title", 85 | "bodytext", 86 | "relevant", 87 | "created", 88 | "last_modified", 89 | "news_source_id", 90 | "author", 91 | ] 92 | print(f"articles loaded. size: {article.shape}") 93 | return article 94 | 95 | def load_articles_nohtml(): 96 | article = pd.read_csv( 97 | "./cjp_tables/newsarticles_article.csv.gz", header=None, compression="gzip", low_memory=False 98 | ) 99 | article.columns = [ 100 | "id", 101 | "feedname", 102 | "url", 103 | "title", 104 | "bodytext", 105 | "relevant", 106 | "created", 107 | "last_modified", 108 | "news_source_id", 109 | "author", 110 | ] 111 | print(f"articles loaded. size: {article.shape}") 112 | return article 113 | 114 | 115 | def load_categories(): 116 | categories = pd.read_csv( 117 | "./cjp_tables/newsarticles_category.csv.gz", header=None, compression="gzip", low_memory=False 118 | ) 119 | categories.columns = ["id", "title", "abbreviation", "created", "active", "kind"] 120 | print(f"categories loaded. size: {categories.shape}") 121 | return categories 122 | 123 | 124 | def load_trainedcategoryrelevance(): 125 | trainedcategoryrelevance = pd.read_csv( 126 | "./cjp_tables/newsarticles_trainedcategoryrelevance.csv.gz", header=None, compression="gzip", low_memory=False 127 | ) 128 | trainedcategoryrelevance.columns = ["id", "relevance", "category_id", "coding_id"] 129 | print(f"trainedcategoryrelevance loaded. size: {trainedcategoryrelevance.shape}") 130 | return trainedcategoryrelevance 131 | 132 | 133 | def load_trainedcoding(): 134 | trainedcoding = pd.read_csv( 135 | "./cjp_tables/newsarticles_trainedcoding.csv.gz", 136 | header=None, 137 | compression="gzip", 138 | low_memory=False 139 | ) 140 | trainedcoding.columns = [ 141 | "id", 142 | "date", 143 | "model_info", 144 | "relevance", 145 | "article_id", 146 | "sentiment", 147 | "bin", 148 | "sentiment_processed", 149 | ] 150 | print(f"trainedcoding loaded. size: {trainedcoding.shape}") 151 | return trainedcoding 152 | 153 | 154 | def load_trainedlocation(): 155 | trainedlocation = pd.read_csv( 156 | "./cjp_tables/newsarticles_trainedlocation.csv.gz", 157 | header=None, 158 | compression="gzip", 159 | low_memory=False 160 | ) 161 | trainedlocation.columns = [ 162 | "id", 163 | "text", 164 | "latitude", 165 | "longitude", 166 | "coding_id", 167 | "confidence", 168 | "neighborhood", 169 | "is_best" 170 | ] 171 | print(f"trainedlocation loaded. size: {trainedlocation.shape}") 172 | return trainedlocation 173 | 174 | 175 | def load_usercoding(): 176 | usercoding = pd.read_csv( 177 | "./cjp_tables/newsarticles_usercoding.csv.gz", header=None, compression="gzip", low_memory=False 178 | ) 179 | usercoding.columns = [ 180 | "id", 181 | "date", 182 | "relevant", 183 | "article_id", 184 | "user_id", 185 | "locations", 186 | "sentiment", 187 | ] 188 | print(f"usercoding loaded. 
size: {usercoding.shape}") 189 | return usercoding 190 | 191 | 192 | def load_usercoding_categories(): 193 | usercoding_categories = pd.read_csv( 194 | "./cjp_tables/newsarticles_usercoding_categories.csv.gz", 195 | header=None, 196 | compression="gzip", 197 | low_memory=False 198 | ) 199 | usercoding_categories.columns = ["id", "usercoding_id", "category_id"] 200 | print(f"usercoding_categories loaded. size: {usercoding_categories.shape}") 201 | return usercoding_categories 202 | 203 | 204 | def load_trainedsentiment(): 205 | trainedsentiment = pd.read_csv( 206 | "./cjp_tables/newsarticles_trainedsentiment.csv.gz", 207 | header=None, 208 | compression="gzip", 209 | low_memory=False 210 | ) 211 | trainedsentiment.columns = [ 212 | "id", 213 | "date", 214 | "api_response", 215 | "coding_id", 216 | ] 217 | print(f"trainedsentiment loaded. size: {trainedsentiment.shape}") 218 | return trainedsentiment 219 | 220 | 221 | def load_trainedsentimententities(): 222 | trainedsentimententities = pd.read_csv( 223 | "./cjp_tables/newsarticles_trainedsentimententities.csv.gz", 224 | header=None, 225 | compression="gzip", 226 | low_memory=False 227 | ) 228 | trainedsentimententities.columns = [ 229 | "id", 230 | "index", 231 | "entity", 232 | "sentiment", 233 | "coding_id", 234 | "response_id", 235 | ] 236 | print(f"trainedsentimententities loaded. size: {trainedsentimententities.shape}") 237 | return trainedsentimententities 238 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_usercoding.csv: -------------------------------------------------------------------------------- 1 | 132,2017-04-30 23:51:50.887415+00,t,132,,[] 2 | 219,2017-04-30 23:51:51.342135+00,t,219,,[] 3 | 1728,2017-04-30 23:51:59.692187+00,t,1761,,[] 4 | 2227,2017-04-30 23:52:02.546656+00,t,2276,,[] 5 | 3789,2017-04-30 23:52:11.735746+00,t,3840,,[] 6 | 4069,2017-04-30 23:52:13.664501+00,t,4121,,[] 7 | 4146,2017-04-30 23:52:14.110165+00,t,4198,,[] 8 | 4233,2017-04-30 23:52:14.538316+00,t,4285,,[] 9 | 4302,2017-04-30 23:52:14.930339+00,t,4354,,[] 10 | 4856,2017-04-30 23:52:17.915424+00,t,4908,,[] 11 | 5770,2017-04-30 23:52:22.935989+00,t,5822,,[] 12 | 6327,2017-04-30 23:52:26.249772+00,t,6380,,[] 13 | 6398,2017-04-30 23:52:26.686142+00,t,6451,,[] 14 | 6480,2017-04-30 23:52:27.364435+00,t,6534,,[] 15 | 6776,2017-04-30 23:52:28.925421+00,t,6831,,[] 16 | 7060,2017-04-30 23:52:31.092362+00,t,7115,,[] 17 | 7107,2017-04-30 23:52:31.438787+00,t,7162,,[] 18 | 7254,2017-04-30 23:52:32.551403+00,t,7309,,[] 19 | 7362,2017-04-30 23:52:33.369139+00,t,7417,,[] 20 | 7710,2017-04-30 23:52:36.095998+00,t,7765,,[] 21 | 7713,2017-04-30 23:52:36.118787+00,t,7768,,[] 22 | 7956,2017-04-30 23:52:37.469579+00,t,8011,,[] 23 | 8373,2017-04-30 23:52:39.798142+00,t,8430,,[] 24 | 8769,2017-04-30 23:52:42.065576+00,t,8826,,[] 25 | 9304,2017-04-30 23:52:45.118601+00,t,9362,,[] 26 | 9932,2017-04-30 23:52:48.919135+00,t,9990,,[] 27 | 9987,2017-04-30 23:52:49.377954+00,t,10045,,[] 28 | 9997,2017-04-30 23:52:49.43844+00,t,10055,,[] 29 | 10301,2017-04-30 23:52:51.064391+00,t,10360,,[] 30 | 10604,2017-04-30 23:52:52.870625+00,t,10663,,[] 31 | 10874,2017-04-30 23:52:54.366605+00,t,10934,,[] 32 | 11159,2017-04-30 23:52:56.28323+00,t,11219,,[] 33 | 11619,2017-04-30 23:52:59.181713+00,t,11679,,[] 34 | 11819,2017-04-30 23:53:00.454832+00,t,11879,,[] 35 | 11847,2017-04-30 23:53:00.622677+00,t,11907,,[] 36 | 11896,2017-04-30 23:53:00.926895+00,t,11956,,[] 37 | 12822,2017-04-30 23:53:05.984266+00,t,12914,,[] 38 | 
13406,2017-04-30 23:53:09.270184+00,t,13498,,[] 39 | 15416,2017-04-30 23:53:19.617777+00,t,15531,,[] 40 | 16772,2017-04-30 23:53:27.433229+00,t,16887,,[] 41 | 17292,2017-04-30 23:53:30.668672+00,t,17407,,[] 42 | 17509,2017-04-30 23:53:32.07463+00,t,17624,,[] 43 | 17808,2017-04-30 23:53:33.953668+00,t,18171,,[] 44 | 18167,2017-04-30 23:53:35.977106+00,t,18534,,[] 45 | 18630,2017-04-30 23:53:38.531453+00,t,18999,,[] 46 | 18783,2017-04-30 23:53:39.49154+00,t,19152,,[] 47 | 21221,2017-04-30 23:53:53.693364+00,t,21593,,[] 48 | 21766,2017-04-30 23:53:57.044322+00,t,22138,,[] 49 | 21772,2017-04-30 23:53:57.081285+00,t,22144,,[] 50 | 22428,2017-04-30 23:54:00.861717+00,t,22800,,[] 51 | 23050,2017-04-30 23:54:05.848318+00,t,24019,,[] 52 | 25991,2017-04-30 23:54:23.585039+00,t,26962,,[] 53 | 27133,2017-04-30 23:54:30.188898+00,t,28104,,[] 54 | 28032,2017-04-30 23:54:35.285352+00,t,29003,,[] 55 | 28126,2017-04-30 23:54:35.815621+00,t,29097,,[] 56 | 31220,2017-04-30 23:54:52.834565+00,t,32192,,[] 57 | 33722,2017-04-30 23:55:06.745282+00,t,34694,,[] 58 | 33873,2017-04-30 23:55:07.777538+00,t,34846,,[] 59 | 33938,2017-04-30 23:55:08.330327+00,t,34911,,[] 60 | 34093,2017-04-30 23:55:09.340739+00,t,35066,,[] 61 | 35017,2017-04-30 23:55:15.075316+00,f,35990,,[] 62 | 37023,2017-04-30 23:55:26.855568+00,t,37996,,[] 63 | 38136,2017-04-30 23:55:33.215386+00,t,39114,,[] 64 | 39556,2017-04-30 23:55:41.143696+00,t,40534,,[] 65 | 40077,2017-04-30 23:55:44.202924+00,f,41056,,[] 66 | 40153,2017-04-30 23:55:44.732382+00,f,41132,,[] 67 | 40306,2017-04-30 23:55:45.526339+00,t,41285,,[] 68 | 40871,2017-04-30 23:55:48.76526+00,t,41851,,[] 69 | 41820,2017-04-30 23:55:53.945199+00,t,42800,,[] 70 | 41852,2017-04-30 23:55:54.10859+00,t,42832,,[] 71 | 43350,2017-04-30 23:56:02.26225+00,t,44332,,[] 72 | 44004,2017-04-30 23:56:06.142511+00,t,44986,,[] 73 | 44438,2017-04-30 23:56:08.611559+00,t,45420,,[] 74 | 46592,2017-04-30 23:56:20.186414+00,t,47574,,[] 75 | 47805,2017-04-30 23:56:26.71788+00,t,48789,,[] 76 | 53469,2017-04-30 23:56:56.080526+00,t,54456,,[] 77 | 55829,2017-04-30 23:57:08.615496+00,t,56816,,[] 78 | 57826,2017-04-30 23:57:19.575145+00,t,58813,,[] 79 | 58242,2017-04-30 23:57:21.829651+00,t,59229,,[] 80 | 58848,2017-04-30 23:57:25.128702+00,t,59836,,[] 81 | 59062,2017-04-30 23:57:26.194408+00,t,60050,,[] 82 | 59597,2017-04-30 23:57:28.999912+00,t,60585,,[] 83 | 60721,2017-04-30 23:57:35.250609+00,t,61709,,[] 84 | 60981,2017-04-30 23:57:36.692958+00,t,61971,,[] 85 | 72429,2017-04-30 23:58:42.287373+00,t,74059,,[] 86 | 74956,2017-04-30 23:59:00.479804+00,t,78269,,[] 87 | 75398,2017-04-30 23:59:02.800105+00,t,78711,,[] 88 | 75663,2017-04-30 23:59:04.153408+00,t,78976,,[] 89 | 76411,2017-04-30 23:59:08.242627+00,t,79724,,[] 90 | 77257,2017-04-30 23:59:13.056702+00,t,80874,,[] 91 | 78695,2017-04-30 23:59:22.060328+00,t,83560,,[] 92 | 80662,2017-04-30 23:59:35.556954+00,t,87395,,[] 93 | 81047,2017-04-30 23:59:37.738259+00,t,87780,,[] 94 | 81846,2017-04-30 23:59:41.760211+00,t,88579,,[] 95 | 82527,2017-04-30 23:59:45.566285+00,t,89261,,[] 96 | 82584,2017-04-30 23:59:45.8657+00,t,89318,,[] 97 | 82926,2017-04-30 23:59:47.529423+00,t,89660,,[] 98 | 83912,2017-04-30 23:59:52.299707+00,t,90646,,[] 99 | 87771,2017-05-01 00:00:11.788641+00,f,94509,,[] 100 | 87800,2017-05-01 00:00:11.919481+00,f,94538,,[] 101 | 90724,2017-05-01 00:00:27.111264+00,f,97498,,[] 102 | 91284,2017-05-01 00:00:30.207335+00,f,98058,,[] 103 | 92580,2017-05-01 00:00:37.140398+00,t,99361,,[] 104 | 94294,2017-05-01 00:00:46.24013+00,t,101077,,[] 105 | 
94603,2017-05-01 00:00:47.925523+00,t,101386,,[] 106 | 95282,2017-05-01 00:00:51.443114+00,t,102066,,[] 107 | 96332,2017-05-01 00:00:57.111733+00,t,103122,,[] 108 | 96361,2017-05-01 00:00:57.267301+00,f,103156,,[] 109 | 97506,2017-05-01 00:01:03.755725+00,t,104306,,[] 110 | 97740,2017-05-01 00:01:04.81288+00,t,104540,,[] 111 | 98644,2017-05-01 00:01:09.924047+00,t,105448,,[] 112 | 99538,2017-05-01 00:01:14.54655+00,t,106346,,[] 113 | 101082,2017-05-01 00:01:22.678444+00,t,107893,,[] 114 | 102047,2017-05-01 00:01:28.15472+00,t,108861,,[] 115 | 106009,2017-05-01 00:01:49.779743+00,t,112858,,[] 116 | 106027,2017-05-01 00:01:49.867794+00,f,112876,,[] 117 | 109564,2017-05-01 00:02:10.073368+00,t,116413,,[] 118 | 113250,2017-05-01 00:02:29.867255+00,t,120162,,[] 119 | 114030,2017-05-01 00:02:34.170748+00,t,120942,,[] 120 | 114749,2017-05-01 00:02:37.969701+00,t,121661,,[] 121 | 115899,2017-05-01 00:02:44.403655+00,f,122811,,[] 122 | 117328,2017-05-01 00:02:52.058585+00,t,124241,,[] 123 | 118240,2017-05-01 00:02:57.183845+00,t,125154,,[] 124 | 119322,2017-05-01 00:03:03.171412+00,t,126236,,[] 125 | 122057,2017-05-01 00:03:18.284443+00,t,128971,,[] 126 | 124718,2017-05-01 00:03:33.214867+00,t,131633,,[] 127 | 125488,2017-05-01 00:03:37.356293+00,t,132403,,[] 128 | 125760,2017-05-01 00:03:38.728316+00,t,132675,,[] 129 | 127428,2017-05-01 00:03:48.033591+00,t,134343,,[] 130 | 127697,2017-05-01 00:03:49.396559+00,f,134612,,[] 131 | 128087,2017-05-01 00:03:51.428787+00,t,135002,,[] 132 | 128637,2017-05-01 00:03:54.390846+00,t,135552,,[] 133 | 128662,2017-05-01 00:03:54.520546+00,t,135577,,[] 134 | 128857,2017-05-01 00:03:55.583598+00,f,135772,,[] 135 | 129483,2017-05-01 00:03:58.949281+00,t,136398,,[] 136 | 129899,2017-05-01 00:04:01.022955+00,f,136814,,[] 137 | 131048,2017-05-01 00:04:08.111101+00,t,137963,,[] 138 | 131765,2017-05-01 00:04:11.956763+00,t,138681,,[] 139 | 132243,2017-05-01 00:04:14.476838+00,t,139159,,[] 140 | 135388,2017-05-01 00:04:32.30874+00,t,142304,,[] 141 | 135705,2017-05-01 00:04:34.105896+00,t,142621,,[] 142 | 139290,2017-05-01 00:04:53.206682+00,t,146218,,[] 143 | 140867,2017-05-01 00:05:01.678874+00,t,147795,,[] 144 | 141314,2017-05-01 00:05:04.282004+00,t,148242,,[] 145 | 142222,2017-05-01 00:05:09.117977+00,t,149150,,[] 146 | 142937,2017-05-01 00:05:12.890425+00,t,149865,,[] 147 | 150134,2017-05-01 00:05:51.013016+00,t,157068,,[] 148 | 151573,2017-05-01 00:05:58.867502+00,t,158507,,[] 149 | 153541,2017-05-01 00:06:09.803195+00,t,160476,,[] 150 | 154021,2017-05-01 00:06:12.487757+00,t,160956,,[] 151 | 154234,2017-05-01 00:06:13.546425+00,f,161169,,[] 152 | 155785,2017-05-01 00:06:21.76145+00,t,162729,,[] 153 | 157346,2017-05-01 00:06:30.15392+00,t,164291,,[] 154 | 157684,2017-05-01 00:06:32.002844+00,t,164631,,[] 155 | 157999,2017-05-01 00:06:33.765279+00,t,164947,,[] 156 | 159958,2017-05-01 00:06:43.879207+00,f,166908,,[] 157 | 160087,2017-05-01 00:06:44.536764+00,f,167037,,[] 158 | 160137,2017-05-01 00:06:44.778934+00,t,167087,,[] 159 | 162892,2017-05-01 00:07:00.330939+00,t,170803,,[] 160 | 165333,2017-05-01 00:07:16.799013+00,t,175658,,[] 161 | 177862,2017-05-01 00:08:37.746581+00,t,201260,,[] 162 | 180249,2017-05-01 00:08:49.482782+00,t,203651,,[] 163 | 184791,2017-05-01 00:09:19.417519+00,t,213344,,[] 164 | 185415,2017-05-01 00:09:22.305448+00,t,213968,,[] 165 | 186861,2017-05-01 00:09:30.312094+00,t,215414,,[] 166 | 193258,2017-05-01 00:10:05.523489+00,t,224701,,[] 167 | 195838,2017-05-01 00:10:59.159411+00,t,250841,,[] 168 | 199329,2017-05-01 
00:12:11.686742+00,t,288705,,[] 169 | 200327,2017-05-01 00:12:20.586629+00,t,292352,,[] 170 | 203291,2017-05-31 19:39:42.124411+00,t,327488,133.0,[] 171 | 204359,2017-06-13 21:37:17.99267+00,t,265658,130.0,[] 172 | 204592,2017-06-15 01:06:02.665167+00,t,313702,134.0,[] 173 | 204900,2017-06-20 18:26:30.680508+00,t,290038,142.0,[] 174 | 204902,2017-06-20 18:29:50.615251+00,t,290036,142.0,[] 175 | 205755,2017-06-26 16:35:12.712312+00,t,281528,142.0,[] 176 | 205788,2017-06-26 16:47:08.644718+00,t,281628,142.0,[] 177 | 210278,2017-07-19 19:46:16.533487+00,t,326413,132.0,[] 178 | 211531,2017-07-31 22:08:27.499098+00,t,353037,2.0,"[{""start"":139,""end"":149,""text"":""South Side""},{""start"":150,""end"":161,""text"":""Fuller Park""},{""start"":605,""end"":635,""text"":""200 block of West 47th Street.""},{""start"":1462,""end"":1482,""text"":""Ashburn neighborhood""},{""start"":1447,""end"":1461,""text"":""Southwest Side""}]" 179 | 211597,2017-07-31 23:54:27.375507+00,t,285008,131.0,[] 180 | 212076,2017-08-02 01:36:33.116888+00,t,285119,131.0,"[{""start"":686,""end"":721,""text"":""Irving Park Road and Western Avenue""}]" 181 | 212124,2017-08-03 17:27:13.961423+00,t,354618,130.0,"[{""start"":324,""end"":333,""text"":""West Side""},{""start"":1773,""end"":1781,""text"":""Lawndale""},{""start"":3303,""end"":3335,""text"":""Polk Street and Francisco Avenue""}]" 182 | 212536,2017-08-16 04:47:42.564931+00,t,315827,130.0,"[{""start"":135,""end"":145,""text"":""South Loop""},{""start"":425,""end"":452,""text"":""1100 block of South Indiana""},{""start"":936,""end"":961,""text"":""South Side Auburn Gresham""},{""start"":993,""end"":1006,""text"":""Humboldt Park""}]" 183 | 221833,2017-10-27 18:04:39.114165+00,t,268139,33.0,[] 184 | 222644,2017-10-30 19:57:12.165916+00,t,267157,157.0,[] 185 | 222854,2017-10-30 23:45:51.357935+00,t,267325,130.0,"[{""start"":1329,""end"":1339,""text"":""Palos Park""},{""start"":477,""end"":487,""text"":""Bridgeview""},{""start"":1882,""end"":1907,""text"":""12700 block of 81st Court""}]" 186 | 222947,2017-10-31 16:33:02.206965+00,t,260241,33.0,[] 187 | 223183,2017-10-31 23:17:12.900255+00,t,242395,130.0,"[{""start"":218,""end"":228,""text"":""South Side""}]" 188 | 224330,2017-11-02 23:54:01.689147+00,t,17881,158.0,[] 189 | 225660,2017-11-07 20:32:34.455332+00,t,368055,157.0,[] 190 | 227740,2017-11-15 02:03:56.81339+00,t,315061,157.0,[] 191 | 228325,2017-11-21 17:01:18.233928+00,t,296467,157.0,[] 192 | 230878,2018-01-10 22:43:11.839972+00,t,303024,33.0,[] 193 | 233151,2018-01-31 17:08:47.581319+00,t,362825,33.0,[] 194 | 233177,2018-01-31 18:56:46.092939+00,t,317223,33.0,[] 195 | -------------------------------------------------------------------------------- /lib/tagnews/data/column_names.txt: -------------------------------------------------------------------------------- 1 | Table "public.newsarticles_newssource" 2 | Column | Type | Modifiers 3 | ----------------+------------------------+---------------------------------------------------------------------- 4 | id | integer | not null default nextval('newsarticles_newssource_id_seq'::regclass) 5 | name | character varying(256) | not null 6 | short_name | character varying(256) | not null 7 | legacy_feed_id | character varying(8) | not null 8 | Indexes: 9 | "newsarticles_newssource_pkey" PRIMARY KEY, btree (id) 10 | "newsarticles_newssource_3ee615f7" btree (legacy_feed_id) 11 | "newsarticles_newssource_legacy_feed_id_5367de32a6bdc03f_like" btree (legacy_feed_id varchar_pattern_ops) 12 | 
"newsarticles_newssource_short_name_1ff6619d20cb947d_like" btree (short_name varchar_pattern_ops) 13 | "newsarticles_newssource_short_name_1ff6619d20cb947d_uniq" btree (short_name) 14 | Referenced by: 15 | TABLE "newsarticles_article" CONSTRAINT "n_news_source_id_6ef491df45588361_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 16 | TABLE "newsarticles_scraperresult" CONSTRAINT "ne_news_source_id_e906324e3d2ac00_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 17 | 18 | Table "public.newsarticles_article" 19 | Column | Type | Modifiers 20 | ----------------+--------------------------+------------------------------------------------------------------- 21 | id | integer | not null default nextval('newsarticles_article_id_seq'::regclass) 22 | feedname | character varying(1) | 23 | url | character varying(1024) | not null 24 | orig_html | text | not null 25 | title | text | not null 26 | bodytext | text | not null 27 | relevant | boolean | 28 | created | timestamp with time zone | not null 29 | last_modified | timestamp with time zone | not null 30 | news_source_id | integer | 31 | author | character varying(1024) | not null 32 | Indexes: 33 | "newsarticles_article_pkey" PRIMARY KEY, btree (id) 34 | "newsarticles_article_url_key" UNIQUE CONSTRAINT, btree (url) 35 | "newsarticles_article_8f28a911" btree (news_source_id) 36 | "newsarticles_article_ba31968f" btree (feedname) 37 | "newsarticles_article_created" btree (created) 38 | "newsarticles_article_e2fa5388" btree (created) 39 | "newsarticles_article_f552707d" btree (relevant) 40 | "newsarticles_article_feedname" btree (feedname) 41 | "newsarticles_article_feedname_6f274b5fd8544257_like" btree (feedname varchar_pattern_ops) 42 | "newsarticles_article_feedname_like" btree (feedname varchar_pattern_ops) 43 | "newsarticles_article_relevant" btree (relevant) 44 | "newsarticles_article_url_3fe47845b28cdc08_like" btree (url varchar_pattern_ops) 45 | Foreign-key constraints: 46 | "n_news_source_id_6ef491df45588361_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 47 | Referenced by: 48 | TABLE "newsarticles_article_categories" CONSTRAINT "newsarti_article_id_438886c21ec59122_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 49 | TABLE "newsarticles_usercoding" CONSTRAINT "newsarti_article_id_54d685c1a8b57e2c_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 50 | TABLE "newsarticles_trainedcoding" CONSTRAINT "newsarticles_trained_article_id_5b9c0111_fk_newsartic" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 51 | 52 | Table "public.newsarticles_usercoding" 53 | Column | Type | Modifiers 54 | ------------+--------------------------+---------------------------------------------------------------------- 55 | id | integer | not null default nextval('newsarticles_usercoding_id_seq'::regclass) 56 | date | timestamp with time zone | not null 57 | relevant | boolean | not null 58 | article_id | integer | not null 59 | user_id | integer | 60 | locations | text | not null 61 | sentiment | integer | 62 | Indexes: 63 | "newsarticles_usercoding_pkey" PRIMARY KEY, btree (id) 64 | "newsarticles_usercoding_article_id_3535f524868d4ee3_uniq" UNIQUE 
CONSTRAINT, btree (article_id, user_id) 65 | "newsarticles_usercoding_article_id_key" UNIQUE CONSTRAINT, btree (article_id) 66 | "newsarticles_usercoding_e8701ad4" btree (user_id) 67 | Foreign-key constraints: 68 | "newsarti_article_id_54d685c1a8b57e2c_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 69 | "newsarticles_usercodin_user_id_6f03990de1e1875c_fk_auth_user_id" FOREIGN KEY (user_id) REFERENCES auth_user(id) DEFERRABLE INITIALLY DEFERRED 70 | Referenced by: 71 | TABLE "newsarticles_usercoding_categories" CONSTRAINT "ne_usercoding_id_3ce766f5753b730e_fk_newsarticles_usercoding_id" FOREIGN KEY (usercoding_id) REFERENCES newsarticles_usercoding(id) DEFERRABLE INITIALLY DEFERRED 72 | 73 | Table "public.newsarticles_category" 74 | Column | Type | Modifiers 75 | --------------+--------------------------+-------------------------------------------------------------------- 76 | id | integer | not null default nextval('newsarticles_category_id_seq'::regclass) 77 | title | character varying(256) | not null 78 | abbreviation | character varying(5) | not null 79 | created | timestamp with time zone | not null 80 | active | boolean | not null 81 | kind | character varying(50) | not null 82 | Indexes: 83 | "newsarticles_category_pkey" PRIMARY KEY, btree (id) 84 | Referenced by: 85 | TABLE "newsarticles_usercoding_categories" CONSTRAINT "newsar_category_id_6f8bff226c05e06b_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 86 | TABLE "newsarticles_article_categories" CONSTRAINT "newsarti_category_id_5876ea9f7b91a1_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 87 | TABLE "newsarticles_trainedcategoryrelevance" CONSTRAINT "newsarticles_trained_category_id_d3c4a714_fk_newsartic" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 88 | 89 | Table "public.newsarticles_usercoding_categories" 90 | Column | Type | Modifiers 91 | ---------------+---------+--------------------------------------------------------------------------------- 92 | id | integer | not null default nextval('newsarticles_usercoding_categories_id_seq'::regclass) 93 | usercoding_id | integer | not null 94 | category_id | integer | not null 95 | Indexes: 96 | "newsarticles_usercoding_categories_pkey" PRIMARY KEY, btree (id) 97 | "newsarticles_usercoding_categorie_usercoding_id_category_id_key" UNIQUE CONSTRAINT, btree (usercoding_id, category_id) 98 | "newsarticles_usercoding_categories_3ca0ec33" btree (usercoding_id) 99 | "newsarticles_usercoding_categories_b583a629" btree (category_id) 100 | Foreign-key constraints: 101 | "ne_usercoding_id_3ce766f5753b730e_fk_newsarticles_usercoding_id" FOREIGN KEY (usercoding_id) REFERENCES newsarticles_usercoding(id) DEFERRABLE INITIALLY DEFERRED 102 | "newsar_category_id_6f8bff226c05e06b_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 103 | 104 | Table "public.newsarticles_trainedcoding" 105 | Column | Type | Modifiers 106 | ------------+--------------------------+------------------------------------------------------------------------- 107 | id | integer | not null default nextval('newsarticles_trainedcoding_id_seq'::regclass) 108 | date | timestamp with time zone | not null 109 | model_info | text | not null 110 | relevance | double precision | not null 111 | article_id 
| integer | not null 112 | sentiment | double precision | 113 | Indexes: 114 | "newsarticles_trainedcoding_pkey" PRIMARY KEY, btree (id) 115 | "newsarticles_trainedcoding_article_id_key" UNIQUE CONSTRAINT, btree (article_id) 116 | Foreign-key constraints: 117 | "newsarticles_trained_article_id_5b9c0111_fk_newsartic" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 118 | Referenced by: 119 | TABLE "newsarticles_trainedcategoryrelevance" CONSTRAINT "newsarticles_trained_coding_id_ad7cc027_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 120 | TABLE "newsarticles_trainedlocation" CONSTRAINT "newsarticles_trained_coding_id_d406a29f_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 121 | 122 | Table "public.newsarticles_trainedlocation" 123 | Column | Type | Modifiers 124 | --------------+------------------+--------------------------------------------------------------------------- 125 | id | integer | not null default nextval('newsarticles_trainedlocation_id_seq'::regclass) 126 | text | text | not null 127 | latitude | double precision | 128 | longitude | double precision | 129 | coding_id | integer | not null 130 | confidence | double precision | 131 | neighborhood | text | not null 132 | Indexes: 133 | "newsarticles_trainedlocation_pkey" PRIMARY KEY, btree (id) 134 | "newsarticles_trainedlocation_coding_id_d406a29f" btree (coding_id) 135 | Foreign-key constraints: 136 | "newsarticles_trained_coding_id_d406a29f_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 137 | 138 | Table "public.newsarticles_trainedcategoryrelevance" 139 | Column | Type | Modifiers 140 | -------------+------------------+------------------------------------------------------------------------------------ 141 | id | integer | not null default nextval('newsarticles_trainedcategoryrelevance_id_seq'::regclass) 142 | relevance | double precision | not null 143 | category_id | integer | not null 144 | coding_id | integer | not null 145 | Indexes: 146 | "newsarticles_trainedcategoryrelevance_pkey" PRIMARY KEY, btree (id) 147 | "newsarticles_trainedcategoryrelevance_category_id_d3c4a714" btree (category_id) 148 | "newsarticles_trainedcategoryrelevance_coding_id_ad7cc027" btree (coding_id) 149 | Foreign-key constraints: 150 | "newsarticles_trained_category_id_d3c4a714_fk_newsartic" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 151 | "newsarticles_trained_coding_id_ad7cc027_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 152 | 153 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Getting Started 4 | 5 | This project is developed with [uv](https://docs.astral.sh/uv/). 6 | Follow the [installation directions](https://docs.astral.sh/uv/getting-started/installation/) from uv's website. 7 | If Python is installed on your system uv will detect and use it. 8 | If not, uv will download Python automatically. 
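If you want to double-check your setup before continuing, running `uv --version` from a shell should print the installed version (this step is optional). Then clone the repository: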
9 | 10 | ``` 11 | git clone https://github.com/chicago-justice-project/article-tagging.git 12 | cd article-tagging 13 | ``` 14 | 15 | ### Get the required data 16 | 17 | Download the [Natural Language Toolkit (NLTK)](http://www.nltk.org/) data: 18 | 19 | ``` 20 | uv run python -c "import nltk; nltk.download('punkt_tab', '.venv/nltk_data')" 21 | uv run python -c "import nltk; nltk.download('wordnet', '.venv/nltk_data')" 22 | ``` 23 | 24 | For the geotagging model you will need the [GloVe](https://nlp.stanford.edu/projects/glove/) pre-trained word vectors: 25 | 26 | ``` 27 | curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip 28 | unzip glove.6B.zip -d lib/tagnews/data 29 | ``` 30 | 31 | You will also need the [Chicago Community Areas boundary data](https://data.cityofchicago.org/d/igwz-8jzy): 32 | 33 | ``` 34 | curl "https://data.cityofchicago.org/api/geospatial/igwz-8jzy?method=export&format=GeoJSON" \ 35 | -o "lib/tagnews/data/Boundaries - Community Areas (current).geojson" 36 | ``` 37 | 38 | The latest data dump from the [Chicago Justice Project](https://github.com/chicago-justice-project/chicago-justice) should be placed in `lib/tagnews/data`. 39 | If you do not have access to the production data you can use the test dataset included in the repo: 40 | 41 | ``` 42 | cp lib/tagnews/data/ci-data/*.csv lib/tagnews/data 43 | ``` 44 | 45 | ### Generate the models 46 | 47 | ``` 48 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 49 | uv run python -m tagnews.geoloc.models.lstm.save_model 50 | ``` 51 | 52 | ### Run tests 53 | 54 | ``` 55 | uv run pytest --cov-report term-missing --cov=tagnews 56 | ``` 57 | 58 | ### Run JupyterLab 59 | 60 | ``` 61 | uv run --with jupyter jupyter lab 62 | ``` 63 | 64 | ## Directory structure 65 | 66 | This project is structured as follows: 67 | 68 | ``` 69 | ├───lib 70 | │ ├───notebooks ............................ Jupyter/IPython notebooks 71 | │ └───tagnews .............................. Python package/source code 72 | │ ├───crimetype ........................ Code related to type-of-crime tagging 73 | │ │ └───models ....................... Filler directory 74 | │ │ └───binary_stemmed_logistic .. Code to train/save crimetype NLP model 75 | │ ├───data ............................. Put the data in here! 76 | │ │ └───ci-data ...................... A tiny subset of data used for testing 77 | │ ├───geoloc ........................... Code related to geocoding 78 | │ │ └───models ....................... Filler directory 79 | │ │ └───lstm ..................... Code *and data* to train/save geostring extractor 80 | │ │ └───saved ................ Where the geostring model is saved. 81 | │ ├───tests ............................ Code used to test this project 82 | │ └───utils ............................ Helper functions, mostly around data loading 83 | └───r_models ................................. R code, unused for a while, use with caution 84 | ``` 85 | 86 | How you want to contribute will dictate which parts you need to know about. 87 | 88 | ## What can I do? 89 | 90 | There are a couple of things you could do; each item listed here is expounded on further below.
91 | 92 | * Improve the type-of-crime model (article text -> type-of-crime tags) 93 | * Improve the geostring extractor model (article text -> list of location strings) 94 | * Improve the geocoding (list of location strings -> list of lat/longs) 95 | * Write more tests 96 | * Write documentation 97 | * Ways to help without coding 98 | 99 | ### The type-of-crime model 100 | 101 | #### What is it? 102 | 103 | The type-of-crime model builds a multi-label classifier that takes in the text of a news article and, for each type-of-crime tag, outputs a probability that the tag applies to the article. In other words, it tries to guess what kinds of crimes the news article discusses. 104 | 105 | The model code can be found in `lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py`. It's less than 100 lines; don't be afraid to read it! 106 | 107 | The model relies on NLTK as a tokenizer and builds a binary bag-of-words vectorizer with 40000 features. (We restricted it to 40000 features because performance did not decrease significantly and it made the model much smaller, which is useful when trying to publish to PyPI as a package.) The vectorized versions of the articles are then used as input to a separate logistic regression for each crime tag. 108 | 109 | #### How to train it? 110 | 111 | The `save_model.py` file can be run as a script to save the trained model: 112 | 113 | ``` 114 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 115 | ``` 116 | 117 | The vectorizer is saved in the same directory as the code with a name of the form `vectorizer-TIME.pkl`, where `TIME` is a timestamp. The model is saved similarly, but with `model` instead of `vectorizer`. 118 | 119 | This code trains on the whole labeled dataset. During development, the `lib/tagnews/crimetype/benchmark.py` file was used to perform cross-validation. 120 | 121 | #### How to measure performance? 122 | 123 | We never defined a single number that could be used to decide if one model was better than another, even though that's usually a critical step. We generated FPRs/TPRs for all the crime categories and plotted those. The best way may be to fix an acceptable FPR at something like 5% or 10% and see what maximizes the mean TPR across a set of desired categories. In short, there's not a solid answer here, and refining this would be super helpful in its own right. 124 | 125 | #### How might it be improved? 126 | 127 | * Use a better vectorizer than bag-of-words, e.g. GloVe as used for the geostring model. 128 | * We briefly tried a naive Bayes classifier in place of the logistic regression and it didn't seem to improve performance, but naive Bayes is usually the baseline for these kinds of tasks. Could it be made to work better? 129 | * Add more examples of articles that have *no* tags. Right now we randomly sample 3000 such articles, but we could probably use more. This may help with an observed problem where some sports articles have a high chance of being about a crime according to the model (likely due to the high use of words like "shoot"). 130 | 131 | ### The geostring extractor model 132 | 133 | #### What is it? 134 | 135 | The geostring model assigns a word-by-word probability that each word is part of a "geostring". A "geostring" is a sequence of words that defines a location. Geostrings can be fairly precise street addresses, as in "the shooting happened at the *corner of 55th and Woodlawn*", or fuzzier locations such as a neighborhood name, a church name, etc.
The per-word probabilities can be thresholded, and we take each consecutive run of words above the threshold as a geostring inside the article. 136 | 137 | The model code can be found in `lib/tagnews/geoloc/models/lstm/save_model.py`. It's 150 lines of Python code, a good portion of which is trying to hit an external internet API. The Keras library is used extensively. 138 | 139 | The model relies on the pre-trained semantic word vectorizer GloVe to get a 50-dimensional feature vector for each word, and then a two-layer bidirectional LSTM is used to generate the probabilities. 140 | 141 | #### How to train it? 142 | 143 | The `save_model.py` file can be run as a script to save the trained model: 144 | 145 | ``` 146 | uv run python -m tagnews.geoloc.models.lstm.save_model 147 | ``` 148 | 149 | The model is saved under `lib/tagnews/geoloc/models/lstm/saved/weights-*.hdf5`. The code will run for a set number of training epochs (one epoch is one pass through all of the training examples), saving the weights after each epoch. 150 | 151 | #### How to measure performance? 152 | 153 | Download the validation data from https://geo-extract-tester.herokuapp.com/ (there is also training data available for downloading). Follow the instructions on that website to upload guesses, and the ROC curve will be shown for your model's predictions. If you have a higher AUC than the current high score, congratulations! Please submit a Pull Request! 154 | 155 | You can also upload your model's predictions via an API. There is code inside `lib/tagnews/geoloc/models/lstm/save_model.py` demonstrating this. 156 | 157 | #### How might it be improved? 158 | 159 | * Include "naive" models that do simple look-ups against Chicago street names. 160 | * Use a word vectorizer that handles out-of-vocabulary words better (perhaps `FastText`?). 161 | * Just use a character-level CNN? 162 | * Augment the training data by labeling more articles (see the "I want to contribute to Chicago Justice Project but I don’t want to work on this NLP stuff. What can I do?" section). 163 | 164 | ### The geocoding 165 | 166 | #### What is it? 167 | 168 | Geocoding here refers to the process of sending a geostring (e.g. "55th and Woodlawn") to an external service to retrieve a best-guess latitude/longitude pair for where that geostring refers to. 169 | 170 | Right now, the geocoding is done using an instance of Pelias hosted by CJP. 171 | 172 | The code can be found in `lib/tagnews/geoloc/tag.py`, in the `get_lat_longs_from_geostrings` function. 173 | 174 | #### How might it be improved? 175 | 176 | * Improve post-processing of geostrings (we do rudimentary things like append "Chicago, Illinois", but we could get more sophisticated). 177 | * Improve the inputs to it by improving the geostring model. 178 | * Improve the inputs by making a better post-processor of geostrings. 179 | * Improve the confidence score. 180 | 181 | #### What if it breaks? 182 | 183 | The last time the geocoding broke, it was because the service started checking for browser-like headers, so we updated our requests to send browser-like headers. Something like this may happen again, and unfortunately there's no real playbook here. 184 | 185 | The good news is that the geostrings will always be there, and if needed we can always re-process any geocoding that doesn't work. 186 | 187 | ## Testing 188 | 189 | ### The test suite 190 | 191 | You can find the tests in `lib/tagnews/tests/`. We use `pytest` as the test runner.
The test coverage isn't phenomenal, but it's not terrible either. We always welcome Pull Requests adding more and better tests! 192 | 193 | ### Running locally 194 | 195 | You need the data to run the tests. If you have the data, great! You should be able to run the tests. If you don't have the data, you can use the tiny subset of the data stored in `lib/tagnews/data/ci-data/`: 196 | 197 | ``` 198 | cp lib/tagnews/data/ci-data/*.csv lib/tagnews/data 199 | ``` 200 | 201 | Make sure you have downloaded the GloVe vectors (see Getting Started above). 202 | 203 | Beware that if you run the tests with the full dataset, it can take a _long_ time and use a _lot_ of memory. 204 | 205 | If you don't already have a type-of-crime or geostring model, you will need to train one (see above). 206 | 207 | Once that's completed, run: 208 | 209 | ``` 210 | uv run pytest --cov-report term-missing --cov=tagnews 211 | ``` 212 | 213 | ### Continuous Integration Testing 214 | 215 | We use GitHub Actions for continuous integration testing. Right now, this is run manually under Actions in GitHub. 216 | 217 | This is configured via the `.github/workflows/publish.yml` file at the top level of this project. 218 | 219 | ## Documentation 220 | 221 | ### How to write it? 222 | 223 | Write it in this very file! Or the README.md file! 224 | 225 | ### How to publish it? 226 | 227 | Documentation is not currently published. If you have interest in helping with this, submit a Pull Request! 228 | 229 | ## Publishing a new version to PyPI 230 | 231 | First, update the version string in `pyproject.toml`; start by bumping the version and making it a release candidate, e.g. `1.1.0rc1`. 232 | 233 | Second, make sure the saved models either match the previously published version exactly (by downloading the current release, extracting it, and copying the model file to where it needs to be), or are _meant_ to be updated. Make sure only the saved model you want exists in your project; delete all others. 234 | 235 | Then, use GitHub Actions to run the workflow using the `pypi` environment. This can also be tested first by publishing to TestPyPI. To do so, run the same GitHub Action manually but use the `testpypi` environment. 236 | 237 | Create a new Anaconda environment and install the new version in it for rudimentary testing. The Continuous Integration should take care of most of the rigorous testing; this is just to make sure everything is working. I usually run through the example at the top of the README (see also the smoke-test sketch at the end of this document). 238 | 239 | Once you are happy, remove the `rc*` suffix and publish as the actual version. You should then create a [release](https://github.com/chicago-justice-project/article-tagging/releases) on GitHub, attempting to log all the changes and attach the tarball created by `python setup.py sdist`. 240 | 241 | *Note: PyPI has a limit on the size of projects that can be uploaded, and PyPI was recently migrated to a new data warehouse. We originally had to request a size increase in [this issue](https://github.com/pypa/packaging-problems/issues/119).* 242 | 243 | ## I want to contribute to Chicago Justice Project but I don’t want to work on this NLP stuff. What can I do? 244 | 245 | You can help out [the team scraping articles/maintaining the volunteers' web interface](https://github.com/chicago-justice-project/chicago-justice). If that doesn't sound interesting either, we can always use more [volunteer taggers](http://chicagojustice.org/volunteer-for-cjp/). Or just show up Tuesday nights at Chi Hack Night and ask what you can do!
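## Appendix: a quick smoke-test sketch

As mentioned in the publishing section above, it is worth doing a rudimentary check that a freshly installed (or freshly trained) package actually works end to end. The snippet below is a minimal sketch of such a check built on the `CrimeTags` class from `lib/tagnews/crimetype/tag.py`. It assumes a trained type-of-crime model has already been saved (see "Generate the models" above), and the article text is made up purely for illustration.

```
from tagnews.crimetype.tag import CrimeTags

# Loads the most recently saved classifier/vectorizer pair
# (see load_model in lib/tagnews/crimetype/tag.py).
tagger = CrimeTags()

# Made-up article text, used only to exercise the API.
text = "A man was shot Tuesday night on the South Side, police said."

print(tagger.tagtext_proba(text).head())      # per-tag probabilities, largest first
print(tagger.tagtext(text, prob_thresh=0.5))  # tags with probability above 0.5
print(tagger.relevant(text))                  # True if any tag clears the relevance threshold
```

If this runs without errors and prints plausible tags, then model loading, vectorization, and prediction are all wired up correctly.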
246 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_usercoding_categories.csv: -------------------------------------------------------------------------------- 1 | 92,132,17 2 | 93,132,26 3 | 94,132,20 4 | 95,132,13 5 | 96,132,37 6 | 188,219,17 7 | 189,219,26 8 | 190,219,31 9 | 191,219,10 10 | 192,219,29 11 | 1557,1728,25 12 | 1558,1728,2 13 | 1559,1728,31 14 | 1560,1728,17 15 | 1956,2227,8 16 | 1957,2227,17 17 | 1958,2227,2 18 | 1959,2227,20 19 | 1960,2227,13 20 | 3667,3789,30 21 | 4014,4069,31 22 | 4126,4146,4 23 | 4127,4146,2 24 | 4128,4146,11 25 | 4129,4146,12 26 | 4130,4146,17 27 | 4219,4233,2 28 | 4220,4233,3 29 | 4221,4233,4 30 | 4222,4233,11 31 | 4223,4233,12 32 | 4224,4233,17 33 | 4225,4233,21 34 | 4345,4302,24 35 | 4346,4302,17 36 | 4347,4302,26 37 | 4348,4302,13 38 | 4993,4856,3 39 | 4994,4856,37 40 | 4995,4856,13 41 | 4996,4856,17 42 | 4997,4856,26 43 | 4998,4856,31 44 | 5924,5770,28 45 | 5925,5770,12 46 | 5926,5770,22 47 | 5927,5770,38 48 | 6620,6327,2 49 | 6621,6327,3 50 | 6622,6327,4 51 | 6623,6327,7 52 | 6624,6327,13 53 | 6625,6327,17 54 | 6626,6327,24 55 | 6627,6327,26 56 | 6628,6327,31 57 | 6771,6398,17 58 | 6772,6398,2 59 | 6773,6398,5 60 | 6774,6398,6 61 | 6775,6398,31 62 | 6846,6480,17 63 | 6847,6480,2 64 | 6848,6480,27 65 | 6849,6480,13 66 | 6850,6480,22 67 | 7256,6776,17 68 | 7257,6776,2 69 | 7258,6776,35 70 | 7259,6776,26 71 | 7260,6776,31 72 | 7930,7060,2 73 | 7931,7060,4 74 | 7932,7060,5 75 | 7933,7060,17 76 | 7934,7060,28 77 | 7935,7060,29 78 | 8069,7107,36 79 | 8472,7254,11 80 | 8473,7254,12 81 | 8474,7254,13 82 | 8475,7254,15 83 | 8476,7254,26 84 | 8477,7254,31 85 | 8747,7362,17 86 | 8748,7362,2 87 | 8749,7362,27 88 | 8750,7362,12 89 | 8751,7362,31 90 | 9696,7710,17 91 | 9697,7710,26 92 | 9698,7710,28 93 | 9703,7713,36 94 | 9704,7713,29 95 | 9705,7713,31 96 | 10009,7956,2 97 | 10010,7956,3 98 | 10011,7956,13 99 | 10012,7956,17 100 | 10013,7956,22 101 | 10014,7956,23 102 | 10408,8373,2 103 | 10409,8373,17 104 | 10410,8373,22 105 | 10411,8373,25 106 | 10412,8373,26 107 | 10413,8373,29 108 | 11018,8769,13 109 | 11019,8769,21 110 | 11020,8769,31 111 | 11523,9304,8 112 | 12295,9932,17 113 | 12296,9932,2 114 | 12297,9932,13 115 | 12298,9932,37 116 | 12384,9987,2 117 | 12385,9987,37 118 | 12386,9987,12 119 | 12387,9987,13 120 | 12388,9987,17 121 | 12389,9987,22 122 | 12390,9987,24 123 | 12404,9997,8 124 | 12405,9997,34 125 | 12406,9997,4 126 | 12407,9997,37 127 | 12791,10301,32 128 | 12792,10301,26 129 | 13137,10604,17 130 | 13138,10604,34 131 | 13139,10604,20 132 | 13140,10604,37 133 | 13141,10604,31 134 | 13412,10874,2 135 | 13413,10874,35 136 | 13414,10874,31 137 | 14046,11159,17 138 | 14047,11159,2 139 | 14048,11159,10 140 | 14779,11619,32 141 | 14780,11619,4 142 | 15204,11819,2 143 | 15205,11819,3 144 | 15206,11819,13 145 | 15207,11819,18 146 | 15208,11819,22 147 | 15209,11819,24 148 | 15255,11847,28 149 | 15256,11847,31 150 | 15321,11896,32 151 | 15322,11896,2 152 | 15323,11896,37 153 | 16089,12822,29 154 | 16448,13406,16 155 | 17681,15416,2 156 | 17682,15416,3 157 | 17683,15416,4 158 | 17684,15416,5 159 | 17685,15416,38 160 | 17686,15416,13 161 | 17687,15416,17 162 | 17688,15416,22 163 | 17689,15416,24 164 | 17690,15416,31 165 | 18898,16772,33 166 | 18899,16772,2 167 | 18900,16772,26 168 | 18901,16772,9 169 | 19360,17292,2 170 | 19361,17292,3 171 | 19362,17292,4 172 | 19363,17292,5 173 | 19364,17292,7 174 | 19365,17292,34 175 | 19366,17292,13 176 | 19367,17292,17 177 | 19368,17292,31 178 
| 19739,17509,17 179 | 19740,17509,2 180 | 19741,17509,26 181 | 19742,17509,13 182 | 19856,17808,18 183 | 19857,17808,2 184 | 20268,18167,17 185 | 20269,18167,2 186 | 20270,18167,13 187 | 20271,18167,26 188 | 20272,18167,5 189 | 20535,18630,17 190 | 20536,18630,2 191 | 20537,18630,4 192 | 20538,18630,13 193 | 20664,18783,17 194 | 20665,18783,2 195 | 20666,18783,31 196 | 22007,21221,34 197 | 22008,21221,3 198 | 22009,21221,4 199 | 22010,21221,37 200 | 22011,21221,7 201 | 22012,21221,13 202 | 22013,21221,26 203 | 22577,21766,32 204 | 22578,21766,17 205 | 22579,21766,26 206 | 22586,21772,32 207 | 22587,21772,17 208 | 22588,21772,26 209 | 23089,22428,24 210 | 23090,22428,17 211 | 23091,22428,2 212 | 23092,22428,13 213 | 23657,23050,26 214 | 26028,25991,25 215 | 26029,25991,18 216 | 26030,25991,2 217 | 26865,27133,33 218 | 26866,27133,34 219 | 26867,27133,3 220 | 26868,27133,4 221 | 26869,27133,5 222 | 26870,27133,8 223 | 26871,27133,2 224 | 26872,27133,22 225 | 26873,27133,26 226 | 27534,28032,27 227 | 27535,28032,2 228 | 27536,28032,11 229 | 27537,28032,12 230 | 27538,28032,38 231 | 27558,28126,27 232 | 27559,28126,2 233 | 27560,28126,11 234 | 27561,28126,12 235 | 27562,28126,38 236 | 29188,31220,17 237 | 29189,31220,2 238 | 29190,31220,26 239 | 29191,31220,13 240 | 30539,33722,21 241 | 30540,33722,31 242 | 30746,33873,21 243 | 30747,33873,31 244 | 30880,33938,18 245 | 30881,33938,2 246 | 30882,33938,21 247 | 31065,34093,30 248 | 32106,35017,33 249 | 32107,35017,26 250 | 33234,37023,33 251 | 33945,38136,34 252 | 33946,38136,2 253 | 33947,38136,13 254 | 33948,38136,17 255 | 33949,38136,20 256 | 33950,38136,22 257 | 33951,38136,26 258 | 33952,38136,31 259 | 34972,39556,24 260 | 34973,39556,2 261 | 34974,39556,26 262 | 34975,39556,13 263 | 35445,40077,25 264 | 35446,40077,35 265 | 35538,40153,25 266 | 35539,40153,15 267 | 35540,40153,31 268 | 35662,40306,9 269 | 35663,40306,26 270 | 35664,40306,13 271 | 35665,40306,15 272 | 36261,40871,1 273 | 36262,40871,26 274 | 36263,40871,3 275 | 36264,40871,4 276 | 36265,40871,17 277 | 36998,41820,2 278 | 36999,41820,13 279 | 37000,41820,15 280 | 37001,41820,17 281 | 37002,41820,18 282 | 37003,41820,24 283 | 37004,41820,27 284 | 37019,41852,24 285 | 37020,41852,17 286 | 37021,41852,2 287 | 37022,41852,13 288 | 38121,43350,2 289 | 38122,43350,3 290 | 38123,43350,4 291 | 38124,43350,14 292 | 38125,43350,17 293 | 38126,43350,22 294 | 38127,43350,31 295 | 38536,44004,2 296 | 38537,44004,37 297 | 38844,44438,1 298 | 38845,44438,2 299 | 38846,44438,31 300 | 38847,44438,4 301 | 38848,44438,17 302 | 40129,46592,17 303 | 40130,46592,2 304 | 40131,46592,11 305 | 40132,46592,4 306 | 40776,47805,1 307 | 40777,47805,2 308 | 40778,47805,37 309 | 40779,47805,9 310 | 40780,47805,13 311 | 40781,47805,17 312 | 40782,47805,26 313 | 42718,53469,2 314 | 42719,53469,4 315 | 42720,53469,5 316 | 42721,53469,6 317 | 42722,53469,9 318 | 42723,53469,17 319 | 43582,55829,2 320 | 43583,55829,22 321 | 43584,55829,31 322 | 44878,57826,2 323 | 44879,57826,37 324 | 44880,57826,10 325 | 44881,57826,34 326 | 44882,57826,17 327 | 44883,57826,20 328 | 44884,57826,22 329 | 44885,57826,31 330 | 45070,58242,6 331 | 45071,58242,22 332 | 45239,58848,2 333 | 45240,58848,26 334 | 45241,58848,38 335 | 45287,59062,2 336 | 45288,59062,23 337 | 45614,59597,17 338 | 45615,59597,2 339 | 45616,59597,11 340 | 45617,59597,12 341 | 45618,59597,21 342 | 46443,60721,34 343 | 46444,60721,5 344 | 46713,60981,2 345 | 46714,60981,11 346 | 46715,60981,12 347 | 46716,60981,21 348 | 53495,72429,16 349 | 53496,72429,13 
350 | 54842,74956,24 351 | 54843,74956,17 352 | 54844,74956,2 353 | 54845,74956,13 354 | 54846,74956,22 355 | 55056,75398,24 356 | 55057,75398,2 357 | 55058,75398,13 358 | 55059,75398,22 359 | 55202,75663,24 360 | 55203,75663,2 361 | 55204,75663,31 362 | 55443,76411,35 363 | 55855,77257,26 364 | 55856,77257,13 365 | 56439,78695,17 366 | 56440,78695,2 367 | 56441,78695,23 368 | 57674,80662,28 369 | 57675,80662,38 370 | 57952,81047,34 371 | 57953,81047,2 372 | 57954,81047,37 373 | 57955,81047,22 374 | 57956,81047,13 375 | 58417,81846,17 376 | 58418,81846,2 377 | 58419,81846,26 378 | 58420,81846,31 379 | 59053,82527,2 380 | 59054,82527,3 381 | 59055,82527,4 382 | 59056,82527,7 383 | 59057,82527,13 384 | 59058,82527,16 385 | 59059,82527,17 386 | 59060,82527,22 387 | 59061,82527,26 388 | 59118,82584,1 389 | 59119,82584,18 390 | 59120,82584,2 391 | 59298,82926,18 392 | 59299,82926,2 393 | 59730,83912,31 394 | 61701,87771,21 395 | 61702,87771,6 396 | 61712,87800,18 397 | 61713,87800,2 398 | 63545,90724,14 399 | 63853,91284,29 400 | 64460,92580,2 401 | 64461,92580,10 402 | 64462,92580,15 403 | 65238,94294,36 404 | 65239,94294,31 405 | 65337,94603,9 406 | 65590,95282,24 407 | 65591,95282,2 408 | 65592,95282,26 409 | 65593,95282,13 410 | 66169,96332,9 411 | 66170,96332,26 412 | 66192,96361,23 413 | 66811,97506,3 414 | 66812,97506,35 415 | 66813,97506,19 416 | 66814,97506,15 417 | 66866,97740,26 418 | 66867,97740,4 419 | 66868,97740,5 420 | 66869,97740,7 421 | 67402,98644,7 422 | 67403,98644,26 423 | 67404,98644,3 424 | 67405,98644,37 425 | 67406,98644,13 426 | 67722,99538,2 427 | 67723,99538,13 428 | 68485,101082,17 429 | 68486,101082,2 430 | 68487,101082,26 431 | 68488,101082,13 432 | 69076,102047,17 433 | 69077,102047,3 434 | 69078,102047,37 435 | 70987,106009,17 436 | 70988,106009,12 437 | 70989,106009,15 438 | 70993,106027,14 439 | 72798,109564,36 440 | 72799,109564,31 441 | 74755,113250,17 442 | 74756,113250,2 443 | 74757,113250,3 444 | 74758,113250,4 445 | 74759,113250,26 446 | 75448,114030,25 447 | 75449,114030,3 448 | 75450,114030,13 449 | 75451,114030,17 450 | 75872,114749,17 451 | 75873,114749,2 452 | 75874,114749,3 453 | 75875,114749,26 454 | 75876,114749,13 455 | 76825,115899,2 456 | 76826,115899,3 457 | 76827,115899,10 458 | 76828,115899,11 459 | 76829,115899,17 460 | 76830,115899,26 461 | 77700,117328,8 462 | 77701,117328,5 463 | 78344,118240,9 464 | 78345,118240,2 465 | 78346,118240,26 466 | 78347,118240,6 467 | 78348,118240,17 468 | 79074,119322,9 469 | 79075,119322,15 470 | 79076,119322,31 471 | 80971,122057,23 472 | 80972,122057,22 473 | 80973,122057,15 474 | 82526,124718,2 475 | 82527,124718,3 476 | 82528,124718,10 477 | 82529,124718,34 478 | 82530,124718,15 479 | 82531,124718,17 480 | 82532,124718,19 481 | 82533,124718,22 482 | 83029,125488,2 483 | 83030,125488,35 484 | 83031,125488,15 485 | 83032,125488,25 486 | 83033,125488,26 487 | 83034,125488,31 488 | 83166,125760,17 489 | 83167,125760,5 490 | 83168,125760,37 491 | 84413,127428,26 492 | 84414,127428,11 493 | 84554,127697,5 494 | 84555,127697,6 495 | 84727,128087,30 496 | 85049,128637,17 497 | 85050,128637,26 498 | 85051,128637,13 499 | 85052,128637,37 500 | 85058,128662,4 501 | 85059,128662,36 502 | 85060,128662,22 503 | 85061,128662,31 504 | 85197,128857,1 505 | 85198,128857,6 506 | 85419,129483,17 507 | 85420,129483,2 508 | 85421,129483,37 509 | 85662,129899,30 510 | 86270,131048,8 511 | 86271,131048,17 512 | 86272,131048,13 513 | 86273,131048,31 514 | 86726,131765,37 515 | 86727,131765,13 516 | 86728,131765,31 517 | 
86949,132243,15 518 | 86950,132243,2 519 | 86951,132243,3 520 | 86952,132243,31 521 | 89412,135388,34 522 | 89413,135388,37 523 | 89414,135388,13 524 | 89415,135388,2 525 | 89416,135388,15 526 | 89417,135388,17 527 | 89684,135705,34 528 | 89685,135705,37 529 | 89686,135705,2 530 | 89687,135705,17 531 | 89688,135705,19 532 | 89689,135705,26 533 | 91458,139290,16 534 | 91459,139290,5 535 | 91460,139290,21 536 | 92175,140867,2 537 | 92176,140867,12 538 | 92177,140867,22 539 | 92178,140867,31 540 | 92362,141314,33 541 | 92363,141314,34 542 | 92364,141314,19 543 | 92365,141314,9 544 | 92366,141314,31 545 | 92726,142222,2 546 | 92727,142222,3 547 | 92728,142222,31 548 | 93025,142937,17 549 | 93026,142937,3 550 | 93027,142937,14 551 | 96046,150134,13 552 | 96047,150134,22 553 | 96890,151573,17 554 | 96891,151573,2 555 | 96892,151573,19 556 | 96893,151573,37 557 | 97924,153541,17 558 | 97925,153541,10 559 | 97926,153541,6 560 | 98180,154021,17 561 | 98181,154021,34 562 | 98182,154021,19 563 | 98183,154021,37 564 | 98184,154021,31 565 | 98335,154234,36 566 | 98935,155785,13 567 | 98936,155785,37 568 | 99813,157346,25 569 | 99814,157346,10 570 | 99815,157346,20 571 | 99816,157346,6 572 | 99817,157346,17 573 | 99993,157684,24 574 | 99994,157684,17 575 | 99995,157684,2 576 | 99996,157684,26 577 | 99997,157684,13 578 | 100190,157999,2 579 | 100191,157999,27 580 | 100192,157999,13 581 | 101008,159958,33 582 | 101009,159958,15 583 | 101121,160087,14 584 | 101142,160137,34 585 | 101143,160137,2 586 | 101144,160137,37 587 | 101145,160137,23 588 | 101815,162892,26 589 | 101816,162892,35 590 | 101817,162892,4 591 | 102512,165333,2 592 | 102513,165333,3 593 | 102514,165333,6 594 | 102515,165333,9 595 | 102516,165333,17 596 | 102517,165333,31 597 | 104832,177862,2 598 | 104833,177862,10 599 | 105683,180249,2 600 | 105684,180249,3 601 | 105685,180249,4 602 | 105686,180249,12 603 | 105687,180249,13 604 | 105688,180249,17 605 | 106200,184791,2 606 | 106201,184791,3 607 | 106202,184791,36 608 | 106343,185415,17 609 | 106344,185415,26 610 | 106345,185415,13 611 | 106346,185415,7 612 | 106712,186861,17 613 | 106713,186861,2 614 | 106714,186861,27 615 | 106715,186861,13 616 | 106716,186861,15 617 | 107697,193258,2 618 | 107698,193258,11 619 | 107699,193258,4 620 | 107700,193258,5 621 | 107701,193258,13 622 | 108098,195838,17 623 | 108099,195838,2 624 | 108100,195838,11 625 | 108880,199329,24 626 | 108881,199329,22 627 | 109121,200327,35 628 | 109606,203291,17 629 | 109607,203291,3 630 | 109608,203291,30 631 | 109943,204359,27 632 | 109944,204359,12 633 | 109945,204359,21 634 | 109946,204359,39 635 | 110046,204592,2 636 | 110047,204592,37 637 | 110188,204900,40 638 | 110189,204900,27 639 | 110190,204900,39 640 | 110194,204902,11 641 | 110195,204902,12 642 | 110196,204902,13 643 | 110197,204902,17 644 | 110198,204902,26 645 | 110199,204902,28 646 | 110583,205755,34 647 | 110632,205788,16 648 | 110633,205788,22 649 | 112637,210278,24 650 | 112638,210278,17 651 | 112639,210278,26 652 | 112640,210278,13 653 | 112641,210278,15 654 | 113538,211531,2 655 | 113539,211531,35 656 | 113540,211531,6 657 | 113588,211597,2 658 | 113589,211597,27 659 | 113590,211597,21 660 | 113591,211597,39 661 | 113861,212076,2 662 | 113862,212076,27 663 | 113863,212076,12 664 | 113864,212076,38 665 | 113913,212124,2 666 | 113914,212124,3 667 | 113915,212124,39 668 | 113916,212124,11 669 | 113917,212124,12 670 | 113918,212124,27 671 | 114202,212536,2 672 | 114203,212536,35 673 | 114204,212536,26 674 | 114205,212536,5 675 | 116228,221833,17 676 | 
116229,221833,5 677 | 116372,222644,28 678 | 116373,222644,13 679 | 116437,222854,3 680 | 116438,222854,8 681 | 116439,222854,15 682 | 116440,222854,17 683 | 116441,222854,26 684 | 116442,222854,31 685 | 116471,222947,24 686 | 116472,222947,26 687 | 116473,222947,13 688 | 116474,222947,15 689 | 116540,223183,2 690 | 116541,223183,11 691 | 116542,223183,12 692 | 116543,223183,37 693 | 116544,223183,38 694 | 116891,224330,2 695 | 116892,224330,3 696 | 116893,224330,4 697 | 116894,224330,40 698 | 116895,224330,12 699 | 116896,224330,26 700 | 117414,225660,27 701 | 117415,225660,12 702 | 117416,225660,39 703 | 118134,227740,25 704 | 118135,227740,26 705 | 118136,227740,17 706 | 118389,228325,2 707 | 118390,228325,37 708 | 118391,228325,10 709 | 118392,228325,17 710 | 118393,228325,22 711 | 118394,228325,31 712 | 119209,230878,33 713 | 119210,230878,15 714 | 119808,233151,17 715 | 119809,233151,10 716 | 119810,233151,15 717 | 119817,233177,34 718 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/tag.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import glob 4 | import json 5 | import os 6 | import re 7 | import time 8 | from collections import namedtuple 9 | from contextlib import ExitStack, redirect_stderr 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import requests 14 | from shapely.geometry import shape, Point 15 | 16 | from tagnews.utils.neighborhoods import neighborhoods 17 | from .. import utils 18 | 19 | with ExitStack() as stack: 20 | null_stream = open(os.devnull, "w") 21 | stack.enter_context(null_stream) 22 | stack.enter_context(redirect_stderr(null_stream)) 23 | import keras 24 | 25 | """ 26 | Contains the GeoCoder class that allows extracting geostrings from articles and geocoding them. 27 | """ 28 | 29 | MODEL_LOCATION = os.path.join( 30 | os.path.split(__file__)[0], os.path.join("models", "lstm", "saved") 31 | ) 32 | 33 | COMMUNITY_AREAS_FILE = os.path.join( 34 | os.path.split(__file__)[0], 35 | "..", 36 | "data", 37 | "Boundaries - Community Areas (current).geojson", 38 | ) 39 | 40 | 41 | def post_process(geostring): 42 | """ 43 | Post process the geostring in a way that makes it more amenable to 44 | geocoding by the current geocoding provider GISgraphy. 45 | 46 | Inputs 47 | ------ 48 | geostring : str 49 | The geostring to post process 50 | 51 | Returns 52 | ------- 53 | processed_geostring : str 54 | """ 55 | # Merge multiple whitespaces into one 56 | geostring = " ".join(geostring.split()) 57 | 58 | # gisgraphy struggles with things like "55th and Woodlawn". 59 | # replace "... and..." 60 | # with two zeros. 61 | # \100 does not work correctly so we need to add a separator. 62 | geostring = re.sub( 63 | r"([0-9]+)(th|rd|st) and", r"\1<__internal_separator__>00 and", geostring 64 | ) 65 | geostring = geostring.replace("<__internal_separator__>", "") 66 | 67 | # remove stopwords, only if they are internal, i.e. 68 | # the geostring doesn't start with "block ...".
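    # Illustrative sketch of the intended behavior (these example inputs are not from
    # the original source): given the regex rewrite above and the stopword removal below,
    #   post_process("55th and Woodlawn")        -> "5500 Woodlawn"
    #   post_process("1300 block of W. Halsted") -> "1300 W. Halsted"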
69 | for stopword in ["block", "of", "and"]: 70 | geostring = geostring.replace(" {} ".format(stopword), " ") 71 | 72 | return geostring 73 | 74 | 75 | _base_geocoder_url = ( 76 | "http://ec2-34-228-58-223.compute-1.amazonaws.com" ":4000/v1/search?text={}" 77 | ) 78 | 79 | GeocodeResults = namedtuple( 80 | "GeocodeResults", 81 | [ 82 | "coords_raw", 83 | "full_responses_raw", 84 | "scores_raw", 85 | "coords_post", 86 | "full_responses_post", 87 | "scores_post", 88 | ], 89 | ) 90 | 91 | 92 | def get_lat_longs_from_geostrings( 93 | geostring_list, 94 | post_process_f=None, 95 | sleep_secs=0, 96 | geocoder_url_formatter=_base_geocoder_url, 97 | ): 98 | """ 99 | Geo-code each geostring in `geostring_list` into lat/long values. 100 | Also return the full response from the geocoding service. 101 | 102 | Inputs 103 | ------ 104 | geostring_list : list of strings 105 | The list of geostrings to geocode into lat/longs. 106 | post_process_f : function 107 | The results are returned for both the raw geostrings being 108 | passed to the geocoder, and the results of 109 | `post_process_f(geostring)` being passed to the geocoder. 110 | sleep_secs : float 111 | How long to sleep between successive requests, in seconds. 112 | geocoder_url_formatter : str 113 | A string with a "{}" in it where the text should be input, e.g. 114 | "http://our-pelias.biz:4000/v1/search?text={}". 115 | 116 | Returns 117 | ------- 118 | GeocodeResults : namedtuple 119 | A named tuple with the following fields: 120 | coords_raw : pandas.DataFrame 121 | The length `n` DataFrame of lat/long values. Values are NaN 122 | if the geocoder returned no results. 123 | full_responses_raw : list 124 | The length `n` list of the full responses from the geocoding 125 | service. 126 | scores_raw : numpy.array 127 | Numpy array of the confidence scores of the responses. 128 | coords_post : pandas.DataFrame 129 | The length `n` DataFrame of lat/long values. Values are NaN 130 | if the geocoder returned no results. 131 | full_responses_post : list 132 | The length `n` list of the full responses of the post-processed 133 | geostrings. 134 | scores_post : numpy.array 135 | Numpy array of the confidence scores of the responses. 136 | """ 137 | if post_process_f is None: 138 | post_process_f = post_process 139 | 140 | def _geocode(lst): 141 | full_responses = [] 142 | for addr_str in lst: 143 | try: 144 | g = json.loads( 145 | requests.get(geocoder_url_formatter.format(addr_str)).text 146 | ) 147 | except Exception: 148 | g = {} 149 | full_responses.append(g) 150 | time.sleep(sleep_secs) 151 | 152 | def _get_latlong(g): 153 | try: 154 | return g["features"][0]["geometry"]["coordinates"] 155 | except (KeyError, IndexError): 156 | return [np.nan, np.nan] 157 | 158 | def _get_confidence(g): 159 | try: 160 | return g["features"][0]["properties"]["confidence"] 161 | except (KeyError, IndexError): 162 | return np.nan 163 | 164 | coords = pd.DataFrame( 165 | [_get_latlong(g) for g in full_responses], columns=["long", "lat"] 166 | ) 167 | coords = coords[["lat", "long"]] # it makes me feel better, OK? 
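        # Descriptive note: the geocoder responses are GeoJSON-style (e.g. from a
        # Pelias-like service), so each "coordinates" entry is [longitude, latitude];
        # the DataFrame is therefore built with columns ["long", "lat"] and reordered
        # to ["lat", "long"] just above.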
168 | scores = np.asarray([_get_confidence(g) for g in full_responses]).astype(np.float32) 169 | 170 | return full_responses, coords, scores 171 | 172 | full_responses_raw, coords_raw, scores_raw = _geocode(geostring_list) 173 | 174 | full_responses_post, coords_post, scores_post = _geocode( 175 | [post_process_f(geo_s) for geo_s in geostring_list] 176 | ) 177 | 178 | return GeocodeResults( 179 | coords_raw=coords_raw, 180 | full_responses_raw=full_responses_raw, 181 | scores_raw=scores_raw, 182 | coords_post=coords_post, 183 | full_responses_post=full_responses_post, 184 | scores_post=scores_post, 185 | ) 186 | 187 | 188 | def load_model(location=MODEL_LOCATION): 189 | """ 190 | Load a model from the given folder `location`. 191 | There should be at least one weights file matching "weights*.hdf5" 192 | inside the folder. 193 | 194 | The file with the most recent timestamp is loaded. 195 | """ 196 | models = glob.glob(os.path.join(location, "weights*.hdf5")) 197 | if not models: 198 | raise RuntimeError( 199 | ( 200 | "No models to load. Run" 201 | ' "python -m tagnews.geoloc.models.' 202 | 'lstm.save_model"' 203 | ) 204 | ) 205 | 206 | model = keras.models.load_model(models[-1]) 207 | 208 | return model 209 | 210 | 211 | class GeoCoder: 212 | def __init__(self): 213 | self.model = load_model() 214 | self.glove = utils.load_vectorizer.load_glove( 215 | os.path.join(os.path.split(__file__)[0], "../data/glove.6B.50d.txt") 216 | ) 217 | with open(COMMUNITY_AREAS_FILE) as f: 218 | d = json.load(f) 219 | self.com_areas = { 220 | f["properties"]["community"]: shape(f["geometry"]) 221 | for f in d["features"] 222 | } 223 | 224 | def pre_process(self, s): 225 | """ 226 | Takes in a string which is the text of an article and returns the tuple 227 | `(words, data)` where `words` is the list of words found and `data` 228 | is the 3D numpy array that contains the numeric data that can be used 229 | by the trained model. 230 | 231 | Inputs 232 | ------ 233 | s : str 234 | Article text. 235 | 236 | Returns 237 | ------- 238 | words : list of strings 239 | The words found in the article. 240 | data : 3D numpy.array 241 | Has shape (1, N, M) where N is the number of words and M 242 | is the size of the word vectors, currently M is 51. 243 | """ 244 | words = s.split() # split along white space. 245 | data = pd.concat( 246 | [ 247 | pd.DataFrame([[w[0].isupper()] if w else [False] for w in words]), 248 | (self.glove.reindex(words).fillna(0).reset_index(drop=True)), 249 | ], 250 | axis="columns", 251 | ) 252 | data = np.asarray(data).astype(np.float32) 253 | return words, np.expand_dims(data, axis=0) 254 | 255 | def extract_geostring_probs(self, s): 256 | """ 257 | Extract the probability that each word in s is part of a geostring. 258 | 259 | Inputs 260 | ------ 261 | s : str 262 | Article text. 263 | 264 | Returns 265 | ------- 266 | words : list of strings 267 | The words found in the article. 268 | probs : 1D numpy.array 269 | Has shape (N,) where N is the number of words. 270 | """ 271 | if not s.strip(): 272 | return [[], np.zeros((0,), dtype=np.float32)] 273 | words, data = self.pre_process(s) 274 | probs = self.model.predict(data)[0][:, 1] 275 | return words, probs 276 | 277 | def extract_geostrings(self, s, prob_thresh=0.5): 278 | """ 279 | Extract the geostrings from the article text. 280 | 281 | Inputs 282 | ------ 283 | s : str 284 | Article text.
285 | prob_thresh : float, 0 <= prob_thresh <= 1 286 | The threshold on probability above which words will be 287 | considered as part of a geostring. 288 | DEFAULT: 0.5 289 | 290 | Returns 291 | ------- 292 | geostrings : list of lists of strings 293 | The extracted geostrings from the article text; each word 294 | is kept separate in the list. 295 | Example: [['1300', 'W.', 'Halsted'], ['Ohio']] 296 | probstrings : list of 1D numpy.arrays 297 | """ 298 | words, probs = self.extract_geostring_probs(s) 299 | above_thresh = probs >= prob_thresh 300 | 301 | words = ["filler"] + words + ["filler"] 302 | probs = np.append(0, np.append(probs, 0)) 303 | 304 | above_thresh = np.concatenate([[False], above_thresh, [False]]).astype(np.int32) 305 | switch_ons = np.where(np.diff(above_thresh) == 1)[0] + 1 306 | switch_offs = np.where(np.diff(above_thresh) == -1)[0] + 1 307 | 308 | geostrings = [] 309 | probstrings = [] 310 | for on, off in zip(switch_ons, switch_offs): 311 | geostrings.append(words[on:off]) 312 | probstrings.append(probs[on:off]) 313 | 314 | return geostrings, probstrings 315 | 316 | @staticmethod 317 | def lat_longs_from_geostring_lists(geostring_lists, **kwargs): 318 | """ 319 | Get the latitude/longitude pairs from a list of geostrings as 320 | returned by `extract_geostrings`. Note that `extract_geostrings` 321 | returns a list of lists of words. 322 | 323 | Inputs 324 | ------ 325 | geostring_lists : List[List[str]] 326 | A length-N list of list of strings, as returned by 327 | `extract_geostrings`. 328 | Example: [['5500', 'S.', 'Woodlawn'], ['1700', 'S.', 'Halsted']] 329 | **kwargs : other parameters passed to `get_lat_longs_from_geostrings` 330 | 331 | Returns 332 | ------- 333 | coords : pandas.DataFrame 334 | A pandas DataFrame with columns "lat" and "long". Values are 335 | NaN if the geocoder returned no results. 336 | scores : numpy.array 337 | 1D, length-N numpy array of the scores, higher indicates more 338 | confidence. This is our best guess after massaging the scores 339 | returned by the geocoder, and should not be taken as any sort 340 | of absolute rule. 341 | """ 342 | out = get_lat_longs_from_geostrings( 343 | [" ".join(gl) for gl in geostring_lists], **kwargs 344 | ) 345 | 346 | return out.coords_post, out.scores_post 347 | 348 | def community_area_from_coords(self, coords): 349 | """ 350 | Get the community area name that the coordinate lies in. 351 | 352 | Parameters 353 | ---------- 354 | coords : pandas.DataFrame 355 | A pandas dataframe with columns "lat" and "long". 356 | 357 | Returns 358 | ------- 359 | com_areas : List 360 | A list of community areas, one corresponding to each 361 | row of coords. An empty string indicates that the coord 362 | did not belong to any of the community areas.
363 | """ 364 | out = [] 365 | for _, coord in coords.iterrows(): 366 | p = Point(coord["long"], coord["lat"]) 367 | for com_name, com_shape in self.com_areas.items(): 368 | if com_shape.contains(p): 369 | out.append(com_name) 370 | break 371 | else: 372 | out.append("") 373 | return out 374 | 375 | def best_geostring(self, extracted_strs_and_probs: tuple): 376 | """ 377 | 378 | Parameters 379 | ---------- 380 | extracted_strs_and_probs : 2-tuple 381 | A 2-tuple of two lists containing a list of extracted geostrings at index zero 382 | and a list of extracted geostring probabilities at index one 383 | 384 | Returns 385 | ------- 386 | 2-tuple of one geostring of the best geostring 387 | OR False 388 | """ 389 | consider = [[], []] 390 | for geostring, probs in zip( 391 | extracted_strs_and_probs[0], extracted_strs_and_probs[1] 392 | ): 393 | is_neighborhood = False 394 | for neighborhood in neighborhoods: 395 | if neighborhood.lower() in " ".join(geostring).lower(): 396 | is_neighborhood = True 397 | if is_neighborhood or len(geostring) >= 3: 398 | consider[0].append((geostring)) 399 | consider[1].append((probs)) 400 | if consider[0]: 401 | avgs = [sum(i) / len(i) for i in consider[1]] 402 | max_index = avgs.index(max(avgs)) 403 | return consider[0][max_index] 404 | else: 405 | return '' 406 | 407 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_trainedcoding.csv: -------------------------------------------------------------------------------- 1 | 453893,2018-04-06 04:23:02.331286+00,tagnews 1.0.2,0.9999999493514471,40534 2 | 454415,2018-04-06 04:27:10.231305+00,tagnews 1.0.2,0.9896225820938951,41056 3 | 454491,2018-04-06 04:27:44.699546+00,tagnews 1.0.2,0.9168645188301731,41132 4 | 454644,2018-04-06 04:28:51.487924+00,tagnews 1.0.2,0.999932514377795,41285 5 | 455210,2018-04-06 04:33:12.303175+00,tagnews 1.0.2,0.999287893890405,41851 6 | 456158,2018-04-06 04:40:15.107791+00,tagnews 1.0.2,0.9999899196988399,42800 7 | 456190,2018-04-06 04:40:24.03267+00,tagnews 1.0.2,0.999887094530515,42832 8 | 457690,2018-04-06 04:51:33.089617+00,tagnews 1.0.2,0.9999088677496021,44332 9 | 458343,2018-04-06 04:56:30.545353+00,tagnews 1.0.2,0.907531415964436,44986 10 | 458774,2018-04-06 04:59:43.82406+00,tagnews 1.0.2,0.9979253520065241,45420 11 | 460927,2018-04-06 05:18:24.462887+00,tagnews 1.0.2,0.9100118116350899,47574 12 | 462142,2018-04-06 05:28:22.308611+00,tagnews 1.0.2,0.9999995557910791,48789 13 | 467811,2018-04-06 06:19:54.057142+00,tagnews 1.0.2,0.997743130725998,54456 14 | 470170,2018-04-06 06:40:19.891585+00,tagnews 1.0.2,0.9999775872018041,56816 15 | 472166,2018-04-06 06:56:48.795115+00,tagnews 1.0.2,0.999768276509597,58813 16 | 472582,2018-04-06 07:00:34.682456+00,tagnews 1.0.2,0.8882503668315859,59229 17 | 473190,2018-04-06 07:07:26.558123+00,tagnews 1.0.2,0.9998427982345229,59836 18 | 473404,2018-04-06 07:09:30.134+00,tagnews 1.0.2,0.9929430356020791,60050 19 | 473939,2018-04-06 07:13:31.574314+00,tagnews 1.0.2,0.9964149984724949,60585 20 | 475062,2018-04-06 07:24:31.885005+00,tagnews 1.0.2,0.9367940282996899,61709 21 | 475324,2018-04-06 07:26:42.040181+00,tagnews 1.0.2,0.994860947249206,61971 22 | 487373,2018-04-06 09:16:36.261811+00,tagnews 1.0.2,0.9908202992658509,74059 23 | 491582,2018-04-06 09:53:08.206784+00,tagnews 1.0.2,0.9988898360762579,78269 24 | 492024,2018-04-06 09:57:05.231426+00,tagnews 1.0.2,0.999973199132512,78711 25 | 492289,2018-04-06 09:59:24.69061+00,tagnews 1.0.2,0.999808315805894,78976 26 | 
493040,2018-04-06 10:08:21.531175+00,tagnews 1.0.2,0.9999004512399681,79724 27 | 494190,2018-04-06 10:17:59.220825+00,tagnews 1.0.2,0.999999980935647,80874 28 | 496875,2018-04-06 10:41:33.208945+00,tagnews 1.0.2,0.999483423538713,83560 29 | 500625,2018-04-06 11:15:45.916888+00,tagnews 1.0.2,0.999156888223627,87395 30 | 501009,2018-04-06 11:19:24.342627+00,tagnews 1.0.2,0.999933181042262,87780 31 | 501808,2018-04-06 11:25:46.986691+00,tagnews 1.0.2,0.995632505923119,88579 32 | 502490,2018-04-06 11:31:56.199454+00,tagnews 1.0.2,0.999987595846957,89261 33 | 502547,2018-04-06 11:32:23.489778+00,tagnews 1.0.2,0.9965366249994351,89318 34 | 502889,2018-04-06 11:35:09.913288+00,tagnews 1.0.2,0.9524503271232819,89660 35 | 503875,2018-04-06 11:44:09.105877+00,tagnews 1.0.2,0.99984092412567,90646 36 | 507757,2018-04-06 12:20:35.009847+00,tagnews 1.0.2,0.985260566422226,94509 37 | 507786,2018-04-06 12:20:48.815289+00,tagnews 1.0.2,0.9996901103399721,94538 38 | 510712,2018-04-06 12:46:44.622491+00,tagnews 1.0.2,0.971614759823332,97498 39 | 511271,2018-04-06 12:51:43.420295+00,tagnews 1.0.2,0.9922923576575211,98058 40 | 512592,2018-04-06 13:05:25.884409+00,tagnews 1.0.2,0.99993577514431,99361 41 | 514308,2018-04-06 13:20:56.85264+00,tagnews 1.0.2,0.9911938024182759,101077 42 | 514617,2018-04-06 13:23:49.111337+00,tagnews 1.0.2,0.9138501467819742,101386 43 | 515297,2018-04-06 13:29:21.0853+00,tagnews 1.0.2,0.9999860022949809,102066 44 | 516352,2018-04-06 13:38:53.543665+00,tagnews 1.0.2,0.994323934760144,103122 45 | 516386,2018-04-06 13:39:15.999084+00,tagnews 1.0.2,0.999935627754646,103156 46 | 517535,2018-04-06 13:48:50.789743+00,tagnews 1.0.2,0.9991068685300749,104306 47 | 517769,2018-04-06 13:51:15.925762+00,tagnews 1.0.2,0.98407007504748,104540 48 | 518677,2018-04-06 13:59:46.477226+00,tagnews 1.0.2,0.9999780090434369,105448 49 | 519589,2018-04-06 14:10:03.884068+00,tagnews 1.0.2,0.999966351973624,106346 50 | 521136,2018-04-06 14:23:57.373069+00,tagnews 1.0.2,0.9967965743400659,107893 51 | 522104,2018-04-06 14:32:30.666413+00,tagnews 1.0.2,0.9826979416411509,108861 52 | 526095,2018-04-06 15:10:55.746265+00,tagnews 1.0.2,0.8428486532137309,112858 53 | 526113,2018-04-06 15:11:06.186507+00,tagnews 1.0.2,0.99871130956743,112876 54 | 529649,2018-04-06 15:42:59.30025+00,tagnews 1.0.2,0.9947046965727321,116413 55 | 533365,2018-04-06 16:19:16.735275+00,tagnews 1.0.2,0.9792654478542471,120162 56 | 534145,2018-04-06 16:26:44.38019+00,tagnews 1.0.2,0.973129736132725,120942 57 | 534864,2018-04-06 16:33:07.889769+00,tagnews 1.0.2,0.9999966310233621,121661 58 | 537443,2018-04-06 16:57:04.395494+00,tagnews 1.0.2,0.95064963726574,124241 59 | 536014,2018-04-06 16:44:00.951708+00,tagnews 1.0.2,0.99998367863464,122811 60 | 538379,2018-04-06 17:07:12.966202+00,tagnews 1.0.2,0.9987915520879151,125154 61 | 539462,2018-04-06 17:17:50.055981+00,tagnews 1.0.2,0.863883951028128,126236 62 | 542197,2018-04-06 17:42:44.372711+00,tagnews 1.0.2,0.9865234365113491,128971 63 | 544884,2018-04-06 18:09:45.995975+00,tagnews 1.0.2,0.999995294977297,131633 64 | 545652,2018-04-06 18:16:40.182721+00,tagnews 1.0.2,0.9977864966673121,132403 65 | 545924,2018-04-06 18:19:22.821435+00,tagnews 1.0.2,0.999978634170278,132675 66 | 547592,2018-04-06 18:34:40.982291+00,tagnews 1.0.2,0.9654272379126729,134343 67 | 547861,2018-04-06 18:37:15.278881+00,tagnews 1.0.2,0.9797029097256109,134612 68 | 548251,2018-04-06 18:41:09.706316+00,tagnews 1.0.2,0.959968104909423,135002 69 | 548801,2018-04-06 18:45:50.652815+00,tagnews 1.0.2,0.9999578705512691,135552 
70 | 548826,2018-04-06 18:46:05.682786+00,tagnews 1.0.2,0.9812507013257259,135577 71 | 549021,2018-04-06 18:47:54.53721+00,tagnews 1.0.2,0.9874391977978849,135772 72 | 549647,2018-04-06 18:54:08.913032+00,tagnews 1.0.2,0.9957128218479301,136398 73 | 550063,2018-04-06 18:58:16.794197+00,tagnews 1.0.2,0.9411575466816129,136814 74 | 551242,2018-04-06 19:11:19.077545+00,tagnews 1.0.2,0.999855649439592,137963 75 | 551960,2018-04-06 19:17:43.937509+00,tagnews 1.0.2,0.992348301676597,138681 76 | 552438,2018-04-06 19:22:19.283931+00,tagnews 1.0.2,0.9293200935215621,139159 77 | 555583,2018-04-06 19:51:29.329114+00,tagnews 1.0.2,0.9983273264004301,142304 78 | 555900,2018-04-06 19:54:30.206493+00,tagnews 1.0.2,0.9996439141829279,142621 79 | 559515,2018-04-06 20:29:30.512525+00,tagnews 1.0.2,0.998004769621765,146218 80 | 561091,2018-04-06 20:44:26.450771+00,tagnews 1.0.2,0.9978320764347309,147795 81 | 561538,2018-04-06 20:48:30.220341+00,tagnews 1.0.2,0.9999984941589128,148242 82 | 562446,2018-04-06 20:57:10.686435+00,tagnews 1.0.2,0.967552833211205,149150 83 | 563189,2018-04-06 21:05:39.600147+00,tagnews 1.0.2,0.9862481476081439,149865 84 | 570417,2018-04-06 22:15:15.316543+00,tagnews 1.0.2,0.788684471902453,157068 85 | 571856,2018-04-06 22:28:26.990609+00,tagnews 1.0.2,0.9995254014306991,158507 86 | 573824,2018-04-06 22:46:12.837637+00,tagnews 1.0.2,0.981930328102285,160476 87 | 574304,2018-04-06 22:50:16.968366+00,tagnews 1.0.2,0.9966144524368841,160956 88 | 574517,2018-04-06 22:52:01.449551+00,tagnews 1.0.2,0.9485902236238009,161169 89 | 576100,2018-04-06 23:09:01.193554+00,tagnews 1.0.2,0.988857692303698,162729 90 | 577661,2018-04-06 23:22:51.826114+00,tagnews 1.0.2,0.974163605880917,164291 91 | 578001,2018-04-06 23:26:09.611614+00,tagnews 1.0.2,0.9999607688401649,164631 92 | 578317,2018-04-06 23:29:04.555426+00,tagnews 1.0.2,0.999911923675472,164947 93 | 580278,2018-04-06 23:47:26.485186+00,tagnews 1.0.2,0.9886499410462121,166908 94 | 580407,2018-04-06 23:48:44.515143+00,tagnews 1.0.2,0.9804183485201959,167037 95 | 580457,2018-04-06 23:49:16.947558+00,tagnews 1.0.2,0.9435893034373599,167087 96 | 584188,2018-04-07 00:27:36.648565+00,tagnews 1.0.2,0.999452789511913,170803 97 | 589047,2018-04-07 01:17:40.407635+00,tagnews 1.0.2,0.9997032201925992,175658 98 | 614671,2018-04-07 05:47:41.381881+00,tagnews 1.0.2,0.923683116129813,201260 99 | 617068,2018-04-07 06:13:34.906861+00,tagnews 1.0.2,0.9999999998184541,203651 100 | 626743,2018-04-07 07:53:33.372862+00,tagnews 1.0.2,0.999307402755212,213344 101 | 627366,2018-04-07 08:00:05.556935+00,tagnews 1.0.2,0.999233766009999,213968 102 | 628811,2018-04-07 08:16:27.001801+00,tagnews 1.0.2,0.9998885006610941,215414 103 | 638085,2018-04-07 09:52:01.276691+00,tagnews 1.0.2,0.999991081666442,224701 104 | 655730,2018-04-07 12:52:48.982083+00,tagnews 1.0.2,0.611761421637548,242395 105 | 664187,2018-04-07 14:18:16.932903+00,tagnews 1.0.2,0.940635779357718,250841 106 | 673593,2018-04-07 15:51:48.004491+00,tagnews 1.0.2,0.999999440852851,260241 107 | 679017,2018-04-07 16:46:03.567665+00,tagnews 1.0.2,0.97342543252198,265658 108 | 703432,2018-04-07 20:45:09.373865+00,tagnews 1.0.2,0.9439736876444,290036 109 | 703434,2018-04-07 20:45:10.40857+00,tagnews 1.0.2,0.971157345809923,290038 110 | 680518,2018-04-07 17:02:42.151431+00,tagnews 1.0.2,0.9044098048439742,267157 111 | 680698,2018-04-07 17:04:51.132479+00,tagnews 1.0.2,0.998803081336373,267325 112 | 681512,2018-04-07 17:13:30.161124+00,tagnews 1.0.2,0.995728484527155,268139 113 | 694916,2018-04-07 
19:23:10.986639+00,tagnews 1.0.2,0.9998013135720291,281528 114 | 695016,2018-04-07 19:24:14.881732+00,tagnews 1.0.2,0.9999974846780271,281628 115 | 698396,2018-04-07 19:55:28.491522+00,tagnews 1.0.2,0.99964667989408,285008 116 | 698507,2018-04-07 19:56:34.685989+00,tagnews 1.0.2,0.9927044974262328,285119 117 | 702101,2018-04-07 20:32:51.635439+00,tagnews 1.0.2,0.987630525694864,288705 118 | 705758,2018-04-07 21:09:12.118101+00,tagnews 1.0.2,0.9508571504826291,292352 119 | 709872,2018-04-07 21:47:14.228435+00,tagnews 1.0.2,0.9979933013970591,296467 120 | 716443,2018-04-07 22:47:09.648682+00,tagnews 1.0.2,0.326910330888475,303024 121 | 727147,2018-04-08 00:23:07.83915+00,tagnews 1.0.2,0.981861782740461,313702 122 | 728505,2018-04-08 00:34:41.867748+00,tagnews 1.0.2,0.16869949219008698,315061 123 | 729271,2018-04-08 00:41:36.337504+00,tagnews 1.0.2,0.995228266194327,315827 124 | 730663,2018-04-08 00:53:40.816585+00,tagnews 1.0.2,0.9999152063211558,317223 125 | 739867,2018-04-08 02:15:18.435313+00,tagnews 1.0.2,0.999999999114922,326413 126 | 740942,2018-04-08 02:24:43.982541+00,tagnews 1.0.2,0.8448991642964859,327488 127 | 766495,2018-04-08 06:11:50.514599+00,tagnews 1.0.2,0.9943096817014441,353037 128 | 768075,2018-04-08 06:26:17.501704+00,tagnews 1.0.2,0.9997591877520999,354618 129 | 776203,2018-04-08 07:40:14.890831+00,tagnews 1.0.2,0.999374242538893,362825 130 | 781435,2018-04-08 08:29:14.581644+00,tagnews 1.0.2,0.9823034105854179,368055 131 | 413550,2018-04-06 01:11:07.73102+00,tagnews 1.0.2,0.9998575366841641,132 132 | 413637,2018-04-06 01:11:19.713638+00,tagnews 1.0.2,0.9363483455726559,219 133 | 415147,2018-04-06 01:17:00.942544+00,tagnews 1.0.2,0.95221771381613,1761 134 | 415662,2018-04-06 01:18:12.646466+00,tagnews 1.0.2,0.9023008565400741,2276 135 | 417225,2018-04-06 01:21:51.371533+00,tagnews 1.0.2,0.9625021379247131,3840 136 | 417506,2018-04-06 01:22:32.058913+00,tagnews 1.0.2,0.918737580752234,4121 137 | 417583,2018-04-06 01:22:43.567975+00,tagnews 1.0.2,0.99972004364052,4198 138 | 417670,2018-04-06 01:22:54.964258+00,tagnews 1.0.2,0.99835708444398,4285 139 | 417739,2018-04-06 01:23:05.351858+00,tagnews 1.0.2,0.9999997197983299,4354 140 | 418293,2018-04-06 01:24:22.771378+00,tagnews 1.0.2,0.9999878349356909,4908 141 | 419207,2018-04-06 01:26:36.866343+00,tagnews 1.0.2,0.9999012082107359,5822 142 | 419765,2018-04-06 01:28:03.48474+00,tagnews 1.0.2,0.9999715841932209,6380 143 | 419836,2018-04-06 01:28:16.467132+00,tagnews 1.0.2,0.9947189203452851,6451 144 | 419919,2018-04-06 01:28:27.687662+00,tagnews 1.0.2,0.999960548834855,6534 145 | 420216,2018-04-06 01:29:11.211941+00,tagnews 1.0.2,0.9442981731740959,6831 146 | 420500,2018-04-06 01:29:49.04032+00,tagnews 1.0.2,0.980764656949998,7115 147 | 420547,2018-04-06 01:29:55.363319+00,tagnews 1.0.2,0.9832552983445559,7162 148 | 420694,2018-04-06 01:30:19.057729+00,tagnews 1.0.2,0.9881275134573192,7309 149 | 420802,2018-04-06 01:30:33.855733+00,tagnews 1.0.2,0.9999239993581641,7417 150 | 421150,2018-04-06 01:31:23.423822+00,tagnews 1.0.2,0.968428807713687,7765 151 | 421153,2018-04-06 01:31:23.826251+00,tagnews 1.0.2,0.99881334561032,7768 152 | 421396,2018-04-06 01:31:57.144215+00,tagnews 1.0.2,0.999992699275227,8011 153 | 421815,2018-04-06 01:32:57.282279+00,tagnews 1.0.2,0.972751737259355,8430 154 | 422211,2018-04-06 01:33:55.110953+00,tagnews 1.0.2,0.966741584957682,8826 155 | 422747,2018-04-06 01:35:13.558814+00,tagnews 1.0.2,0.94497039979141,9362 156 | 423375,2018-04-06 01:36:42.42577+00,tagnews 1.0.2,0.8450758451966,9990 157 | 
423430,2018-04-06 01:36:49.088615+00,tagnews 1.0.2,0.9999214414211991,10045 158 | 423440,2018-04-06 01:36:50.919187+00,tagnews 1.0.2,0.997539950456985,10055 159 | 423745,2018-04-06 01:37:31.853592+00,tagnews 1.0.2,0.99391700724742,10360 160 | 424048,2018-04-06 01:38:15.090671+00,tagnews 1.0.2,0.9772771440389341,10663 161 | 424319,2018-04-06 01:38:52.743871+00,tagnews 1.0.2,0.9654367904633449,10934 162 | 424604,2018-04-06 01:39:38.331158+00,tagnews 1.0.2,0.999998508945548,11219 163 | 425064,2018-04-06 01:40:45.68868+00,tagnews 1.0.2,0.962874638291766,11679 164 | 425264,2018-04-06 01:41:14.548616+00,tagnews 1.0.2,0.9983590464239879,11879 165 | 425292,2018-04-06 01:41:17.894869+00,tagnews 1.0.2,0.78292962101689,11907 166 | 425341,2018-04-06 01:41:23.704755+00,tagnews 1.0.2,0.9404141647615001,11956 167 | 426267,2018-04-06 01:43:26.075762+00,tagnews 1.0.2,0.9884139618494581,12914 168 | 426851,2018-04-06 01:44:40.313328+00,tagnews 1.0.2,0.8884907339904599,13498 169 | 428876,2018-04-06 02:54:22.545312+00,tagnews 1.0.2,0.9933536233887491,15531 170 | 430230,2018-04-06 02:57:09.02851+00,tagnews 1.0.2,0.99993935178435,16887 171 | 430749,2018-04-06 02:58:14.953914+00,tagnews 1.0.2,0.999997576502179,17407 172 | 430966,2018-04-06 02:58:42.411204+00,tagnews 1.0.2,0.999797196071687,17624 173 | 431223,2018-04-06 02:59:15.301091+00,tagnews 1.0.2,0.9999047187628759,17881 174 | 431512,2018-04-06 02:59:50.802354+00,tagnews 1.0.2,0.9993146756468541,18171 175 | 431877,2018-04-06 03:00:47.693256+00,tagnews 1.0.2,0.997268248405229,18534 176 | 432356,2018-04-06 03:01:49.769789+00,tagnews 1.0.2,0.999713222538032,18999 177 | 432510,2018-04-06 03:02:08.648613+00,tagnews 1.0.2,0.977858386569711,19152 178 | 434949,2018-04-06 03:07:09.800331+00,tagnews 1.0.2,0.9999855795547621,21593 179 | 435494,2018-04-06 03:08:15.407483+00,tagnews 1.0.2,0.9458754625382351,22138 180 | 435500,2018-04-06 03:08:15.939637+00,tagnews 1.0.2,0.996230356277238,22144 181 | 436155,2018-04-06 03:09:37.308522+00,tagnews 1.0.2,0.9999984987696869,22800 182 | 437372,2018-04-06 03:12:06.851803+00,tagnews 1.0.2,0.727250099092021,24019 183 | 440314,2018-04-06 03:18:14.179475+00,tagnews 1.0.2,0.985039327167572,26962 184 | 441456,2018-04-06 03:20:34.515571+00,tagnews 1.0.2,0.9999556382799301,28104 185 | 442355,2018-04-06 03:22:28.605809+00,tagnews 1.0.2,0.998576721748682,29003 186 | 442449,2018-04-06 03:22:40.799598+00,tagnews 1.0.2,0.993966600112061,29097 187 | 445543,2018-04-06 03:28:44.805305+00,tagnews 1.0.2,0.99982631322319,32192 188 | 448043,2018-04-06 03:38:48.768498+00,tagnews 1.0.2,0.7778902633735899,34694 189 | 448194,2018-04-06 03:39:57.436842+00,tagnews 1.0.2,0.901220353889572,34846 190 | 448259,2018-04-06 03:40:11.057364+00,tagnews 1.0.2,0.969840685449547,34911 191 | 448414,2018-04-06 03:41:03.00998+00,tagnews 1.0.2,0.917062525553955,35066 192 | 449338,2018-04-06 03:48:20.438671+00,tagnews 1.0.2,0.9951635402008809,35990 193 | 451353,2018-04-06 04:04:49.279498+00,tagnews 1.0.2,0.9687636729776951,37996 194 | 452473,2018-04-06 04:12:59.601278+00,tagnews 1.0.2,0.9990444458982649,39114 195 | -------------------------------------------------------------------------------- /lib/notebooks/keras-glove-testing-api-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/kevin/Documents/github/article-tagging/lib\n" 13 
| ] 14 | } 15 | ], 16 | "source": [ 17 | "cd .." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "Using TensorFlow backend.\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import os\n", 35 | "import tagnews\n", 36 | "import pandas as pd\n", 37 | "from keras.models import Sequential\n", 38 | "from keras.layers import LSTM, Dense, TimeDistributed\n", 39 | "from keras.utils import to_categorical\n", 40 | "from keras.callbacks import ModelCheckpoint\n", 41 | "import numpy as np\n", 42 | "import json\n", 43 | "import requests\n", 44 | "import keras" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "with open('tagnews/data/training.txt', encoding='utf-8') as f:\n", 63 | " our_training_data = f.read()\n", 64 | " \n", 65 | "training_df = pd.DataFrame([x.split() for x in our_training_data.split('\\n')],\n", 66 | " columns=['word', 'tag'])\n", 67 | "training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)\n", 68 | "training_df['all_tags'] = 'NA'\n", 69 | "\n", 70 | "# If you want to join our data w/ kaggle data, you can do this.\n", 71 | "# ner = tagnews.load_ner_data('tagnews/data/')\n", 72 | "# pd.concat([training_df, ner]).reset_index(drop=True)\n", 73 | "\n", 74 | "# If you just want to use our data, you can do this.\n", 75 | "ner = training_df\n", 76 | "\n", 77 | "ner = ner[['word', 'all_tags', 'tag']]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "ner = pd.concat([ner,\n", 87 | " pd.DataFrame(ner['word'].str[0].str.isupper().values),\n", 88 | " pd.DataFrame(glove.loc[ner['word'].str.lower()].values)],\n", 89 | " axis='columns')\n", 90 | "ner.fillna(value=0.0, inplace=True)\n", 91 | "\n", 92 | "data_dim = 51\n", 93 | "timesteps = 25 # only during training, testing can take arbitrary length.\n", 94 | "num_classes = 2\n", 95 | "\n", 96 | "train_val_split = int(19 * ner.shape[0] / 20.)\n", 97 | "\n", 98 | "ner_train_idxs = range(0, train_val_split - timesteps, timesteps)\n", 99 | "x_train = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 100 | " for i in ner_train_idxs])\n", 101 | "y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 102 | " for i in ner_train_idxs])\n", 103 | "\n", 104 | "ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)\n", 105 | "x_val = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 106 | " for i in ner_val_idxs])\n", 107 | "y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 108 | " for i in ner_val_idxs])" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "____________________________________________________________________________________________________\n", 121 | "Layer (type) Output Shape Param # \n", 122 | "====================================================================================================\n", 123 | "lstm_1 (LSTM) (None, None, 32) 10752 \n", 124 | 
"____________________________________________________________________________________________________\n", 125 | "lstm_2 (LSTM) (None, None, 8) 1312 \n", 126 | "____________________________________________________________________________________________________\n", 127 | "time_distributed_1 (TimeDistributed) (None, None, 2) 18 \n", 128 | "====================================================================================================\n", 129 | "Total params: 12,082\n", 130 | "Trainable params: 12,082\n", 131 | "Non-trainable params: 0\n", 132 | "____________________________________________________________________________________________________\n", 133 | "None\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "model = Sequential()\n", 139 | "model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))\n", 140 | "model.add(LSTM(8, return_sequences=True))\n", 141 | "model.add(TimeDistributed(Dense(2, activation='softmax')))\n", 142 | "model.compile(loss='categorical_crossentropy',\n", 143 | " optimizer='adam',\n", 144 | " metrics=['categorical_accuracy'])\n", 145 | "print(model.summary(100))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "os.makedirs('tmp', exist_ok=True)\n", 157 | "checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',\n", 158 | " monitor='val_categorical_accuracy',\n", 159 | " verbose=1,\n", 160 | " save_best_only=True)\n", 161 | "\n", 162 | "class OurAUC(keras.callbacks.Callback):\n", 163 | " def on_epoch_end(self, epoch, logs={}):\n", 164 | " # Go to https://geo-extract-tester.herokuapp.com/ and download\n", 165 | " # the validation data (validation.txt).\n", 166 | " with open('validation.txt', encoding='utf-8') as f:\n", 167 | " s = f.read()\n", 168 | "\n", 169 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 170 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True)],\n", 171 | " axis='columns')\n", 172 | " glove_time_size = 100\n", 173 | " preds_batched = []\n", 174 | " i = 0\n", 175 | " while gloved_data[i:i+glove_time_size].size:\n", 176 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 177 | " axis=0))[0][:,1])\n", 178 | " i += glove_time_size\n", 179 | "\n", 180 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 181 | " for prob in [p for pred in preds_batched for p in pred]:\n", 182 | " f.write(str(prob) + '\\n')\n", 183 | "\n", 184 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:\n", 185 | " url = 'https://geo-extract-tester.herokuapp.com/api/score'\n", 186 | " r = requests.post(url, files={'file': f})\n", 187 | " try:\n", 188 | " print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))\n", 189 | " except KeyError:\n", 190 | " raise ValueError('Problem retrieving AUC from API. 
Is your validation set up to date?')\n", 191 | "\n", 192 | "our_auc = OurAUC()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Train on 2467 samples, validate on 129 samples\n", 205 | "Epoch 1/20\n", 206 | "Epoch 00000: val_categorical_accuracy improved from -inf to 0.93054, saving model to ./tmp/weights-00.hdf5\n", 207 | "AUC: 0.88599\n", 208 | "27s - loss: 0.3390 - categorical_accuracy: 0.9053 - val_loss: 0.2362 - val_categorical_accuracy: 0.9305\n", 209 | "Epoch 2/20\n", 210 | "Epoch 00001: val_categorical_accuracy did not improve\n", 211 | "AUC: 0.93386\n", 212 | "26s - loss: 0.2037 - categorical_accuracy: 0.9177 - val_loss: 0.1728 - val_categorical_accuracy: 0.9271\n", 213 | "Epoch 3/20\n", 214 | "Epoch 00002: val_categorical_accuracy did not improve\n", 215 | "AUC: 0.94096\n", 216 | "26s - loss: 0.1584 - categorical_accuracy: 0.9369 - val_loss: 0.1627 - val_categorical_accuracy: 0.9253\n", 217 | "Epoch 4/20\n", 218 | "Epoch 00003: val_categorical_accuracy did not improve\n", 219 | "AUC: 0.94627\n", 220 | "26s - loss: 0.1458 - categorical_accuracy: 0.9429 - val_loss: 0.1583 - val_categorical_accuracy: 0.9243\n", 221 | "Epoch 5/20\n", 222 | "Epoch 00004: val_categorical_accuracy did not improve\n", 223 | "AUC: 0.94879\n", 224 | "27s - loss: 0.1399 - categorical_accuracy: 0.9448 - val_loss: 0.1532 - val_categorical_accuracy: 0.9262\n", 225 | "Epoch 6/20\n", 226 | "Epoch 00005: val_categorical_accuracy did not improve\n", 227 | "AUC: 0.95070\n", 228 | "26s - loss: 0.1351 - categorical_accuracy: 0.9465 - val_loss: 0.1526 - val_categorical_accuracy: 0.9287\n", 229 | "Epoch 7/20\n", 230 | "Epoch 00006: val_categorical_accuracy did not improve\n", 231 | "AUC: 0.95202\n", 232 | "26s - loss: 0.1326 - categorical_accuracy: 0.9467 - val_loss: 0.1512 - val_categorical_accuracy: 0.9281\n", 233 | "Epoch 8/20\n", 234 | "Epoch 00007: val_categorical_accuracy did not improve\n", 235 | "AUC: 0.95270\n", 236 | "27s - loss: 0.1301 - categorical_accuracy: 0.9488 - val_loss: 0.1527 - val_categorical_accuracy: 0.9281\n", 237 | "Epoch 9/20\n", 238 | "Epoch 00008: val_categorical_accuracy did not improve\n", 239 | "AUC: 0.95297\n", 240 | "27s - loss: 0.1276 - categorical_accuracy: 0.9493 - val_loss: 0.1465 - val_categorical_accuracy: 0.9274\n", 241 | "Epoch 10/20\n", 242 | "Epoch 00009: val_categorical_accuracy did not improve\n", 243 | "AUC: 0.95275\n", 244 | "28s - loss: 0.1255 - categorical_accuracy: 0.9493 - val_loss: 0.1444 - val_categorical_accuracy: 0.9287\n", 245 | "Epoch 11/20\n", 246 | "Epoch 00010: val_categorical_accuracy did not improve\n", 247 | "AUC: 0.95273\n", 248 | "27s - loss: 0.1241 - categorical_accuracy: 0.9496 - val_loss: 0.1439 - val_categorical_accuracy: 0.9281\n", 249 | "Epoch 12/20\n", 250 | "Epoch 00011: val_categorical_accuracy did not improve\n", 251 | "AUC: 0.95465\n", 252 | "27s - loss: 0.1231 - categorical_accuracy: 0.9498 - val_loss: 0.1443 - val_categorical_accuracy: 0.9268\n", 253 | "Epoch 13/20\n", 254 | "Epoch 00012: val_categorical_accuracy did not improve\n", 255 | "AUC: 0.95379\n", 256 | "27s - loss: 0.1211 - categorical_accuracy: 0.9507 - val_loss: 0.1492 - val_categorical_accuracy: 0.9284\n", 257 | "Epoch 14/20\n", 258 | "Epoch 00013: val_categorical_accuracy did not improve\n", 259 | "AUC: 0.95501\n", 260 | "27s - loss: 0.1195 - categorical_accuracy: 0.9510 - val_loss: 0.1436 - val_categorical_accuracy: 
0.9274\n", 261 | "Epoch 15/20\n", 262 | "Epoch 00014: val_categorical_accuracy did not improve\n", 263 | "AUC: 0.95443\n", 264 | "27s - loss: 0.1170 - categorical_accuracy: 0.9527 - val_loss: 0.1405 - val_categorical_accuracy: 0.9290\n", 265 | "Epoch 16/20\n", 266 | "Epoch 00015: val_categorical_accuracy did not improve\n", 267 | "AUC: 0.95387\n", 268 | "26s - loss: 0.1151 - categorical_accuracy: 0.9536 - val_loss: 0.1395 - val_categorical_accuracy: 0.9281\n", 269 | "Epoch 17/20\n", 270 | "Epoch 00016: val_categorical_accuracy did not improve\n", 271 | "AUC: 0.95428\n", 272 | "27s - loss: 0.1135 - categorical_accuracy: 0.9538 - val_loss: 0.1402 - val_categorical_accuracy: 0.9278\n", 273 | "Epoch 18/20\n", 274 | "Epoch 00017: val_categorical_accuracy did not improve\n", 275 | "AUC: 0.95323\n", 276 | "27s - loss: 0.1120 - categorical_accuracy: 0.9546 - val_loss: 0.1450 - val_categorical_accuracy: 0.9287\n", 277 | "Epoch 19/20\n", 278 | "Epoch 00018: val_categorical_accuracy improved from 0.93054 to 0.93240, saving model to ./tmp/weights-18.hdf5\n", 279 | "AUC: 0.95366\n", 280 | "27s - loss: 0.1107 - categorical_accuracy: 0.9557 - val_loss: 0.1386 - val_categorical_accuracy: 0.9324\n", 281 | "Epoch 20/20\n", 282 | "Epoch 00019: val_categorical_accuracy improved from 0.93240 to 0.93240, saving model to ./tmp/weights-19.hdf5\n", 283 | "AUC: 0.95260\n", 284 | "27s - loss: 0.1078 - categorical_accuracy: 0.9570 - val_loss: 0.1414 - val_categorical_accuracy: 0.9324\n" 285 | ] 286 | }, 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "" 291 | ] 292 | }, 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "model.fit(x_train, y_train,\n", 300 | " epochs=20,\n", 301 | " validation_data=(x_val, y_val),\n", 302 | " callbacks=[checkpointer, our_auc],\n", 303 | " verbose=2)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# Go to https://geo-extract-tester.herokuapp.com/ and download\n", 313 | "# the validation data (validation.txt).\n", 314 | "with open('validation.txt', encoding='utf-8') as f:\n", 315 | " s = f.read()\n", 316 | "\n", 317 | "gloved_data = glove.loc[[w for w in s.split('\\n') if w]].fillna(0)\n", 318 | "glove_time_size = 100\n", 319 | "preds_batched = []\n", 320 | "i = 0\n", 321 | "while gloved_data[i:i+glove_time_size].size:\n", 322 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size], axis=0))[0][:,1])\n", 323 | " i += glove_time_size\n", 324 | "\n", 325 | "preds = [p for pred in preds_batched for p in pred]\n", 326 | "\n", 327 | "print('\\n'.join(['{:>15} {:>9.4f}'.format(w, p) for (w, p) in zip(words, preds)][400:500]))\n", 328 | " \n", 329 | "with open('guesses.txt', 'w') as f:\n", 330 | " for prob in [p for pred in preds_batched for p in pred]:\n", 331 | " f.write(str(prob) + '\\n')\n", 332 | "\n", 333 | "# Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!" 
334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.6.1" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 2 367 | } 368 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_trainedlocation.csv: -------------------------------------------------------------------------------- 1 | 2285,Marquez-Connerly,,,413550 2 | 2286,7500 block of Ridge Avenue,,,413550 3 | 2287,"500 block of Elmwood Avenue,",,,413550 4 | 3352,Southwest Side Little Village,,,415147 5 | 3353,"3300 block of West 27th Street,",,,415147 6 | 3708,5700 block of North Christiana,,,415662 7 | 5640,Marquette,,,417739 8 | 5641,and Marquette,,,417739 9 | 5642,of 79th and Marquette. Much,,,417739 10 | 5643,Fernando Chavez’s,,,417739 11 | 7077,West,,,419207 12 | 7078,District,,,419207 13 | 7516,1700 block of North Rockwell,,,419765 14 | 7610,Side,,,419836 15 | 7611,3800 block of North Freemont,,,419836 16 | 7612,Harbor,,,419836 17 | 7699,South Side Englewood neighborhood.,,,419919 18 | 7700,5800 block of South Elizabeth Street,,,419919 19 | 7701,5800 block of South Racine,,,419919 20 | 7702,59th Street,,,419919 21 | 7947,2400 block of West,,,420216 22 | 8846,2400 block of North Lincoln,,,421150 23 | 8847,"Felix Hotel, 111 W. Huron. She",,,421150 24 | 8848,The,,,421150 25 | 9022,Northwest Side. The trio,,,421396 26 | 9023,3400 block of North Lawndale,,,421396 27 | 9024,4800 block of South Lake,,,421396 28 | 9025,3700 block of South,,,421396 29 | 9026,3300 block of West,,,421396 30 | 9701,County Courthouse,,,422211 31 | 10619,Monday,,,423375 32 | 10620,6700 to 6900 blocks of South Justine. Anyone,,,423375 33 | 10652,"Chicago,",,,423430 34 | 10655,Street,,,423440 35 | 10924,County,,,423745 36 | 10925,200 block of Sherry Lane. A,,,423745 37 | 11175,block of North,,,424048 38 | 11176,"bakery,",,,424048 39 | 11177,"1000 block of North Boulevard,",,,424048 40 | 11178,"1010 North Blvd,",,,424048 41 | 11380,WIRE December,,,424319 42 | 11381,Gresham District,,,424319 43 | 11382,Halsted St.,,,424319 44 | 11383,Chevrolet Caprice,,,424319 45 | 11384,79th and Peoria,,,424319 46 | 11856,2100 block of North Milwaukee Avenue.,,,424604 47 | 11857,"2100 block of North Rockwell Street,",,,424604 48 | 12384,Circuit,,,425064 49 | 12589,7100 block of South Normal Avenue.,,,425292 50 | 12590,7100 block of South Normal,,,425292 51 | 12635,2700 block of South Vernon Avenue.,,,425341 52 | 13250,2700,,,426267 53 | 16959,Lawn,,,430230 54 | 16960,"63rd Street,",,,430230 55 | 16961,3600 block of South 52nd Court,,,430230 56 | 16962,west,,,430230 57 | 16963,63rd St.,,,430230 58 | 16964,"Street apartment,",,,430230 59 | 16965,Street. Neighbors,,,430230 60 | 16966,Investigators,,,430230 61 | 16967,Street,,,430230 62 | 16968,Midway Lounge,,,430230 63 | 16969,"Street, reading",,,430230 64 | 17494,leg. 
The,,,430749 65 | 17495,6400 block of South Morgan Street,,,430749 66 | 17496,6000 block of,,,430749 67 | 17497,gallery,,,430749 68 | 17704,ne ighborhood,,,430966 69 | 17705,"South Side, police. The",,,430966 70 | 17706,1400 block of East 75th Street,,,430966 71 | 18145,"Area,",,,431512 72 | 18146,south,,,431512 73 | 18147,west,,,431512 74 | 18148,114th,,,431512 75 | 18571,of South,,,431877 76 | 18572,8400 block of South Kingston Avenue.,,,431877 77 | 22421,intersection of East 79th Street,,,436155 78 | 22422,Street bus. She,,,436155 79 | 23381,Her,,,437372 80 | 23382,North Broadway,,,437372 81 | 27583,4400 block of South Hermitage,,,441456 82 | 28555,Roseland neigh borhood. Police,,,442355 83 | 28556,Street,,,442355 84 | 28589,Michigan,,,442449 85 | 31881,South East End.,,,445543 86 | 31882,South Dobson. Police,,,445543 87 | 35730,S. Ahrens,,,449338 88 | 37603,West Farragut,,,451353 89 | 37604,"W. Berwyn,",,,451353 90 | 38654,Whitehall Hotel ][2],,,452473 91 | 38655,"100 block of East Delaware Place,",,,452473 92 | 38656,"floor,""",,,452473 93 | 40168,of 91st Street and South,,,453893 94 | 40169,South Side,,,453893 95 | 40170,9000 block of South,,,453893 96 | 40171,4900 block of West Ferdinand Street,,,453893 97 | 40172,3600 block of West Diversey Ave,,,453893 98 | 40173,"8000 block of South Manistee Avenue,",,,453893 99 | 40174,"1900 block of South Marshall Boulevard,",,,453893 100 | 40175,1100 block of North Pulaski Road,,,453893 101 | 40176,2300 block of South Rockwell,,,453893 102 | 40177,"7800 block of South Paulina Street,",,,453893 103 | 40178,2300 block of South Washtenaw Avenue,,,453893 104 | 40179,"Street,",,,453893 105 | 40694,"Street,",,,454491 106 | 42410,Side,,,456158 107 | 42411,Morgan Park,,,456158 108 | 42412,Southwest Side. The,,,456158 109 | 42413,2200 block of West Barry,,,456158 110 | 42414,Lake View neighborhood.,,,456158 111 | 42415,of,,,456158 112 | 42416,19th,,,456158 113 | 42451,4500 block of South Paulina Street.,,,456190 114 | 42452,"arm,",,,456190 115 | 43893,5600 block of West Grand,,,457690 116 | 43894,When I,,,457690 117 | 44375,South Side,,,458343 118 | 44376,Bank,,,458343 119 | 44377,9400 block of South Ashland Avenue,,,458343 120 | 44795,Thorobreds,,,458774 121 | 44796,"floor,",,,458774 122 | 47308,South Deering neighborhood,,,462142 123 | 47309,"city’s Far South Side,",,,462142 124 | 47310,"9800 block of South Merrill Avenue,",,,462142 125 | 47311,South,,,462142 126 | 47312,2200 block of East 97th Street,,,462142 127 | 47313,9400 block of South Rhodes Avenue,,,462142 128 | 47314,8600 block of South Kingston,,,462142 129 | 47315,block of North,,,462142 130 | 47316,"5600 block of West Washington Boulevard,",,,462142 131 | 47317,Garfield Park neighborhood. Claude Snulligan,,,462142 132 | 47318,0,,,462142 133 | 47319,"100 block of South Pulaski Road,",,,462142 134 | 47320,block of South Drake,,,462142 135 | 47321,South Deering neighborhood. 
Terrance Wright,,,462142 136 | 47322,99th Street,,,462142 137 | 47323,"5000 block of South Carpenter Street,",,,462142 138 | 47324,"1300 block of West Estes Avenue,",,,462142 139 | 52727,"Side Wednesday night,",,,470170 140 | 52728,Street,,,470170 141 | 52729,block of East 147th Street,,,470170 142 | 54460,"4400 block of North Sheridan Road,",,,472166 143 | 54461,When,,,472166 144 | 54462,block of West Lawrence,,,472166 145 | 54719,"100 block of East Cass Street,",,,472582 146 | 54720,Hospital,,,472582 147 | 54721,Lenox;,,,472582 148 | 54722,"apartment,",,,472582 149 | 55052,West Pullman,,,473190 150 | 55053,"Far South Side,",,,473190 151 | 55054,of South Morgan,,,473190 152 | 56448,block of Pfingston Road,,,475062 153 | 56449,2200 block of Central Street,,,475062 154 | 69421,P Stone,,,491582 155 | 69422,7800 block of South Kingston,,,491582 156 | 69711,block of South Escanaba,,,492024 157 | 69712,8000 block of South Escanaba,,,492024 158 | 69713,10900,,,492024 159 | 69714,South Racine,,,492024 160 | 69987,"time, Pedraza said. The Harrison",,,492289 161 | 69988,The,,,492289 162 | 70500,E. Boughton Road.,,,493040 163 | 70501,1725 W. Boughton,,,493040 164 | 70502,400 block of East Briarcliff Road,,,493040 165 | 70503,DUI,,,493040 166 | 70504,block of Woodcreek,,,493040 167 | 70505,Cache Road,,,493040 168 | 70506,400 block of New Avenue,,,493040 169 | 70507,600 block of Jordan Avenue,,,493040 170 | 70508,1700 block of William Drive,,,493040 171 | 71384,"6500 block of South Maryland,",,,494190 172 | 71385,Maryland,,,494190 173 | 71386,"Sedell Brown,",,,494190 174 | 73400,9700 block of South Greenwood Avenue,,,496875 175 | 76891,110 block of South Michigan,,,500625 176 | 77309,1100 block of Pleasant of,,,501009 177 | 77310,block of Superior Street,,,501009 178 | 77311,6400 block of Roosevelt Road,,,501009 179 | 77312,400 block of Augusta Boulevard,,,501009 180 | 77313,1000 block of South Elmwood Avenue,,,501009 181 | 77314,1100 block of South Humphrey Avenue,,,501009 182 | 77315,"100 block of North Humphrey Avenue,",,,501009 183 | 77316,1000 block of Woodbine between,,,501009 184 | 77317,600 block of Highland Avenue. •A,,,501009 185 | 77318,720 W. North,,,501009 186 | 77319,400 block of Washington between,,,501009 187 | 77320,800 block of Home Avenue,,,501009 188 | 77321,5900 block of Chicago Avenue,,,501009 189 | 77322,400 block of North Humphrey Avenue.,,,501009 190 | 77323,block of Lake,,,501009 191 | 77324,8000 block of South Drexel,,,501009 192 | 78941,3700 block of West 119th Street,,,502490 193 | 78942,Friday,,,502490 194 | 78943,"3700 block of West 119th Street,",,,502490 195 | 78944,8700 block of South Burley Avenue,,,502490 196 | 78945,Pontrelli,,,502490 197 | 79020,"Boston Marathon,",,,502547 198 | 79021,Three,,,502547 199 | 79022,Boston Marathon,,,502547 200 | 79299,"Boston Marathon,",,,502889 201 | 79300,Chic,,,502889 202 | 79301,Marathon,,,502889 203 | 83804,Assembly,,,507757 204 | 83805,south suburbs. 
They,,,507757 205 | 88363,6400 block of North Albany Avenue,,,512592 206 | 90167,west,,,514617 207 | 90607,"6700 block of South Evans Avenue,",,,515297 208 | 90608,Police,,,515297 209 | 90609,Their,,,515297 210 | 90610,65th and Maryland,,,515297 211 | 90611,"5700 block of South Washtenaw Avenue,",,,515297 212 | 93343,block of 175th Street,,,517535 213 | 93344,of,,,517535 214 | 93529,Meadows,,,517769 215 | 93530,Cook County Circuit,,,517769 216 | 94346,1200 block of North Mayfield,,,518677 217 | 95295,Side,,,519589 218 | 95296,intersection of West 69th Street and South,,,519589 219 | 95297,9700 block of South Merrion Avenue,,,519589 220 | 95298,"2200 block of East 75th Street,",,,519589 221 | 95299,"600 block of East 79th Street,",,,519589 222 | 96653,"Side Wednesday afternoon,",,,521136 223 | 96654,and Loomis streets,,,521136 224 | 97904,1800 block of North Damen,,,522104 225 | 108439,"3900 block of North Long,",,,533365 226 | 109412,A Cook,,,534145 227 | 109413,Corro,,,534145 228 | 109414,"4800 block of Oakton Street,",,,534145 229 | 109415,Corro,,,534145 230 | 109416,105 of,,,534145 231 | 109417,Skokie,,,534145 232 | 109994,"Roseland neighborhood,",,,534864 233 | 109995,10500 block of South LaSalle Street,,,534864 234 | 109996,"back,",,,534864 235 | 113173,forest,,,538379 236 | 113174,PT Cruiser,,,538379 237 | 113175,Cap Sauers Holdings Nature,,,538379 238 | 113176,Breit,,,538379 239 | 114033,block of North,,,539462 240 | 118406,"Austin neighborhood,",,,544884 241 | 118407,700 block of North Parkside,,,544884 242 | 118408,2090,,,544884 243 | 119130,Loop. San,,,545652 244 | 119131,The intersection,,,545652 245 | 119132,Yojimbo's,,,545652 246 | 119133,"Larrabee,",,,545652 247 | 119134,cyclist's path. Townsend's,,,545652 248 | 119135,"18th District lockup,",,,545652 249 | 119136,Clybourn-Larrabee intersection,,,545652 250 | 119137,Clybourn-Larrabee intersection,,,545652 251 | 119138,d esignate,,,545652 252 | 119139,street,,,545652 253 | 119140,Honorary Bobby,,,545652 254 | 119141,east coast. Before unveiling,,,545652 255 | 119219,He,,,545924 256 | 121737,1600 block of South,,,548801 257 | 121738,4200 block of West Lake Street,,,548801 258 | 121739,"West Side,",,,548801 259 | 121740,4200 block of South Fifth,,,548801 260 | 121741,"200 block of North Karlov Avenue,",,,548801 261 | 122020,No,,,549021 262 | 122459,South Halsted,,,549647 263 | 122460,Auburn-Gresham,,,549647 264 | 122461,Tuesday,,,549647 265 | 123640,southwest,,,551242 266 | 123641,"arm,",,,551242 267 | 123642,Will County,,,551242 268 | 123643,block of Francis Street,,,551242 269 | 123644,"Gerald Chamberlain Jr.,",,,551242 270 | 123645,1300 block of Englewood Avenue,,,551242 271 | 124322,Bank of America branch located,,,551960 272 | 124323,18460 Governors Highway,,,551960 273 | 124324,approximately,,,551960 274 | 124325,183rd Street,,,551960 275 | 124326,183rd Street. Parker,,,551960 276 | 124626,"Court House,",,,552438 277 | 127597,2700 block of N. Mango. 
Police,,,555583 278 | 127598,2500 block of N.,,,555583 279 | 127853,The,,,555900 280 | 130841,"County board,",,,559515 281 | 131998,9000 block of South,,,561091 282 | 131999,9100 block of South Bishop,,,561091 283 | 132253,Bluebird,,,561538 284 | 133136,200 block of West Diversey Parkway.,,,563189 285 | 137732,block of South,,,571856 286 | 137733,4300 block of North Sheridan Road,,,571856 287 | 137734,"3500 block of North Broadway,",,,571856 288 | 137735,"5500 block of North Clark,",,,571856 289 | 140874,South Laflin,,,576100 290 | 140875,West Side,,,576100 291 | 142014,Calumet Heights neighborhood,,,578001 292 | 142015,Chicago's South Side.,,,578001 293 | 142016,9300 block of South Stony,,,578001 294 | 142372,Austin,,,578317 295 | 142373,5400 block of West Madison Street,,,578317 296 | 142374,Austin,,,578317 297 | 142375,That,,,578317 298 | 143657,"School,",,,580278 299 | 143905,Police,,,580407 300 | 143925,south side of,,,580457 301 | 146364,"3700 block of 83rd Place,",,,584188 302 | 146365,Two,,,584188 303 | 149490,"8200 block of South Whipple Street,",,,589047 304 | 149491,"7900 block of South California Avenue,",,,589047 305 | 163511,7400 block of South South,,,614671 306 | 163512,"Street,",,,614671 307 | 165137,430,,,617068 308 | 172159,Austin neighborhood. Demetrius Bronson,,,627366 309 | 172160,900 block of Lorel Avenue.,,,627366 310 | 172161,DiBella,,,627366 311 | 172162,"900 block of Lorel,",,,627366 312 | 173206,West Englewood,,,628811 313 | 173207,South Side,,,628811 314 | 173208,1600 block of West 71st Street,,,628811 315 | 173209,South Chicago neighborhood,,,628811 316 | 179094,Pulaski Road. The,,,638085 317 | 212069,5500 block of South Hermitage,,,673593 318 | 212070,Lower West Side. The,,,673593 319 | 212071,block of South Western,,,673593 320 | 212072,Humboldt Park,,,673593 321 | 212073,West Side. A,,,673593 322 | 212074,1100 block of North Ridgeway,,,673593 323 | 212075,South Shore neighborhood. Officers,,,673593 324 | 212076,6900 block of South Clyde,,,673593 325 | 212077,2200 block of South Lawndale,,,673593 326 | 212078,West Side. Hector Badillo,,,673593 327 | 212079,700 block of North California Avenue,,,673593 328 | 212080,"400 block of North Trumbull,",,,673593 329 | 212081,Southwest Side Brighton Park,,,673593 330 | 212082,"2600 block of West 39th,",,,673593 331 | 212083,Woodlawn,,,673593 332 | 212084,South Side. The,,,673593 333 | 212085,6200 block of South Drexel,,,673593 334 | 212086,South Side. The,,,673593 335 | 212087,block of West 79th,,,673593 336 | 212088,Trumbull Park neighborhood,,,673593 337 | 212089,Far South Side. A,,,673593 338 | 212090,block of South Yates,,,673593 339 | 212091,South Side. At,,,673593 340 | 219217,West Garfield Park neighborhood. The,,,680518 341 | 219218,Kostner,,,680518 342 | 219219,"4300 block of West Adams,",,,680518 343 | 219305,block of 81st Court,,,680698 344 | 219306,Palos,,,680698 345 | 219307,Department,,,680698 346 | 219308,"Correction,",,,680698 347 | 234322,north,,,694916 348 | 234323,Drive,,,694916 349 | 234324,Park,,,694916 350 | 234400,3500 block of Wonder Lake,,,695016 351 | 242709,A Chicago,,,703432 352 | 242710,Side,,,703432 353 | 244808,Jefferson Park,,,705758 354 | 244809,Northwest Side. Kyle Brandon,,,705758 355 | 244810,5000 block of North Long,,,705758 356 | 248279,Wrigleyville,,,709872 357 | 248280,N orth Side. 
Jarqueese O’Brian,,,709872 358 | 248281,3700 block of North Fremont,,,709872 359 | 248282,South Side,,,709872 360 | 248283,11500 block of South Throop,,,709872 361 | 253595,block of South Calumet Avenue,,,716443 362 | 253596,block of South Prairie Avenue,,,716443 363 | 253597,10400 block of South Indiana Avenue,,,716443 364 | 253598,10500 block of South Forest Avenue,,,716443 365 | 253599,10400 block of South calumet Avenue,,,716443 366 | 253600,800 block of East 103rd Street •,,,716443 367 | 253601,500 block of East 105th Street •,,,716443 368 | 253602,400 block of East 107th Street The,,,716443 369 | 262599,800 block of N. Michigan Ave.,,,727147 370 | 262600,Gold Coast,,,727147 371 | 262601,1700 block of W. Wabansia Ave.,,,727147 372 | 262602,700 block of W. Hubbard St.,,,727147 373 | 262603,3500 block of N. Clark St.,,,727147 374 | 262604,1900 block of N. Lincoln Ave.,,,727147 375 | 262605,Lincoln,,,727147 376 | 262606,700 block of N. Armour St.,,,727147 377 | 262607,1300 block of N. Bosworth Ave.,,,727147 378 | 263732,west,,,728505 379 | 263733,of Amesbury Road,,,728505 380 | 264502,South Loop. Samantha Salazar,,,729271 381 | 264503,1100 block of South Indiana,,,729271 382 | 264504,South Side Auburn Gresham neighborhood.,,,729271 383 | 264505,County Jail,,,729271 384 | 265724,3900 block of North Ashland Avenue,,,730663 385 | 265725,6200 block of North Western Avenue,,,730663 386 | 265726,1900 block of West Peterson Avenue •,,,730663 387 | 265727,4400 block of North Broadway •,,,730663 388 | 265728,4800 block of North Broadway •,,,730663 389 | 265729,2800 block of North Broadway No,,,730663 390 | 273333,1600 block of South St. Louis,,,739867 391 | 273334,1800 block of West 87,,,739867 392 | 273335,St. On,,,739867 393 | 273336,1800 block of W. 87,,,739867 394 | 273337,"St,",,,739867 395 | 273338,1200 block of W. 79 th,,,739867 396 | 273339,block of S. Marshell,,,739867 397 | 273340,approximately 2:28,,,739867 398 | 273341,7700 block of S. Cottage Grove,,,739867 399 | 273342,1100 block of North Lockwood,,,739867 400 | 273343,1300 block of South Millard,,,739867 401 | 273344,7100 block of South State,,,739867 402 | 273345,4200 block of West Addison,,,739867 403 | 273346,6600 block of South Capmbell. He,,,739867 404 | 273347,1300 block of South Throop. A,,,739867 405 | 273348,400 block of East,,,739867 406 | 273349,Street.,,,739867 407 | 273350,6600 block of South Cottage Grove. The,,,739867 408 | 273351,"6800 block of South Crandon,",,,739867 409 | 273352,He,,,739867 410 | 273353,1600 block of South St. Louis. He,,,739867 411 | 273354,A,,,739867 412 | 273355,block of South Wallace,,,739867 413 | 273356,approximately,,,739867 414 | 273357,5800 block of South King Drive,,,739867 415 | 273358,block of West,,,739867 416 | 273359,4500 block of South Pulaski. One,,,739867 417 | 273360,street. An,,,739867 418 | 297211,South Side Fuller Park,,,766495 419 | 297212,"200 block of West 47th Street. 
Initially,",,,766495 420 | 305403,South May Street,,,776203 421 | -------------------------------------------------------------------------------- /lib/notebooks/keras-glove-with-street-names-better.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/Users/josh/Documents/chihack/article-tagging/lib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "cd '/Users/josh/Documents/chihack/article-tagging/lib'" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 30 | " return f(*args, **kwds)\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from numpy.random import seed\n", 36 | "seed(1)\n", 37 | "from tensorflow import set_random_seed\n", 38 | "set_random_seed(2)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "import tagnews\n", 51 | "import pandas as pd\n", 52 | "from keras.models import Sequential\n", 53 | "from keras.layers import LSTM, Dense, TimeDistributed\n", 54 | "from keras.utils import to_categorical\n", 55 | "from keras.callbacks import ModelCheckpoint\n", 56 | "import numpy as np\n", 57 | "import json\n", 58 | "import requests\n", 59 | "import keras\n", 60 | "import shutil" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(400000, 50)" 83 | ] 84 | }, 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "glove.shape" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "glove.loc['address_vec'] = glove.loc[['street', 'avenue', 'place', 'road', 'block', 'main', 'city', 'west', 'east', 'north', 'south']].mean()\n", 103 | "glove.loc['neighborhood_vec'] = glove.loc[['neighborhood', 'burrough', 'community', 'area']].mean()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 7, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "with open('tagnews/data/Chicago_Street_Names.csv') as street_names:\n", 115 | " streets = street_names.read().splitlines()[1:]\n", 116 | "streets = [i.lower() for i in streets]\n", 117 | "\n", 118 | "with open('tagnews/data/chicago_neighborhoods.csv') as neighborhoods:\n", 119 | " hoods = neighborhoods.read().splitlines()\n", 120 | "hoods = list(set([j.lower().replace('\\\"', '') for j in hoods]))\n", 121 | "\n", 122 | "for name in streets:\n", 123 | " glove.loc[name] = glove.loc['address_vec']\n", 124 | "for hood in hoods:\n", 125 | " glove.loc[hood] = glove.loc['neighborhood_vec']" 126 | 
] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 8, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "with open('tagnews/data/training.txt', encoding='utf-8') as f:\n", 137 | " our_training_data = f.read()\n", 138 | " \n", 139 | "training_df = pd.DataFrame([x.split() for x in our_training_data.split('\\n') if x],\n", 140 | " columns=['word', 'tag'])\n", 141 | "\n", 142 | "training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)\n", 143 | "training_df['all_tags'] = 'NA'\n", 144 | "\n", 145 | "# If you want to join our data w/ kaggle data, you can do this.\n", 146 | "# ner = tagnews.load_ner_data('tagnews/data/')\n", 147 | "# pd.concat([training_df, ner]).reset_index(drop=True)\n", 148 | "\n", 149 | "# If you just want to use our data, you can do this.\n", 150 | "ner = training_df\n", 151 | "\n", 152 | "ner = ner[['word', 'all_tags', 'tag']]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "# pd.DataFrame(glove.loc[ner.loc[ner['word'] == 'Woodlawn']['word'].str.lower()].values)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 10, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ner = pd.concat([ner,\n", 175 | " pd.DataFrame(ner['word'].str[0].str.isupper().values),\n", 176 | " pd.DataFrame(glove.loc[ner['word'].str.lower()].values),\n", 177 | " pd.DataFrame(ner['word'].str.isnumeric().values),\n", 178 | " pd.DataFrame(ner['word'].str.len().values)],\n", 179 | " axis='columns')\n", 180 | "ner.fillna(value=0.0, inplace=True)\n", 181 | "\n", 182 | "data_dim = 53\n", 183 | "timesteps = 25 # only during training, testing can take arbitrary length.\n", 184 | "num_classes = 2\n", 185 | "\n", 186 | "train_val_split = int(19 * ner.shape[0] / 20.)\n", 187 | "\n", 188 | "ner_train_idxs = range(0, train_val_split - timesteps, timesteps)\n", 189 | "x_train = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 190 | " for i in ner_train_idxs])\n", 191 | "y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 192 | " for i in ner_train_idxs])\n", 193 | "\n", 194 | "ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)\n", 195 | "x_val = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 196 | " for i in ner_val_idxs])\n", 197 | "y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 198 | " for i in ner_val_idxs])" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 11, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "____________________________________________________________________________________________________\n", 211 | "Layer (type) Output Shape Param # \n", 212 | "====================================================================================================\n", 213 | "lstm_1 (LSTM) (None, None, 32) 11008 \n", 214 | "____________________________________________________________________________________________________\n", 215 | "lstm_2 (LSTM) (None, None, 8) 1312 \n", 216 | "____________________________________________________________________________________________________\n", 217 | "time_distributed_1 (TimeDistributed) (None, None, 2) 18 \n", 218 | "====================================================================================================\n", 219 | 
"Total params: 12,338\n", 220 | "Trainable params: 12,338\n", 221 | "Non-trainable params: 0\n", 222 | "____________________________________________________________________________________________________\n", 223 | "None\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "model = Sequential()\n", 229 | "model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))\n", 230 | "model.add(LSTM(8, return_sequences=True))\n", 231 | "model.add(TimeDistributed(Dense(2, activation='softmax')))\n", 232 | "model.compile(loss='categorical_crossentropy',\n", 233 | " optimizer='adam',\n", 234 | " metrics=['categorical_accuracy'])\n", 235 | "print(model.summary(100))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "os.makedirs('tmp', exist_ok=True)\n", 247 | "checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',\n", 248 | " monitor='val_categorical_accuracy',\n", 249 | " verbose=1,\n", 250 | " save_best_only=True)\n", 251 | "\n", 252 | "class OurAUC(keras.callbacks.Callback):\n", 253 | " def on_epoch_end(self, epoch, logs={}):\n", 254 | " # Go to https://geo-extract-tester.herokuapp.com/ and download\n", 255 | " # the validation data (validation.txt).\n", 256 | " '''with open('validation.txt', encoding='utf-8') as f:\n", 257 | " s = f.read()\n", 258 | "\n", 259 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 260 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True)],\n", 261 | " axis='columns')\n", 262 | " glove_time_size = 100\n", 263 | " preds_batched = []\n", 264 | " i = 0\n", 265 | " while gloved_data[i:i+glove_time_size].size:\n", 266 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 267 | " axis=0))[0][:,1])\n", 268 | " i += glove_time_size\n", 269 | "\n", 270 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 271 | " for prob in [p for pred in preds_batched for p in pred]:\n", 272 | " f.write(str(prob) + '\\n')'''\n", 273 | "\n", 274 | " with open('validation.txt', encoding='utf-8') as f:\n", 275 | " s = f.read()\n", 276 | "\n", 277 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 278 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True),\n", 279 | " pd.DataFrame([[w[0].isnumeric()] for w in s.split('\\n') if w]),\n", 280 | " pd.DataFrame([[len(w[0])] for w in s.split('\\n') if w])],\n", 281 | " axis='columns')\n", 282 | " glove_time_size = 100\n", 283 | " preds_batched = []\n", 284 | " i = 0\n", 285 | " while gloved_data[i:i+glove_time_size].size:\n", 286 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 287 | " axis=0))[0][:,1])\n", 288 | " i += glove_time_size\n", 289 | "\n", 290 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 291 | " for prob in [p for pred in preds_batched for p in pred]:\n", 292 | " f.write(str(prob) + '\\n')\n", 293 | "\n", 294 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:\n", 295 | " url = 'https://geo-extract-tester.herokuapp.com/api/score'\n", 296 | " r = requests.post(url, files={'file': f})\n", 297 | " try:\n", 298 | " print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))\n", 299 | " except KeyError:\n", 300 | " raise ValueError('Problem retrieving AUC from API. 
Is your validation set up to date?')\n", 301 | "\n", 302 | "our_auc = OurAUC()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Train on 6271 samples, validate on 330 samples\n", 315 | "Epoch 1/20\n", 316 | "Epoch 00001: val_categorical_accuracy improved from -inf to 0.97697, saving model to ./tmp/weights-01.hdf5\n", 317 | "AUC: 0.92249\n", 318 | " - 60s - loss: 0.1762 - categorical_accuracy: 0.9638 - val_loss: 0.0828 - val_categorical_accuracy: 0.9770\n", 319 | "Epoch 2/20\n", 320 | "Epoch 00002: val_categorical_accuracy improved from 0.97697 to 0.97867, saving model to ./tmp/weights-02.hdf5\n", 321 | "AUC: 0.94973\n", 322 | " - 58s - loss: 0.0841 - categorical_accuracy: 0.9666 - val_loss: 0.0573 - val_categorical_accuracy: 0.9787\n", 323 | "Epoch 3/20\n", 324 | "Epoch 00003: val_categorical_accuracy improved from 0.97867 to 0.98048, saving model to ./tmp/weights-03.hdf5\n", 325 | "AUC: 0.95489\n", 326 | " - 59s - loss: 0.0708 - categorical_accuracy: 0.9734 - val_loss: 0.0526 - val_categorical_accuracy: 0.9805\n", 327 | "Epoch 4/20\n", 328 | "Epoch 00004: val_categorical_accuracy did not improve\n", 329 | "AUC: 0.95848\n", 330 | " - 59s - loss: 0.0666 - categorical_accuracy: 0.9751 - val_loss: 0.0506 - val_categorical_accuracy: 0.9792\n", 331 | "Epoch 5/20\n", 332 | "Epoch 00005: val_categorical_accuracy did not improve\n", 333 | "AUC: 0.95974\n", 334 | " - 61s - loss: 0.0639 - categorical_accuracy: 0.9760 - val_loss: 0.0487 - val_categorical_accuracy: 0.9805\n", 335 | "Epoch 6/20\n", 336 | "Epoch 00006: val_categorical_accuracy did not improve\n", 337 | "AUC: 0.96138\n", 338 | " - 63s - loss: 0.0623 - categorical_accuracy: 0.9765 - val_loss: 0.0483 - val_categorical_accuracy: 0.9804\n", 339 | "Epoch 7/20\n", 340 | "Epoch 00007: val_categorical_accuracy improved from 0.98048 to 0.98109, saving model to ./tmp/weights-07.hdf5\n", 341 | "AUC: 0.95850\n", 342 | " - 61s - loss: 0.0609 - categorical_accuracy: 0.9769 - val_loss: 0.0490 - val_categorical_accuracy: 0.9811\n", 343 | "Epoch 8/20\n", 344 | "Epoch 00008: val_categorical_accuracy did not improve\n", 345 | "AUC: 0.96303\n", 346 | " - 63s - loss: 0.0598 - categorical_accuracy: 0.9772 - val_loss: 0.0462 - val_categorical_accuracy: 0.9807\n", 347 | "Epoch 9/20\n", 348 | "Epoch 00009: val_categorical_accuracy did not improve\n", 349 | "AUC: 0.96292\n", 350 | " - 62s - loss: 0.0589 - categorical_accuracy: 0.9774 - val_loss: 0.0468 - val_categorical_accuracy: 0.9808\n", 351 | "Epoch 10/20\n", 352 | "Epoch 00010: val_categorical_accuracy did not improve\n", 353 | "AUC: 0.96326\n", 354 | " - 59s - loss: 0.0581 - categorical_accuracy: 0.9774 - val_loss: 0.0462 - val_categorical_accuracy: 0.9806\n", 355 | "Epoch 11/20\n", 356 | "Epoch 00011: val_categorical_accuracy did not improve\n", 357 | "AUC: 0.96347\n", 358 | " - 63s - loss: 0.0569 - categorical_accuracy: 0.9778 - val_loss: 0.0456 - val_categorical_accuracy: 0.9800\n", 359 | "Epoch 12/20\n", 360 | "Epoch 00012: val_categorical_accuracy did not improve\n", 361 | "AUC: 0.96203\n", 362 | " - 60s - loss: 0.0563 - categorical_accuracy: 0.9781 - val_loss: 0.0449 - val_categorical_accuracy: 0.9802\n", 363 | "Epoch 13/20\n", 364 | "Epoch 00013: val_categorical_accuracy did not improve\n", 365 | "AUC: 0.96189\n", 366 | " - 61s - loss: 0.0553 - categorical_accuracy: 0.9784 - val_loss: 0.0458 - val_categorical_accuracy: 0.9808\n", 367 | "Epoch 
14/20\n", 368 | "Epoch 00014: val_categorical_accuracy did not improve\n", 369 | "AUC: 0.95982\n", 370 | " - 60s - loss: 0.0544 - categorical_accuracy: 0.9784 - val_loss: 0.0457 - val_categorical_accuracy: 0.9810\n", 371 | "Epoch 15/20\n", 372 | "Epoch 00015: val_categorical_accuracy did not improve\n", 373 | "AUC: 0.96014\n", 374 | " - 64s - loss: 0.0536 - categorical_accuracy: 0.9788 - val_loss: 0.0465 - val_categorical_accuracy: 0.9806\n", 375 | "Epoch 16/20\n", 376 | "Epoch 00016: val_categorical_accuracy did not improve\n", 377 | "AUC: 0.96055\n", 378 | " - 62s - loss: 0.0529 - categorical_accuracy: 0.9790 - val_loss: 0.0462 - val_categorical_accuracy: 0.9808\n", 379 | "Epoch 17/20\n", 380 | "Epoch 00017: val_categorical_accuracy did not improve\n", 381 | "AUC: 0.96207\n", 382 | " - 63s - loss: 0.0522 - categorical_accuracy: 0.9793 - val_loss: 0.0464 - val_categorical_accuracy: 0.9802\n", 383 | "Epoch 18/20\n", 384 | "Epoch 00018: val_categorical_accuracy improved from 0.98109 to 0.98145, saving model to ./tmp/weights-18.hdf5\n", 385 | "AUC: 0.96180\n", 386 | " - 64s - loss: 0.0511 - categorical_accuracy: 0.9798 - val_loss: 0.0459 - val_categorical_accuracy: 0.9815\n", 387 | "Epoch 19/20\n", 388 | "Epoch 00019: val_categorical_accuracy did not improve\n", 389 | "AUC: 0.95842\n", 390 | " - 59s - loss: 0.0508 - categorical_accuracy: 0.9803 - val_loss: 0.0470 - val_categorical_accuracy: 0.9804\n", 391 | "Epoch 20/20\n", 392 | "Epoch 00020: val_categorical_accuracy did not improve\n", 393 | "AUC: 0.95720\n", 394 | " - 61s - loss: 0.0498 - categorical_accuracy: 0.9802 - val_loss: 0.0467 - val_categorical_accuracy: 0.9810\n" 395 | ] 396 | }, 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "" 401 | ] 402 | }, 403 | "execution_count": 13, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "model.fit(x_train, y_train,\n", 410 | " epochs=20,\n", 411 | " validation_data=(x_val, y_val),\n", 412 | " callbacks=[checkpointer, our_auc],\n", 413 | " verbose=2)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [] 433 | } 434 | ], 435 | "metadata": { 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.6.3" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 2 456 | } 457 | -------------------------------------------------------------------------------- /lib/notebooks/geo-string-result-explorations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "import json\n", 11 | "sys.path.append('..')\n", 12 | "\n", 13 | "import tagnews\n", 14 | "import folium" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": 
"stream", 25 | "text": [ 26 | "..\\tagnews\\utils\\load_data.py:185: RuntimeWarning: 1 location strings were not found in the bodytext.\n", 27 | " RuntimeWarning)\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "df = tagnews.load_data()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "crimetags = tagnews.CrimeTags()\n", 42 | "geoextractor = tagnews.GeoCoder()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Article ID: 205277\n", 55 | "south: 6.251640619617706\n", 56 | "Ind.,: 3.4382634669318946\n", 57 | "1800 block of East 222nd Place: 1.1430729818468413\n", 58 | "1700 block of West 220th Place,: 1.5479177721231407\n" 59 | ] 60 | }, 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
" 65 | ], 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "while True:\n", 77 | " random_sample = df.sample(1)\n", 78 | " article_text = random_sample['bodytext'].iloc[0]\n", 79 | " if crimetags.tagtext_proba(article_text).max() < .15:\n", 80 | " continue\n", 81 | " # print(article_text)\n", 82 | " geostrings = [' '.join(gs) for gs in geoextractor.extract_geostrings(article_text, prob_thresh=0.5)]\n", 83 | " geocode_results = tagnews.get_lat_longs_from_geostrings(geostrings)\n", 84 | " lat_longs_raw = geocode_results.lat_longs_raw\n", 85 | " lat_longs_post = geocode_results.lat_longs_post\n", 86 | " \n", 87 | " raw_scores = []\n", 88 | " for gr in geocode_results.full_responses_raw:\n", 89 | " try:\n", 90 | " raw_scores.append(json.loads(gr.response.content)['result'][0]['score'])\n", 91 | " except:\n", 92 | " raw_scores.append(None)\n", 93 | " post_scores = []\n", 94 | " for gr in geocode_results.full_responses_post:\n", 95 | " try:\n", 96 | " post_scores.append(json.loads(gr.response.content)['result'][0]['score'])\n", 97 | " except:\n", 98 | " post_scores.append(None)\n", 99 | "\n", 100 | " if not geostrings:\n", 101 | " continue\n", 102 | " \n", 103 | " print('Article ID: {}'.format(random_sample.index[0]))\n", 104 | "\n", 105 | " m = folium.Map(location=[41.87871, -87.6298])\n", 106 | "\n", 107 | " for geostring, lat_long_raw, lat_long_post, raw_score, post_score in zip(geostrings, lat_longs_raw, lat_longs_post, raw_scores, post_scores):\n", 108 | " if lat_long_raw is None:\n", 109 | " print(' Unable to code raw \"{}\"'.format(geostring))\n", 110 | " else:\n", 111 | " folium.Marker(lat_long_raw, popup=geostring + ' ; RAW ; {}'.format(raw_score)).add_to(m)\n", 112 | " \n", 113 | " if lat_long_post is None:\n", 114 | " print(' Unable to code post-processed \"{}\"'.format(geostring))\n", 115 | " else:\n", 116 | " folium.Marker(lat_long_post, popup=geostring + ' ; POST ; {}'.format(post_score)).add_to(m)\n", 117 | " \n", 118 | " try:\n", 119 | " print('{}: {}'.format(geostring, raw_score / post_score))\n", 120 | " except:\n", 121 | " print('{}: {}'.format(geostring, 'N/A'))\n", 122 | " break\n", 123 | "\n", 124 | "m" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "**CHICAGO (STMW) --** A 23-year-old man who was shot in south suburban Sauk Vill\n", 137 | "age died early Thursday.\n", 138 | "\n", 139 | "Manuel G. Montoya was pronounced dead at 1:19 a.m. at St. Margaret Hospital in\n", 140 | "Dyer, Ind., a Lake County coroner’s office statement said.\n", 141 | "\n", 142 | "He was shot in the 1800 block of East 222nd Place in Sauk Village, but police\n", 143 | "and a representative at the coroner’s office could not say when the shooting\n", 144 | "happened.\n", 145 | "\n", 146 | "He died of a gunshot wound, and his death was ruled a homicide, the coroner’s\n", 147 | "office said.\n", 148 | "\n", 149 | "Montoya lived in the 1700 block of West 220th Place, about half a mile\n", 150 | "northeast of the shooting.\n", 151 | "\n", 152 | "Sauk Village police could not provide further details early Thursday.\n", 153 | "\n", 154 | "_(Source: Sun-Times Media Wire (C) Chicago Sun-Times 2015. All Rights\n", 155 | "Reserved. 
This material may not be published, broadcast, rewritten, or\n", 156 | "redistributed.)_\n", 157 | "\n", 158 | "![][1]\n", 159 | "\n", 160 | " [1]: http://pixel.wp.com/b.gif?host=chicago.cbslocal.com&blog=15116062&post=6\n", 161 | "49158&subd=cbschicago&ref=&feed=1\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "print(article_text)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.6.1" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | --------------------------------------------------------------------------------
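The two notebooks above walk through the project's full pipeline: score an article's crime-type probabilities, pull location strings out of the text with the LSTM geostring extractor, then geocode those strings. Below is a minimal stand-alone sketch of that same flow as a plain script. It is not part of the repository; it assumes the tagnews package plus its trained model files and GloVe vectors are installed locally, the article text is purely illustrative, and the API calls mirror the ones used in geo-string-result-explorations.ipynb (CrimeTags, GeoCoder, extract_geostrings, get_lat_longs_from_geostrings), so exact signatures may differ across versions.

# Sketch only: end-to-end tagnews usage distilled from the notebooks above.
import tagnews

# Illustrative article snippet (any news-article body text would do).
article_text = (
    "A man was shot in the 1800 block of East 222nd Place in Sauk Village "
    "early Thursday, police said."
)

crimetags = tagnews.CrimeTags()     # crime-type classifier
geoextractor = tagnews.GeoCoder()   # LSTM geostring extractor + geocoder helpers

# Highest crime-type probability for the article, as in the notebook's
# `crimetags.tagtext_proba(article_text).max()` filter.
print("max crime-type probability:", crimetags.tagtext_proba(article_text).max())

# extract_geostrings returns lists of tokens the LSTM tagged as location
# strings; join each back into a single string.
geostrings = [
    " ".join(gs)
    for gs in geoextractor.extract_geostrings(article_text, prob_thresh=0.5)
]

# Geocode the extracted strings; lat_longs_post holds the post-processed
# coordinates, with None where geocoding failed.
results = tagnews.get_lat_longs_from_geostrings(geostrings)
for geostring, lat_long in zip(geostrings, results.lat_longs_post):
    print(geostring, "->", lat_long)

Running this prints one coordinate pair (or None) per extracted geostring, which is essentially the information the folium map in geo-string-result-explorations.ipynb visualizes with markers.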