├── .python-version ├── lib ├── tagnews │ ├── geoloc │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── lstm │ │ │ │ ├── __init__.py │ │ │ │ ├── saved │ │ │ │ └── .gitignore │ │ │ │ └── save_model.py │ │ ├── __init__.py │ │ └── tag.py │ ├── crimetype │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── binary_stemmed_logistic │ │ │ │ ├── __init__.py │ │ │ │ └── save_model.py │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── benchmark.py │ │ └── tag.py │ ├── data │ │ ├── ci-data │ │ │ ├── .gitignore │ │ │ ├── newsarticles_category.csv │ │ │ ├── newsarticles_usercoding.csv │ │ │ ├── newsarticles_usercoding_categories.csv │ │ │ ├── newsarticles_trainedcoding.csv │ │ │ └── newsarticles_trainedlocation.csv │ │ ├── .gitignore │ │ └── column_names.txt │ ├── senteval │ │ ├── __init__.py │ │ ├── police_words.py │ │ └── eval.py │ ├── utils │ │ ├── __init__.py │ │ ├── model_helpers.py │ │ ├── quick_map.py │ │ ├── neighborhoods.py │ │ ├── load_vectorizer.py │ │ └── utils.py │ ├── __init__.py │ └── tests │ │ ├── test_crimetype_tag.py │ │ ├── test_load_data.py │ │ └── test_geocoder.py └── notebooks │ ├── extract-geostring-example.ipynb │ ├── keras-glove-testing-api-example.ipynb │ ├── keras-glove-with-street-names-better.ipynb │ └── geo-string-result-explorations.ipynb ├── .pylintrc ├── CODEOWNERS ├── r_models ├── .DS_Store └── qj_models_explore.R ├── .gitignore ├── LICENSE ├── pyproject.toml ├── .travis.yml ├── .github └── workflows │ └── publish.yml ├── README.md └── CONTRIBUTING.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.9 2 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | generated-members=pandas.* 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jherzberg @mchladek @RJWorth -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/.gitignore: -------------------------------------------------------------------------------- 1 | !*.csv 2 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/saved/.gitignore: -------------------------------------------------------------------------------- 1 | *.hdf5 2 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/binary_stemmed_logistic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tagnews/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.tgz 3 | *.bin 4 | glove* 5 | 
-------------------------------------------------------------------------------- /lib/tagnews/geoloc/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tag 2 | 3 | __all__ = [tag] 4 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | from . import eval, police_words 2 | 3 | __all__ = [eval, police_words] 4 | -------------------------------------------------------------------------------- /r_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chicago-justice-project/article-tagging/HEAD/r_models/.DS_Store -------------------------------------------------------------------------------- /lib/tagnews/crimetype/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tag 2 | from . import benchmark 3 | 4 | __all__ = [tag, benchmark] 5 | -------------------------------------------------------------------------------- /lib/tagnews/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import load_data 2 | from . import load_vectorizer 3 | 4 | __all__ = [load_data, load_vectorizer] 5 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/police_words.py: -------------------------------------------------------------------------------- 1 | police_words_list = ["police", "officer", "cop", "officers", "pigs"] 2 | 3 | bins = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1] 4 | num_bins = len(bins) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.ipynb_checkpoints* 3 | *.pkl 4 | lib/tagnews.egg-info 5 | build/ 6 | dist/ 7 | .eggs/ 8 | .cache/ 9 | .DS_Store 10 | .coverage 11 | .pytest_cache* 12 | *.gz 13 | lib/tagnews/data/*.geojson 14 | .vscode* 15 | .idea* 16 | -------------------------------------------------------------------------------- /lib/tagnews/utils/model_helpers.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | 4 | 5 | class LemmaTokenizer(object): 6 | def __init__(self): 7 | self.wnl = WordNetLemmatizer() 8 | 9 | def __call__(self, doc): 10 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 11 | -------------------------------------------------------------------------------- /lib/tagnews/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from . 
import crimetype 3 | 4 | from .crimetype.tag import CrimeTags 5 | from .senteval.eval import SentimentGoogler 6 | from .geoloc.tag import GeoCoder, get_lat_longs_from_geostrings 7 | from .utils.load_data import load_data 8 | from .utils.load_vectorizer import load_glove 9 | 10 | __all__ = [utils, crimetype, CrimeTags, GeoCoder, SentimentGoogler, 11 | get_lat_longs_from_geostrings, load_data, load_glove] 12 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_crimetype_tag.py: -------------------------------------------------------------------------------- 1 | import tagnews 2 | 3 | 4 | class Test_Crimetype(): 5 | @classmethod 6 | def setup_method(cls): 7 | cls.model = tagnews.CrimeTags() 8 | 9 | def test_tagtext(self): 10 | self.model.tagtext('This is example article text') 11 | 12 | def test_tagtext_proba(self): 13 | article = 'Murder afoul, someone has been shot!' 14 | probs = self.model.tagtext_proba(article) 15 | max_prob = probs.max() 16 | max_type = probs.idxmax() 17 | tags = self.model.tagtext(article, 18 | prob_thresh=max_prob-0.001) 19 | assert max_type in tags 20 | -------------------------------------------------------------------------------- /lib/tagnews/utils/quick_map.py: -------------------------------------------------------------------------------- 1 | # EXAMPLE 2 | # https://maps.googleapis.com/maps/api/staticmap?size=400x400&markers=41.8850800,-87.6241350|41.880633,-87.629656&key=KEY 3 | 4 | import webbrowser 5 | 6 | 7 | def generate_api_string(lats_lons, key, size=400): 8 | print('Found {} addresses.'.format(len(lats_lons))) 9 | markers = [] 10 | for addr in lats_lons: 11 | loc = '{},{}'.format(addr[0], addr[1]) 12 | markers.append(loc) 13 | url_markers = '|'.join(markers) 14 | full_str = ('https://maps.googleapis.com/maps/api/staticmap' 15 | '?size={}x{}&markers={}&key={}').format( 16 | size, size, url_markers, key 17 | ) 18 | return full_str 19 | 20 | 21 | def url_open(url): 22 | webbrowser.open_new_tab(url) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Article Tagging Development Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_load_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | import tagnews 7 | 8 | 9 | class Test_LoadData(): 10 | @staticmethod 11 | def setup_method(): 12 | os.makedirs('./tmp/', exist_ok=True) 13 | 14 | @staticmethod 15 | def teardown_method(): 16 | shutil.rmtree('./tmp/', ignore_errors=True) 17 | 18 | def test_load_data(self): 19 | df = tagnews.load_data() 20 | assert df.size 21 | 22 | def test_load_data_nrows(self): 23 | df = tagnews.load_data(nrows=2) 24 | assert df.size 25 | 26 | def test_subsample_and_resave(self): 27 | tagnews.utils.load_data.subsample_and_resave('./tmp/', n=1) 28 | 29 | def test_subsample_and_resave_raises_on_matching_folders(self): 30 | with pytest.raises(RuntimeError): 31 | tagnews.utils.load_data.subsample_and_resave( 32 | './tmp/', input_folder='./tmp/' 33 | ) 34 | 35 | 36 | class Test_LoadGlove(): 37 | def test_load_glove(self): 38 | glove_path = os.path.join( 39 | os.path.dirname(__file__), '..', 'data', 'glove.6B.50d.txt') 40 | glove = tagnews.load_glove(glove_path) 41 | glove.loc['murder'] 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "tagnews" 7 | version = "1.5.0" 8 | description = "automatically tag news articles with justice-related categories and extract location information" 9 | authors = [ 10 | {name = "Kevin Rose"}, 11 | {name = "Josh Herzberg"}, 12 | {name = "Matt Sweeney"}, 13 | ] 14 | readme = "README.md" 15 | requires-python = ">=3.9" 16 | dependencies = [ 17 | "google-cloud-language>=2.17.2", 18 | "h5py>=3.14.0", 19 | "keras>=2.15.0", 20 | "nltk>=3.9.1", 21 | "numpy>=1.26.4", 22 | "pandas>=2.3.2", 23 | "requests>=2.32.5", 24 | "scikit-learn>=1.6.1", 25 | "scipy>=1.13.1", 26 | "shapely>=2.0.7", 27 | "tensorflow==2.15.1", 28 | ] 29 | 30 | [dependency-groups] 31 | dev = [ 32 | "pytest>=8.4.2", 33 | "pytest-cov>=6.2.1", 34 | ] 35 | 36 | [project.urls] 37 | Repository = "https://github.com/chicago-justice-project/article-tagging" 38 | 39 | [tool.setuptools] 40 | package-dir = {"" = "lib"} 41 | packages = [ 42 | "tagnews", 43 | "tagnews.utils", 44 | "tagnews.crimetype", 45 | "tagnews.crimetype.models.binary_stemmed_logistic", 46 | "tagnews.geoloc", 47 | "tagnews.geoloc.models.lstm", 48 | "tagnews.senteval", 49 | "tagnews.tests", 50 | ] 51 | 52 | [tool.setuptools.package-data] 53 | tagnews = [ 54 | "crimetype/models/binary_stemmed_logistic/*.pkl", 55 | "geoloc/models/lstm/saved/*.hdf5", 56 | "data/glove.6B.50d.txt", 57 | "data/Boundaries - Community Areas (current).geojson", 58 | ] 59 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from .tag import CrimeTags 3 | 4 | """ 5 | A command line interface to the automatic article crime taging. 6 | Run with `python -m tagnews.crimetype.cli` 7 | """ 8 | 9 | if __name__ == '__main__': 10 | crimetags = CrimeTags() 11 | 12 | if len(sys.argv) == 1: 13 | print(('Go ahead and start typing.' 14 | '\nIf you are on a UNIX machine, hit ctrl-d when done.' 
15 | '\nIf you are on a Windows machine, hit ctrl-Z and' 16 | ' then Enter when done.')) 17 | s = sys.stdin.read() 18 | preds = crimetags.tagtext_proba(s) 19 | preds = preds.sort_values(ascending=False) 20 | for tag, prob in zip(preds.index, preds.values): 21 | print('{: >5}, {:.9f}'.format(tag, prob)) 22 | else: 23 | if sys.argv[1] in ['-h', '--help']: 24 | h = 'python -m tagnews.crimetype.tag [filename [filename [...]]]\n' 25 | h += '\n' 26 | h += 'If no filenames are provided, read and tag from stdin.\n' 27 | h += '(Use ctrl-d to stop inputting to stdin.)\n' 28 | h += '\n' 29 | h += 'Otherwise, tag all filenames, outputting the tags as a CSV\n' 30 | h += 'to the file .tagged.' 31 | print(h) 32 | quit() 33 | for filename in sys.argv[1:]: 34 | with open(filename) as f_in: 35 | preds = crimetags.tagtext_proba(f_in.read()) 36 | preds = preds.sort_values(ascending=False) 37 | with open(filename + '.tagged', 'w') as f_out: 38 | for tag, prob in zip(preds.index, preds.values): 39 | f_out.write('{: >5}, {:.9f}\n'.format(tag, prob)) 40 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | os: 4 | - linux 5 | 6 | dist: focal 7 | 8 | python: 9 | - "3.8" 10 | - "3.9" 11 | - "3.10" 12 | 13 | env: 14 | - FLAKE8= 15 | - FLAKE8=yes 16 | 17 | jobs: 18 | allow_failures: 19 | - python: "3.8" 20 | env: FLAKE8=yes 21 | 22 | sudo: false 23 | 24 | install: 25 | - pip install numpy 26 | - pip install nltk 27 | - pip install scikit-learn 28 | - pip install pandas 29 | - pip install scipy 30 | - pip install tensorflow 31 | - pip install h5py 32 | - pip install keras 33 | - pip install shapely 34 | - pip install pytest 35 | - pip install pytest-cov 36 | - pip install requests 37 | - pip install google-cloud-language 38 | - | 39 | if [[ $FLAKE8 ]]; then 40 | pip install flake8 41 | else 42 | python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet')" 43 | python -c "import requests;\ 44 | r = requests.get('https://data.cityofchicago.org/api/geospatial/cauq-8yn6?method=export&format=GeoJSON');\ 45 | f = open('Boundaries - Community Areas (current).geojson', 'w');\ 46 | f.write(r.text)" 47 | mv "Boundaries - Community Areas (current).geojson" lib/tagnews/data/ 48 | wget http://nlp.stanford.edu/data/glove.6B.zip --no-check-certificate 49 | python -c "import zipfile; myzip = zipfile.ZipFile('glove.6B.zip'); myzip.extract('glove.6B.50d.txt')" 50 | mv glove.6B.50d.txt lib/tagnews/data/ 51 | rm glove.6B.zip 52 | mv lib/tagnews/data/ci-data/*.csv lib/tagnews/data/ 53 | fi 54 | 55 | before_script: 56 | - cd lib 57 | 58 | script: 59 | - | 60 | if [[ $FLAKE8 ]]; then 61 | flake8 --ignore=E261,E226,E402,W503 62 | else 63 | python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 64 | python -m tagnews.geoloc.models.lstm.save_model 2 65 | python -m pytest --cov-report term-missing --cov=tagnews 66 | fi 67 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | 5 | from ....utils import load_data as ld 6 | from ....utils.model_helpers import LemmaTokenizer 7 | import numpy as np 8 | import sklearn 9 | import sklearn.feature_extraction.text 10 | import sklearn.multiclass 11 | import sklearn.linear_model 12 | import pandas as pd 13 
| 14 | # needed to make pickle-ing work 15 | from nltk import word_tokenize # noqa 16 | from nltk.stem import WordNetLemmatizer # noqa 17 | 18 | np.random.seed(1029384756) 19 | 20 | if len(sys.argv) == 2: 21 | df = ld.load_data(nrows=int(sys.argv[1])) 22 | elif len(sys.argv) == 1: 23 | df = ld.load_data() 24 | else: 25 | raise Exception('BAD ARGUMENTS') 26 | 27 | crime_df = df.loc[df.loc[:, 'OEMC':'TASR'].any(axis=1), :] 28 | crime_df = pd.concat([ 29 | df.loc[df.loc[:, 'OEMC':'TASR'].any(axis=1), :], 30 | df.loc[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()), axis=0) 31 | ], ignore_index=True) 32 | 33 | vectorizer = sklearn.feature_extraction.text.CountVectorizer( 34 | tokenizer=LemmaTokenizer(), 35 | binary=True, 36 | max_features=40000 37 | ) 38 | 39 | clf = sklearn.multiclass.OneVsRestClassifier( 40 | sklearn.linear_model.LogisticRegression(verbose=0) 41 | ) 42 | 43 | X = vectorizer.fit_transform(crime_df['bodytext'].values) 44 | Y = crime_df.loc[:, 'OEMC':'TASR'].values 45 | 46 | clf.fit(X, Y) 47 | 48 | from ...tag import CrimeTags 49 | 50 | crimetags = CrimeTags(clf=clf, vectorizer=vectorizer) 51 | 52 | print(crimetags.tagtext_proba(('This is an article about drugs and' 53 | ' gangs.'))) 54 | 55 | import pickle 56 | 57 | curr_time = time.strftime("%Y%m%d-%H%M%S") 58 | 59 | with open(os.path.join(os.path.split(__file__)[0], 60 | 'model-' + curr_time + '.pkl'), 'wb') as f: 61 | pickle.dump(clf, f) 62 | with open(os.path.join(os.path.split(__file__)[0], 63 | 'vectorizer-' + curr_time + '.pkl'), 'wb') as f: 64 | pickle.dump(vectorizer, f) 65 | -------------------------------------------------------------------------------- /lib/tagnews/utils/neighborhoods.py: -------------------------------------------------------------------------------- 1 | neighborhoods = [ 2 | "Andersonville", 3 | "Archer Heights", 4 | "Ashburn", 5 | "Ashburn Estates", 6 | "Austin", 7 | "Avaondale", 8 | "Belmont Central", 9 | "Beverly", 10 | "Beverly Woods", 11 | "Brainerd", 12 | "Bridgeport", 13 | "Brighton Park", 14 | "Bronceville", 15 | "Bucktown", 16 | "Burnside", 17 | "Calumet Heights", 18 | "Canaryville", 19 | "Clearing", 20 | "Chatham", 21 | "Chinatown", 22 | "Cottage Grove Heights", 23 | "Cragin", 24 | "Dunning", 25 | "East Chicago", 26 | "Edison Park", 27 | "Edgebrook", 28 | "Edgewater", 29 | "Englewood", 30 | "Ford City", 31 | "Gage Park", 32 | "Galewood", 33 | "Garfield Park", 34 | "Garfield Ridge", 35 | "Gold Coast", 36 | "Grand Crossing", 37 | "Gresham", 38 | "Hamilton Park", 39 | "Humboldt Park", 40 | "Hyde Park", 41 | "Jefferson Park", 42 | "Kelvyn Park", 43 | "Kenwood", 44 | "Kilbourn Park", 45 | "Lake Meadows", 46 | "Lakeview", 47 | "Lawndale", 48 | "Lincoln Park", 49 | "Lincoln Square", 50 | "Little Village", 51 | "Logan Square", 52 | "Longwood Manor", 53 | "Loop", 54 | "Marquette Park", 55 | "McKinley Park", 56 | "Midway", 57 | "Morgan Park", 58 | "Montclare", 59 | "Mount Greenwood", 60 | "North Center", 61 | "Norwood Park", 62 | "Old Irving Park", 63 | "Old Town", 64 | "Park Manor", 65 | "Pilsen", 66 | "Princeton Park", 67 | "Portage Park", 68 | "Pullman", 69 | "Ravenswood", 70 | "River North", 71 | "River West", 72 | "Rodgers Park", 73 | "Roscoe VIllage", 74 | "Roseland", 75 | "Sauganash", 76 | "Schorsch Village", 77 | "Scottsdale", 78 | "South Chicago", 79 | "South Deering", 80 | "South Loop", 81 | "South Shore", 82 | "Streeterville", 83 | "Tri-Taylor", 84 | "Ukrainian Village", 85 | "United Center", 86 | "Uptown", 87 | "Vittum Park", 88 | "Washington Heights", 89 | "West 
Elsdon", 90 | "West Loop", 91 | "West Pullman", 92 | "Westlawn", 93 | "Wicker Park", 94 | "Woodlawn", 95 | "Wrigleyville", 96 | "Wrigtwood", 97 | ] 98 | -------------------------------------------------------------------------------- /r_models/qj_models_explore.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | "Models explore" 4 | # Explore results from 5 | # Load result files from loop in quant_justice_models.R 6 | algorithm_summaries <- read.csv("algorith_summaries_031117.csv", stringsAsFactors = F) 7 | ensemble_summaries <- read.csv("ensemble_summaries_031107.csv", stringsAsFactors = F) 8 | algorithm_summaries_detailed <- read.csv("model_performance_measures_031107_cleaned.csv", stringsAsFactors = F) #NOTE: total_positive and total_negative values off for RANDOM FOREST ENSEMBLE 9 | ensemble_summaries_detailed <- read.csv("ensemble_summaries_more_detail_031117_cleaned.csv", stringsAsFactors = F) 10 | 11 | 12 | # Clean 13 | ensemble_summaries_detailed[ensemble_summaries_detailed == 99999] <- NA 14 | algorithm_summaries_detailed[colnames(select(algorithm_summaries_detailed, num_articles_predicted:Recall_for_max_Matt_coef))] <- sapply(algorithm_summaries_detailed[colnames(select(algorithm_summaries_detailed, num_articles_predicted:Recall_for_max_Matt_coef))], as.numeric) 15 | algorithm_summaries_detailed[algorithm_summaries_detailed == 99999.000] <- NA 16 | 17 | 18 | # Example explorations 19 | View(ensemble_summaries_detailed %>% group_by(crime_category) %>% summarise(mean_F_score = mean(F_Score, na.rm = T), 20 | max_F_score = max(F_Score, na.rm = T), 21 | mean_accuracy = mean(Accuracy, na.rm = T), 22 | max_accuracy = max(Accuracy, na.rm = T))) 23 | 24 | View(algorithm_summaries_detailed %>% group_by(model) %>% summarise(mean(AUC, na.rm = T), max(AUC, na.rm = T), 25 | mean(Max_F_score, na.rm = T), max(Max_F_score, na.rm = T), 26 | mean(Max_Accuracy, na.rm = T), max(Max_Accuracy, na.rm = T), 27 | mean(Max_Matt_coef, na.rm = T), max(Max_Matt_coef, na.rm = T))) 28 | 29 | View(algorithm_summaries_detailed %>% group_by(crime_category) %>% summarise(mean(AUC, na.rm = T), max(AUC, na.rm = T), 30 | mean(Max_F_score, na.rm = T), max(Max_F_score, na.rm = T), 31 | mean(Max_Accuracy, na.rm = T), max(Max_Accuracy, na.rm = T), 32 | mean(Max_Matt_coef, na.rm = T), max(Max_Matt_coef, na.rm = T))) -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_category.csv: -------------------------------------------------------------------------------- 1 | 15,Juvenile,JUVE,2011-08-26 20:22:06.828537+00,t,other 2 | 16,Re-Entry,REEN,2011-08-26 20:22:20.557875+00,t,other 3 | 17,Violence,VIOL,2011-08-26 20:22:33.659219+00,t,other 4 | 19,Probation,PROB,2011-08-26 20:23:14.245505+00,t,other 5 | 20,Parole,PARL,2011-08-26 20:23:20.964542+00,t,other 6 | 21,Criminal Justice Policy,CPLY,2011-08-26 20:23:35.241574+00,t,other 7 | 29,Immigration,IMMG,2011-09-06 20:55:25.596941+00,t,other 8 | 31,Unspecified Crime,UNSPC,2011-10-07 18:55:52.911149+00,t,other 9 | 33,Arson,ARSN,2012-03-27 21:22:12.636112+00,t,crimes 10 | 34,Burlgary,BURG,2012-03-27 21:22:24.563428+00,t,crimes 11 | 4,Cook County Circuit Court,CCCC,2011-08-26 20:17:32.200297+00,t,orgs 12 | 5,Cook County Jail,CCJ,2011-08-26 20:17:45.062532+00,t,orgs 13 | 9,Domestic Violence,DOMV,2011-08-26 20:19:47.881876+00,t,crimes 14 | 35,Driving Under the Influence,DUI,2012-03-27 21:22:44.588387+00,t,crimes 15 | 22,Drugs,DRUG,2011-08-26 
20:23:42.081659+00,t,crimes 16 | 30,Environmental Crimes,ENVI,2011-09-07 22:06:48.627019+00,t,crimes 17 | 36,Fraud,FRUD,2012-03-27 21:22:56.963232+00,t,crimes 18 | 24,Gangs,GANG,2011-08-26 20:23:59.845203+00,t,crimes 19 | 14,GLBTQ,GLBTQ,2011-08-26 20:21:54.769447+00,t,crimes 20 | 13,Gun Violence,GUNV,2011-08-26 20:21:24.513693+00,t,crimes 21 | 26,Homicides,HOMI,2011-08-26 20:24:23.339118+00,t,crimes 22 | 37,Robbery,ROBB,2012-03-27 21:23:04.531403+00,t,crimes 23 | 10,Sexual Assault,SEXA,2011-08-26 20:20:38.071264+00,t,crimes 24 | 28,Chicago Police Board,CPBD,2011-09-06 20:54:59.505925+00,t,orgs 25 | 2,Chicago Police Department,CPD,2011-08-26 20:16:27.480709+00,t,orgs 26 | 23,Chicago Public Schools,CPS,2011-08-26 20:23:52.5828+00,t,orgs 27 | 6,Cook County Sheriff's Police,CCSP,2011-08-26 20:18:03.825616+00,t,orgs 28 | 7,Cook County Public Defender's Office,CPUB,2011-08-26 20:18:52.323487+00,t,orgs 29 | 8,Illinois Department of Corrections,IDOC,2011-08-26 20:19:31.308972+00,t,orgs 30 | 3,Cook County State's Attorney's Office,SAO,2011-08-26 20:16:53.005243+00,t,orgs 31 | 11,Police Brutality,POLB,2011-08-26 20:20:53.565396+00,t,policing 32 | 32,Illinois State Court,ILSC,2011-10-07 18:56:14.66409+00,t,orgs 33 | 25,Illinois State Police,ILSP,2011-08-26 20:24:11.965487+00,t,orgs 34 | 27,Independent Police Review Authority,IPRA,2011-09-06 20:53:58.289631+00,t,orgs 35 | 1,Office of Emergency Management & Communications,OEMC,2011-08-25 15:49:06.569879+00,t,orgs 36 | 12,Police Misconduct,POLM,2011-08-26 20:21:04.201743+00,t,policing 37 | 38,Police Use of Taser,TASR,2012-03-27 21:23:26.656998+00,t,policing 38 | 18,Beat Realignment / Police Resouce Allocation,BEAT,2011-08-26 20:22:51.626161+00,t,policing 39 | 39,Civilian Office for Police Accountability,COPA,2017-05-12 19:02:36.800007+00,t,orgs 40 | 40,Deputy Inspector General for Police,DIGP,2017-05-12 19:02:54.114808+00,t,orgs 41 | -------------------------------------------------------------------------------- /lib/tagnews/utils/load_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sklearn.preprocessing 4 | 5 | 6 | def load_glove(vectors_file, normalize=False): 7 | """ 8 | Load a GloVe formatted file, which is simply of the format 9 | 10 | ... 11 | ... 12 | ... 13 | See https://github.com/stanfordnlp/GloVe for more information. 14 | That link also has information on how to download the pre-trained 15 | word vectorizer models. If the file you download is compressed, 16 | you will need to uncompress it before using this function. 17 | 18 | Note that the loading speed and memory usage is highly depdendent 19 | on what model you use. The downloadable model "glove.840B.300d.txt" 20 | will take a few minutes to load and use 2.8 GB of memory, whereas the 21 | model "glove.6B.50d.txt" will take a few seconds and use < 200 MB 22 | of memory. 23 | 24 | Sample usage: 25 | 26 | >>> vectors = load_glove('tagnews/geoloc/glove.6B.50d.txt') 27 | >>> text = 'This is a sentence and stuff.' 28 | >>> # you should use an actual tokenizer for this step. 29 | >>> vectorized_text = vectors.loc[[word.lower() 30 | ... for word in text.split()]] 31 | >>> print(vectorized_text.shape) 32 | (6, 300) 33 | >>> k = 5 34 | >>> import numpy as np 35 | >>> def euc(word): 36 | ... return np.sum((vectors.values-vectors.loc[word].values)**2.0, 1) 37 | ... 
38 | >>> vectors.index[np.argpartition(euc('murder'), range(k))[:k]] 39 | 40 | Inputs: 41 | vectors_file: path to file that contains GloVe formatted word 42 | vectors. 43 | normalize: Should the word vectors be normalized? See 44 | https://stats.stackexchange.com/questions/177905/ for 45 | a good discussion on the topic. 46 | 47 | Retuns: 48 | vectors: NxM pandas dataframe whose rows are indexed by the word. 49 | """ 50 | 51 | with open(vectors_file, 'r', encoding='utf-8') as f: 52 | for vocab_size, line in enumerate(f): 53 | pass 54 | vocab_size += 1 55 | 56 | vec_size = len(line.split(' ')) - 1 57 | vectors = np.zeros((vocab_size, vec_size), dtype=np.float32) 58 | words = np.empty(shape=(vocab_size), dtype=np.dtype('object')) 59 | 60 | with open(vectors_file, 'r', encoding='utf-8') as f: 61 | for i, line in enumerate(f): 62 | line = line.split(' ') 63 | words[i] = line[0] 64 | vectors[i] = [float(x) for x in line[1:]] 65 | 66 | vectors = pd.DataFrame(vectors, index=words, copy=False) 67 | vectors = vectors.loc[~vectors.index.duplicated()] 68 | 69 | if normalize: 70 | sklearn.preprocessing.normalize(vectors, copy=False) 71 | 72 | return vectors 73 | -------------------------------------------------------------------------------- /lib/tagnews/senteval/eval.py: -------------------------------------------------------------------------------- 1 | from google.cloud import language_v2 2 | 3 | from tagnews.senteval.police_words import police_words_list, bins 4 | 5 | 6 | # def process_google_result(text): 7 | # document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT) 8 | # sentiment = client.analyze_entity_sentiment(document=document) 9 | # 10 | # for entity in sentiment.entities: 11 | # clean_entity = "".join(filter(str.isalpha, entity)).lower() 12 | # 13 | # if clean_entity in police_words_list: 14 | # 15 | # for mention in entity.mentions: 16 | # return mention.sentiment.score 17 | 18 | 19 | class SentimentGoogler: 20 | def __init__(self): 21 | self.client = self.connect_to_client() 22 | self.police_words = police_words_list 23 | self.bins = bins[::-1] # reversed because we start with lower numbered bins 24 | self.num_bins = len(bins) 25 | 26 | def run(self, doc_text): 27 | sentiment_ = self.call_api(doc_text) 28 | for entity in sentiment_.entities: 29 | police_entity = self.is_police_entity(entity) 30 | if police_entity: 31 | return self.sentiment_from_entity(police_entity) 32 | 33 | def connect_to_client(self): 34 | return language_v2.LanguageServiceClient() 35 | 36 | def sentiment_from_entity(self, entity): 37 | return entity.sentiment.score 38 | 39 | def call_api(self, doc_text): 40 | """ 41 | Parameters 42 | ---------- 43 | doc_text : str 44 | article text 45 | 46 | Returns 47 | ------- 48 | sentiment : json 49 | google response call 50 | """ 51 | document = language_v2.Document(content=doc_text, type_=language_v2.Document.Type.PLAIN_TEXT) 52 | sentiment = self.client.analyze_entity_sentiment(document=document) 53 | 54 | return sentiment 55 | 56 | def is_police_entity(self, entity): 57 | if entity in self.police_words: 58 | return entity 59 | for mention in entity.mentions: 60 | if pre_process_text(mention.text.content) in self.police_words: 61 | return entity 62 | return False 63 | 64 | def extract_google_priority_bin(self, article:str, cpd_model_val=1, cpd_val=1): 65 | cop_word_counts = sum([article.count(substr) for substr in self.police_words]) 66 | score = 0.5 * cpd_val + 0.25 * cpd_model_val + 0.25 * min(cop_word_counts / (2 * len(self.police_words)), 1.) 
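# self.bins is stored in descending order, so this list comprehension keeps the smallest bin edge that is still >= score; higher bin indices therefore correspond to lower combined scores.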
67 | bin = [bin for bin, bin_max_val in enumerate(self.bins) if bin_max_val >= score][-1] 68 | return bin 69 | 70 | 71 | def pre_process_text(html_text): 72 | """ 73 | Parameters 74 | ---------- 75 | html_text : str 76 | Article text. 77 | 78 | Returns 79 | ------- 80 | words: str 81 | lower case, just letters 82 | """ 83 | words = "".join(filter(str.isalpha, html_text)).lower() 84 | return words 85 | -------------------------------------------------------------------------------- /lib/tagnews/tests/test_geocoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import tagnews 5 | 6 | 7 | class Test_GeoCoder: 8 | @classmethod 9 | def setup_class(cls): 10 | cls.model = tagnews.GeoCoder() 11 | 12 | def test_extract_geostrings(self): 13 | self.model.extract_geostrings( 14 | ( 15 | "This is example article text with a location of" 16 | " 55th and Woodlawn where something happened." 17 | ) 18 | ) 19 | 20 | def test_extract_geostring_probs(self): 21 | article = ( 22 | "This is example article text with a location of" 23 | " 55th and Woodlawn where something happened." 24 | ) 25 | words, probs = self.model.extract_geostring_probs(article) 26 | max_prob = probs.max() 27 | max_word = words[np.argmax(probs)] 28 | geostrings = self.model.extract_geostrings( 29 | article, prob_thresh=max_prob - 0.001 30 | ) 31 | assert max_word in [word for geostring in geostrings for word in geostring][0] 32 | 33 | def test_extract_geostring_probs_word_not_in_glove(self): 34 | """ 35 | Regression test for issue #105. 36 | """ 37 | article = "___1234567890nonexistent0987654321___" 38 | words, probs = self.model.extract_geostring_probs(article) 39 | 40 | def test_lat_longs_from_geostring_lists(self): 41 | geostring_lists = [ 42 | ["5500", "S", "Woodlawn"], 43 | ["100", "N.", "Wacker"], 44 | ["thigh"], 45 | ] 46 | coords, scores = self.model.lat_longs_from_geostring_lists( 47 | geostring_lists, sleep_secs=0.0 48 | ) 49 | 50 | assert coords.shape[0] == len(geostring_lists) == len(scores) 51 | 52 | def test_community_areas(self): 53 | # Approximately 55th and Woodlawn, which is in Hyde Park. 
54 | coords = pd.DataFrame([[41.793465, -87.596930]], columns=["lat", "long"]) 55 | com_area = self.model.community_area_from_coords(coords) 56 | assert com_area == ["HYDE PARK"] 57 | 58 | def test_best_geostring(self): 59 | """Verify that the best_geostring function returns expected values""" 60 | # Example from the readme 61 | input1 = ( 62 | [ 63 | ["1700", "block", "of", "S.", "Halsted", "Ave."], 64 | ["55th", "and", "Woodlawn,"], 65 | ], 66 | [ 67 | np.array( 68 | [ 69 | 0.71738559, 70 | 0.81395197, 71 | 0.82227415, 72 | 0.79400611, 73 | 0.70529455, 74 | 0.60538059, 75 | ] 76 | ), 77 | np.array( 78 | [ 79 | 0.79358339, 80 | 0.69696939, 81 | 0.68011874 82 | ] 83 | ), 84 | ], 85 | ) 86 | output1 = ["1700", "block", "of", "S.", "Halsted", "Ave."] 87 | # Empty geostring example 88 | input2, output2 = [(), ()], '' 89 | for inpt, expected_output in zip([input1, input2], [output1, output2]): 90 | actual_output = self.model.best_geostring(inpt) 91 | assert ( 92 | actual_output == expected_output 93 | ), "ERROR: expected output != actual output for input {}/n {} != {}".format( 94 | inpt, actual_output, expected_output 95 | ) 96 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | # Allow manual triggers from any branch 5 | workflow_dispatch: 6 | inputs: 7 | environment: 8 | description: 'Choose environment to deploy to' 9 | required: true 10 | default: 'testpypi' 11 | type: choice 12 | options: 13 | - testpypi 14 | - pypi 15 | 16 | # Automatically trigger on new releases 17 | release: 18 | types: [published] 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | environment: ${{ github.event.inputs.environment || 'pypi' }} 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version-file: ".python-version" 32 | 33 | - name: Install uv 34 | uses: astral-sh/setup-uv@v6 35 | with: 36 | version: "0.8.9" 37 | 38 | - name: Install build/package dependencies 39 | run: uv sync --locked --all-extras --dev 40 | 41 | - name: Download required data files 42 | run: | 43 | # Download NLTK data 44 | uv run python -c "import nltk; nltk.download('punkt_tab', '.venv/nltk_data'); nltk.download('wordnet', '.venv/nltk_data')" 45 | 46 | # Download geographic data 47 | curl "https://data.cityofchicago.org/api/geospatial/igwz-8jzy?method=export&format=GeoJSON" -o "lib/tagnews/data/Boundaries - Community Areas (current).geojson" 48 | 49 | # Download and extract GloVe 50 | curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip 51 | unzip glove.6B.zip glove.6B.50d.txt -d lib/tagnews/data 52 | rm glove.6B.zip 53 | 54 | # Move test data 55 | mv lib/tagnews/data/ci-data/*.csv lib/tagnews/data/ 56 | 57 | - name: Train and save models 58 | run: | 59 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 60 | uv run python -m tagnews.geoloc.models.lstm.save_model 2 61 | 62 | - name: Build package 63 | run: uv build 64 | 65 | - name: Store the distribution packages 66 | uses: actions/upload-artifact@v4 67 | with: 68 | name: python-package-distributions 69 | path: dist/ 70 | 71 | publish-to-pypi: 72 | name: Publish to PyPI 73 | if: ${{ github.event.inputs.environment == 'pypi' }} 74 | needs: 75 | - build 76 | runs-on: ubuntu-latest 77 | 78 | environment: 79 | name: pypi 80 | url: https://pypi.org/p/tagnews 81 | 82 | 
permissions: 83 | id-token: write 84 | 85 | steps: 86 | - name: Download all the dists 87 | uses: actions/download-artifact@v4 88 | with: 89 | name: python-package-distributions 90 | path: dist/ 91 | - name: Publish distribution to PyPI 92 | uses: pypa/gh-action-pypi-publish@release/v1 93 | 94 | publish-to-testpypi: 95 | name: Publish to Test PyPI 96 | if: ${{ github.event.inputs.environment == 'testpypi' }} 97 | needs: 98 | - build 99 | runs-on: ubuntu-latest 100 | 101 | environment: 102 | name: testpypi 103 | url: https://test.pypi.org/p/tagnews 104 | 105 | permissions: 106 | id-token: write 107 | 108 | steps: 109 | - name: Download all the dists 110 | uses: actions/download-artifact@v4 111 | with: 112 | name: python-package-distributions 113 | path: dist/ 114 | - name: Publish distribution to TestPyPI 115 | uses: pypa/gh-action-pypi-publish@release/v1 116 | with: 117 | repository-url: https://test.pypi.org/legacy/ 118 | verbose: true 119 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/benchmark.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def get_kfold_split(N, k=4): 8 | """ 9 | Create groups used for k-fold cross validation. 10 | 11 | Parameters 12 | ---------- 13 | N : number of samples to split 14 | k : number of groups used for cross validation 15 | 16 | Returns 17 | ------- 18 | List of (index_train, index_test) pairs 19 | """ 20 | np.random.seed(2017) 21 | idx = np.random.permutation(N) 22 | index_pairs = [(np.ones(N).astype(bool), 23 | np.zeros(N).astype(bool)) 24 | for _ in range(k)] 25 | 26 | for i, fold_idx in enumerate(np.array_split(idx, k)): 27 | index_pairs[i][0][fold_idx] = 0 28 | index_pairs[i][1][fold_idx] = 1 29 | 30 | return index_pairs 31 | 32 | 33 | def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4, verbose=False): 34 | """ 35 | Benchmark a classifier on preprocessed data. 36 | 37 | Parameters 38 | ---------- 39 | clf_factory : 40 | Function which returns a classifier. Classifiers implement 41 | a `fit` method and a `predict_proba` method. The parameters 42 | in clf_params_dict will be passed to clf_factory. 43 | X : NxM matrix of features 44 | Y : NxL matrix of binary values. Y[i,j] indicates whether or 45 | not the j'th tag applies to the i'th article. 46 | clf_params_dict : 47 | dictionary of parameters passed to the classifier factory. 48 | If None, no parameters are passed. 49 | k : how many folds to use for cross validation 50 | verbose : Should status be printed?
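Returns ------- dict with keys 'acc' (per-fold accuracy), 'tpr', 'fpr', and 'ppv' (per-fold, per-tag true positive rate, false positive rate, and positive predictive value), and 'clfs' (the classifier fitted on each fold).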
51 | """ 52 | if clf_params_dict is None: 53 | clf_params_dict = {} 54 | 55 | L = Y.shape[1] 56 | 57 | fold_indexes = get_kfold_split(X.shape[0], k) 58 | acc = np.zeros(k) 59 | tpr = np.zeros((k, L)) 60 | fpr = np.zeros((k, L)) 61 | ppv = np.zeros((k, L)) 62 | 63 | clfs = [] 64 | for i, (idx_trn, idx_tst) in enumerate(fold_indexes): 65 | if verbose: 66 | print('step {} of {}...'.format(i, k), end='') 67 | 68 | clf = clf_factory(**clf_params_dict) 69 | 70 | x_trn = X[idx_trn, :] 71 | y_trn = Y[idx_trn, :] 72 | 73 | x_tst = X[idx_tst, :] 74 | y_tst = Y[idx_tst, :] 75 | 76 | clf.fit(x_trn, y_trn) 77 | y_hat = clf.predict_proba(x_tst) 78 | y_hat = y_hat > 0.5 79 | 80 | y_hat.dtype = np.int8 81 | y_tst.dtype = np.int8 82 | 83 | acc[i] = (np.sum(y_tst == y_hat)) / float(y_tst.size) 84 | for j in range(L): 85 | tpr[i, j] = np.sum(y_tst[:, j] & y_hat[:, j]) / np.sum(y_tst[:, j]) 86 | fpr[i, j] = (np.sum(np.logical_not(y_tst[:, j]) & y_hat[:, j]) 87 | / np.sum(np.logical_not(y_tst[:, j]))) 88 | ppv[i, j] = np.sum(y_tst[:, j] & y_hat[:, j]) / np.sum(y_hat[:, j]) 89 | 90 | clfs.append(clf) 91 | 92 | if verbose: 93 | print('done') 94 | 95 | return {'acc': acc, 'tpr': tpr, 'fpr': fpr, 'ppv': ppv, 'clfs': clfs} 96 | 97 | 98 | def predict_articles(clf, vectorizer, df, n=100, seed=1029384756): 99 | np.random.seed(seed) 100 | 101 | pd.set_option('display.max_columns', 100) 102 | pd.set_option('display.float_format', lambda x: '%.6f' % x) 103 | 104 | random_subset = np.random.choice(np.arange(df.shape[0]), 105 | size=n, 106 | replace=False) 107 | 108 | preds = clf.predict_proba(vectorizer.transform( 109 | df.iloc[random_subset, 3].values 110 | )) 111 | preds = pd.DataFrame(preds) 112 | preds.columns = df.loc[:, 'OEMC':'TASR'].columns 113 | 114 | for i, rand_i in enumerate(random_subset): 115 | s = 'Article ID: ' + str(df.index[rand_i]) 116 | s += '\n' + df.iloc[rand_i, 3] 117 | s += '\n Predicted Tags: ' 118 | s += str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values) 119 | s += '\n' + str(preds.iloc[i, :]) 120 | s += '\n' 121 | filename = 'test-tag-' + str(df.index[rand_i]) + '.txt' 122 | with open(filename, 'w', encoding='utf-8') as f: 123 | f.write(s) 124 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/models/lstm/save_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | os.chdir(os.path.split(__file__)[0]) 5 | 6 | import glob 7 | saved_files = glob.glob('saved/weights*.hdf5') 8 | if saved_files: 9 | delete = input(('This will delete existing saved weight' 10 | ' files, proceed? [y/n] ')) 11 | while delete not in ['y', 'n']: 12 | delete = input(('This will delete existing saved weight' 13 | ' files, proceed? [y/n] ')) 14 | if delete == 'y': 15 | for f in saved_files: 16 | os.remove(f) 17 | else: 18 | print('Exiting.') 19 | exit() 20 | 21 | from .... 
import utils 22 | import pandas as pd 23 | from keras.models import Sequential 24 | from keras.layers import LSTM, Dense, TimeDistributed 25 | from keras.utils import to_categorical 26 | from keras.callbacks import ModelCheckpoint 27 | import numpy as np 28 | import json 29 | import requests 30 | import keras 31 | 32 | if len(sys.argv) == 1: 33 | num_epochs = 20 34 | else: 35 | num_epochs = int(sys.argv[1]) 36 | 37 | glove = utils.load_vectorizer.load_glove('../../../data/glove.6B.50d.txt') 38 | # ner = utils.load_data.load_ner_data('../../../data/') 39 | 40 | with open('training.txt', encoding='utf-8') as f: 41 | training_data = f.read() 42 | 43 | training_df = pd.DataFrame([x.split() for x in training_data.split('\n') if x], 44 | columns=['word', 'tag']) 45 | training_df.iloc[:, 1] = training_df.iloc[:, 1].apply(int) 46 | training_df['all_tags'] = 'NA' 47 | 48 | ner = training_df # pd.concat([training_df, ner]).reset_index(drop=True) 49 | ner = ner[['word', 'all_tags', 'tag']] 50 | 51 | ner = pd.concat([ner, 52 | pd.DataFrame(ner['word'].str[0].str.isupper().values), 53 | pd.DataFrame(glove.reindex(ner['word'].str.lower()).values)], 54 | axis='columns') 55 | ner.fillna(value=0.0, inplace=True) 56 | 57 | data_dim = 51 58 | timesteps = 25 # only during training, testing can take arbitrary length. 59 | num_classes = 2 60 | 61 | train_val_split = int(19 * ner.shape[0] / 20.) 62 | 63 | ner_train_idxs = range(0, train_val_split - timesteps, timesteps) 64 | x_train = np.asarray([ner.iloc[i:i+timesteps, 3:].values 65 | for i in ner_train_idxs]).astype(np.float32) 66 | y_train = np.asarray([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2) 67 | for i in ner_train_idxs]).astype(np.float32) 68 | 69 | ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps) 70 | x_val = np.asarray([ner.iloc[i:i+timesteps, 3:].values 71 | for i in ner_val_idxs]).astype(np.float32) 72 | y_val = np.asarray([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2) 73 | for i in ner_val_idxs]).astype(np.float32) 74 | 75 | model = Sequential() 76 | model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim))) 77 | model.add(LSTM(8, return_sequences=True)) 78 | model.add(TimeDistributed(Dense(2, activation='softmax'))) 79 | model.compile(loss='categorical_crossentropy', 80 | optimizer='adam', 81 | metrics=['categorical_accuracy']) 82 | print(model.summary(100)) 83 | 84 | checkpointer = ModelCheckpoint(filepath='./saved/weights-{epoch:02d}.hdf5', 85 | monitor='val_categorical_accuracy', 86 | mode='max', 87 | verbose=1, 88 | save_best_only=True) 89 | 90 | with open('validation.txt', encoding='utf-8') as f: 91 | s = f.read() 92 | val_words = [w for w in s.split('\n') if w] 93 | 94 | gloved_data = pd.concat( 95 | [pd.DataFrame([[w[0].isupper()] for w in val_words]), 96 | glove.reindex([w for w in val_words]).fillna(0).reset_index(drop=True)], 97 | axis='columns' 98 | ) 99 | 100 | 101 | class OurAUC(keras.callbacks.Callback): 102 | def on_epoch_end(self, epoch, logs={}): 103 | # Go to https://geo-extract-tester.herokuapp.com/ and download 104 | # the validation data (validation.txt). 
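# Predict geostring probabilities for the validation words in chunks of glove_time_size rows, write the per-word probabilities to a guesses file, and upload it to the geo-extract-tester scoring API to get an AUC for this epoch.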
105 | 106 | glove_time_size = 100 107 | preds_batched = [] 108 | i = 0 109 | while gloved_data[i:i+glove_time_size].size: 110 | preds_batched.append( 111 | model.predict(np.asarray(np.expand_dims(gloved_data[i:i+glove_time_size], 112 | axis=0)).astype(np.float32))[0][:, 1] 113 | ) 114 | i += glove_time_size 115 | 116 | with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f: 117 | for prob in [p for pred in preds_batched for p in pred]: 118 | f.write(str(prob) + '\n') 119 | 120 | with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f: 121 | url = 'https://geo-extract-tester.herokuapp.com/api/score' 122 | r = requests.post(url, files={'file': f}) 123 | r = json.loads(r.text) 124 | auc = r['auc'] 125 | print('AUC: {:.5f}, high score? {}'.format(auc, r['high_score'])) 126 | 127 | os.remove('guesses-{epoch:02d}.txt'.format(epoch=epoch)) 128 | logs['val_auc'] = auc 129 | 130 | 131 | #our_auc = OurAUC() 132 | 133 | model.fit(x_train, y_train, 134 | epochs=num_epochs, 135 | validation_data=(x_val, y_val), 136 | callbacks=[checkpointer], 137 | verbose=2) 138 | 139 | idx = slice(501, 550) 140 | pd.set_option('display.width', 200) 141 | df_to_print = pd.DataFrame( 142 | model.predict(np.asarray(np.expand_dims(ner.iloc[idx, 3:].values, axis=0)).astype(np.float32))[0][:, 1:], 143 | columns=['prob_geloc'] 144 | ) 145 | print(pd.concat([ner.iloc[idx, :3].reset_index(drop=True), df_to_print], 146 | axis='columns')) 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/chicago-justice-project/article-tagging.svg?branch=master)](https://travis-ci.org/chicago-justice-project/article-tagging) 2 | 3 | # tagnews 4 | 5 | `tagnews` is a Python library that can 6 | 7 | * Automatically categorize the text from news articles with type-of-crime tags, e.g. homicide, arson, gun violence, etc. 8 | * Automatically extract the locations discussed in the news article text, e.g. "55th and Woodlawn" and "1700 block of S. Halsted". 9 | * Retrieve the latitude/longitude pairs for said locations using an instance of the Pelias geocoder hosted by CJP. 10 | * Get the community areas those lat/long pairs belong to using a shapefile downloaded from the city data portal and parsed by the `shapely` Python library. 11 | 12 | Sound interesting? There's example usage below! 13 | 14 | You can find the source code on [GitHub](https://github.com/chicago-justice-project/article-tagging). 15 | 16 | ## Installation 17 | 18 | You can install `tagnews` with pip: 19 | 20 | ``` 21 | pip install tagnews 22 | ``` 23 | 24 | **NOTE:** You will need to install some [NLTK](http://www.nltk.org/) packages as well: 25 | 26 | ```python 27 | >>> import nltk 28 | >>> nltk.download('punkt_tab') 29 | >>> nltk.download('wordnet') 30 | ``` 31 | 32 | Beware: `tagnews` requires Python >= 3.9. 33 | 34 | ## Example 35 | 36 | The main classes are `tagnews.CrimeTags` and `tagnews.GeoCoder`. 37 | 38 | ```python 39 | >>> import tagnews 40 | >>> crimetags = tagnews.CrimeTags() 41 | >>> article_text = ('The homicide occurred at the 1700 block of S. Halsted Ave.' 42 | ... ' It happened just after midnight. Another person was killed at the' 43 | ... ' intersection of 55th and Woodlawn, where a lone gunman') 44 | >>> crimetags.tagtext_proba(article_text) 45 | HOMI 0.739159 46 | VIOL 0.146943 47 | GUNV 0.134798 48 | ...
49 | >>> crimetags.tagtext(article_text, prob_thresh=0.5) 50 | ['HOMI'] 51 | >>> geoextractor = tagnews.GeoCoder() 52 | >>> prob_out = geoextractor.extract_geostring_probs(article_text) 53 | >>> list(zip(*prob_out)) 54 | [..., ('at', 0.0044685714), ('the', 0.005466637), ('1700', 0.7173856), 55 | ('block', 0.81395197), ('of', 0.82227415), ('S.', 0.7940061), 56 | ('Halsted', 0.70529455), ('Ave.', 0.60538065), ...] 57 | >>> geostrings = geoextractor.extract_geostrings(article_text, prob_thresh=0.5) 58 | >>> geostrings 59 | [['1700', 'block', 'of', 'S.', 'Halsted', 'Ave.'], ['55th', 'and', 'Woodlawn,']] 60 | >>> coords, scores = geoextractor.lat_longs_from_geostring_lists(geostrings) 61 | >>> coords 62 | lat long 63 | 0 41.859021 -87.646934 64 | 1 41.794816 -87.597422 65 | >>> scores # confidence in the lat/longs as returned by Pelias, higher is better 66 | array([0.878, 1. ]) 67 | >>> geoextractor.community_area_from_coords(coords) 68 | ['LOWER WEST SIDE', 'HYDE PARK'] 69 | ``` 70 | 71 | ## Limitations 72 | 73 | This project uses Machine Learning to automate data cleaning/preparation tasks that would be cost- and time-prohibitive to perform manually. Like all Machine Learning projects, *the results are not perfect, and in some cases may look just plain bad*. 74 | 75 | We strove to build the best models we could, but perfect accuracy is rarely attainable. If you have thoughts on how to do better, please consider [reporting an issue](https://github.com/chicago-justice-project/article-tagging/issues/new), or better yet [contributing](https://github.com/chicago-justice-project/article-tagging/blob/master/CONTRIBUTING.md). 76 | 77 | ## How can I contribute? 78 | 79 | Great question! Please see [CONTRIBUTING.md](https://github.com/chicago-justice-project/article-tagging/blob/master/CONTRIBUTING.md). 80 | 81 | ## Problems? 82 | 83 | If you have problems, please [report an issue](https://github.com/chicago-justice-project/article-tagging/issues/new). Anything that is behaving unexpectedly is an issue and should be reported. If you are getting bad or unexpected results, that is also an issue and should be reported. We may not be able to do anything about it, but more data rarely degrades performance. 84 | 85 | ## Background 86 | 87 | We want to compare how often different types of crime are reported in certain areas versus how often those crimes actually occur in those areas. In essence, *are some crimes under-represented in certain areas but over-represented in others?* This is the main question driving the analysis. 88 | 89 | This question came from the [Chicago Justice Project](http://chicagojustice.org/). They have been interested in answering this question for quite a while, and have been collecting the data necessary to have a data-backed answer. Their efforts include 90 | 91 | 1. Scraping RSS feeds of articles written by Chicago area news outlets for several years, allowing them to collect almost half a million articles. 92 | 2. Organizing an amazing group of [volunteers](http://chicagojustice.org/volunteer-for-cjp/) who have helped them tag these articles with crime categories like "Gun Violence" and "Drugs", as well as organizations such as "Cook County State's Attorney's Office", "Illinois State Police", "Chicago Police Department", and other miscellaneous categories such as "LGBTQ" and "Immigration". 93 | 3. The web UI used to do this tagging was also recently updated to allow highlighting of geographic information, resulting in several hundred articles with labeled location sub-strings.
94 | 95 | Most of the code for those components can be found [here](https://github.com/chicago-justice-project/chicago-justice). 96 | 97 | A group actively working on this project meets every Tuesday at [Chi Hack Night](https://chihacknight.org/). 98 | 99 | ## See Also 100 | 101 | * [Chicago Justice Project](http://chicagojustice.org/) 102 | * [Source code of other CJP projects](https://github.com/chicago-justice-project) 103 | * [... including the database/web scraping side of things](https://github.com/chicago-justice-project/chicago-justice) 104 | * [What is Chi Hack Night?](https://chihacknight.org/about.html) 105 | -------------------------------------------------------------------------------- /lib/tagnews/crimetype/tag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import glob 4 | import time 5 | import pandas as pd 6 | 7 | # not used explicitly, but this needs to be imported like this 8 | # for unpickling to work. 9 | from ..utils.model_helpers import LemmaTokenizer # noqa 10 | 11 | """ 12 | Contains the CrimeTags class that allows tagging of articles. 13 | """ 14 | 15 | MODEL_LOCATION = os.path.join(os.path.split(__file__)[0], 16 | 'models', 17 | 'binary_stemmed_logistic') 18 | 19 | TAGS = ['OEMC', 'CPD', 'SAO', 'CCCC', 'CCJ', 'CCSP', 20 | 'CPUB', 'IDOC', 'DOMV', 'SEXA', 'POLB', 'POLM', 21 | 'GUNV', 'GLBTQ', 'JUVE', 'REEN', 'VIOL', 'BEAT', 22 | 'PROB', 'PARL', 'CPLY', 'DRUG', 'CPS', 'GANG', 'ILSP', 23 | 'HOMI', 'IPRA', 'CPBD', 'IMMG', 'ENVI', 'UNSPC', 24 | 'ILSC', 'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR'] 25 | 26 | 27 | def load_model(location=MODEL_LOCATION): 28 | """ 29 | Load a model from the given folder `location`. 30 | There should be at least one file named model-TIME.pkl and 31 | a file named vectorizer-TIME.pkl inside the folder. 32 | 33 | The files with the most recent timestamp are loaded. 34 | """ 35 | models = glob.glob(os.path.join(location, 'model*.pkl')) 36 | if not models: 37 | raise RuntimeError(('No models to load. Run' 38 | ' "python -m tagnews.crimetype.models.' 39 | 'binary_stemmed_logistic.save_model"')) 40 | model = models.pop() 41 | while models: 42 | model_time = time.strptime(model[-19:-4], '%Y%m%d-%H%M%S') 43 | new_model_time = time.strptime(models[0][-19:-4], '%Y%m%d-%H%M%S') 44 | if model_time < new_model_time: 45 | model = models[0] 46 | models = models[1:] 47 | 48 | with open(model, 'rb') as f: 49 | clf = pickle.load(f) 50 | 51 | with open(os.path.join(location, 'vectorizer-' + model[-19:-4] + '.pkl'), 52 | 'rb') as f: 53 | vectorizer = pickle.load(f) 54 | 55 | return clf, vectorizer 56 | 57 | 58 | class CrimeTags(): 59 | """ 60 | CrimeTags let you tag articles. Neat! 61 | """ 62 | def __init__(self, 63 | model_directory=MODEL_LOCATION, 64 | clf=None, 65 | vectorizer=None): 66 | """ 67 | Load a model from the given `model_directory`. 68 | See `load_model` for more information. 69 | 70 | Alternatively, the classifier and vectorizer can be 71 | provided. If one is provided, then both must be provided. 72 | """ 73 | if clf is None and vectorizer is None: 74 | self.clf, self.vectorizer = load_model(model_directory) 75 | elif clf is None or vectorizer is None: 76 | raise ValueError(('clf and vectorizer must both be None,' 77 | ' or both be not None')) 78 | else: 79 | self.clf, self.vectorizer = clf, vectorizer 80 | 81 | def tagtext_proba(self, text): 82 | """ 83 | Compute the probability each tag applies to the given text. 84 | 85 | inputs: 86 | text: A python string. 
87 | returns: 88 | pred_proba: A pandas series indexed by the tag name. 89 | """ 90 | x = self.vectorizer.transform([text]) 91 | y_hat = self.clf.predict_proba(x) 92 | preds = pd.DataFrame(y_hat) 93 | preds.columns = TAGS 94 | preds = preds.T.iloc[:, 0].sort_values(ascending=False) 95 | return preds 96 | 97 | def tagtext(self, text, prob_thresh=0.5): 98 | """ 99 | Tag a string with labels. 100 | 101 | inputs: 102 | text: A python string. 103 | prob_thresh: The threshold on probability at which point 104 | the tag will be applied. 105 | returns: 106 | preds: A list of tags that have > prob_thresh probability 107 | according to the model. 108 | """ 109 | preds = self.tagtext_proba(text) 110 | return preds[preds > prob_thresh].index.values.tolist() 111 | 112 | def relevant_proba(self, text): 113 | """ 114 | Outputs the probability that the given text is relevant. 115 | This probability is computed naively as the maximum of 116 | the probabilities each tag applies to the text. 117 | 118 | A more nuanced method would compute a joint probability. 119 | 120 | inputs: 121 | text: A python string. 122 | 123 | returns: 124 | relevant_proba: Probability the text is relevant. 125 | """ 126 | return max(self.tagtext_proba(text)) 127 | 128 | def relevant(self, text, prob_thresh=0.05): 129 | """ 130 | Determines whether given text is relevant or not. Relevance 131 | is defined as whether any tag has more than prob_thresh 132 | chance of applying to the text according to the model. 133 | 134 | inputs: 135 | text: A python string. 136 | prob_thresh: The threshold on probability that 137 | determines relevance. If no tags have >= 138 | prob_thresh of applying to the text, then 139 | the text is not relevant. 140 | returns: 141 | relevant: Boolean. Is the text "relevant"? 142 | """ 143 | return len(self.tagtext(text, prob_thresh)) > 0 144 | 145 | def get_contributions(self, text): 146 | """ 147 | Rank the words in the text by their contribution to each 148 | category. This function assumes that clf has an attribute 149 | `coef_` and that vectorizer has an attribute 150 | `inverse_transform`. 151 | 152 | inputs: 153 | text: A python string. 154 | returns: 155 | contributions: Pandas panel keyed off [category, word]. 156 | 157 | Example: 158 | >>> s = 'This is an article about drugs and gangs.' 159 | >>> s += ' Written by the amazing Kevin Rose.' 160 | >>> p = tagger.get_contributions(s) 161 | >>> p['DRUG'].sort_values('weight', ascending=False) 162 | weight 163 | drug 5.549870 164 | copyright 0.366905 165 | gang 0.194773 166 | this 0.124590 167 | an -0.004484 168 | article -0.052026 169 | is -0.085534 170 | about -0.154800 171 | kevin -0.219028 172 | rose -0.238296 173 | and -0.316201 174 | . 
-0.853208 175 | """ 176 | p = {} 177 | vec = self.vectorizer.transform([text]) 178 | vec_inv = self.vectorizer.inverse_transform(vec) 179 | for i, tag in enumerate(TAGS): 180 | p[tag] = pd.DataFrame( 181 | index=vec_inv, 182 | data={'weight': self.clf.coef_[i, vec.nonzero()[1]]} 183 | ) 184 | return pd.Panel(p) 185 | -------------------------------------------------------------------------------- /lib/notebooks/extract-geostring-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\kevin.rose\\Documents\\GitHub\\cjp-article-tagging\\lib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "cd .." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import tagnews\n", 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Download (and extract if needed) a saved glove data from\n", 41 | "# https://github.com/stanfordnlp/GloVe\n", 42 | "# and save it to tagnews/data/\n", 43 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "b'Skipping line 281837: expected 25 fields, saw 34\\n'\n", 56 | "C:\\Users\\kevin.rose\\AppData\\Local\\Continuum\\Anaconda2\\envs\\cjp\\lib\\site-packages\\numpy\\lib\\arraysetops.py:463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 57 | " mask |= (ar1 == a)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "# Download (and extract if needed) the NER data from\n", 63 | "# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data\n", 64 | "# and save it to tagnews/data/\n", 65 | "ner = tagnews.load_ner_data('tagnews/data/')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ner = pd.concat([ner, pd.DataFrame(glove.loc[ner['word'].str.lower()].values)], axis='columns')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Asserted correct vectorizations 998 times.\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "num_asserts = 0\n", 94 | "for i, row in ner.sample(1000).iterrows():\n", 95 | " if not any(row.iloc[2:].isnull()):\n", 96 | " assert (glove.loc[row['word'].lower()].values == row.iloc[3:].values).all()\n", 97 | " num_asserts += 1\n", 98 | "print('Asserted correct vectorizations', num_asserts, 'times.')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "import sklearn.ensemble" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "clf = sklearn.ensemble.RandomForestClassifier()" 121 
| ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# be careful doing this if you are relying on sequential-ness!\n", 132 | "ner.fillna(value=0.0, inplace=True)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 144 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 145 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 146 | " min_samples_leaf=1, min_samples_split=2,\n", 147 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 148 | " oob_score=False, random_state=None, verbose=0,\n", 149 | " warm_start=False)" 150 | ] 151 | }, 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "clf.fit(ner.iloc[:200000, 3:], ner['tag'].iloc[:200000].values)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "array([[ 0.04864864, 0.95135136],\n", 170 | " [ 0.2663006 , 0.7336994 ],\n", 171 | " [ 1. , 0. ]])" 172 | ] 173 | }, 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "clf.predict_proba(glove.loc[['london', 'france', 'napkins']])" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 12, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "# Go to https://geo-extract-tester.herokuapp.com/ and download\n", 192 | "# the validation data (validation.txt).\n", 193 | "with open('validation.txt', encoding='utf-8') as f:\n", 194 | " s = f.read()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 13, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "with open('guesses.txt', 'w') as f:\n", 206 | " for prob in clf.predict_proba(glove.loc[[w for w in s.split('\\n') if w]].fillna(0))[:,1]:\n", 207 | " f.write(str(prob) + '\\n')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "source": [ 216 | "Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!" 
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": true 224 | }, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.6.1" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /lib/tagnews/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import shutil 4 | import pandas as pd 5 | import logging 6 | 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | 10 | def load_all_data(): 11 | # fmt: off 12 | """ articles_df, categories_df, trainedcategoryrelevance_df, trainedcoding_df, usercoding_df, usercoding_categories_df, trainedlocation""" 13 | # fmt: on 14 | newssource = load_newssource() 15 | articles = load_articles() 16 | categories = load_categories() 17 | trainedcategoryrelevance = load_trainedcategoryrelevance() 18 | trainedlocation = load_trainedlocation() 19 | trainedcoding = load_trainedcoding() 20 | usercoding = load_usercoding() 21 | usercoding_categories = load_usercoding_categories() 22 | # trainedsentiment = load_trainedsentiment() 23 | # trainedsentimententities = load_trainedsentimententities() 24 | return ( 25 | newssource, 26 | articles, 27 | categories, 28 | trainedcategoryrelevance, 29 | trainedcoding, 30 | usercoding, 31 | usercoding_categories, 32 | trainedlocation, 33 | # trainedsentiment, 34 | # trainedsentimententities 35 | ) 36 | 37 | def load_data_subset(): 38 | # fmt: off 39 | """ articles_df, categories_df, trainedcategoryrelevance_df, trainedcoding_df, usercoding_df, usercoding_categories_df, trainedlocation""" 40 | # fmt: on 41 | newssource = load_newssource() 42 | articles = load_articles_nohtml() 43 | categories = load_categories() 44 | trainedcategoryrelevance = load_trainedcategoryrelevance() 45 | trainedlocation = load_trainedlocation() 46 | trainedcoding = load_trainedcoding() 47 | usercoding = load_usercoding() 48 | usercoding_categories = load_usercoding_categories() 49 | 50 | return ( 51 | newssource, 52 | articles, 53 | categories, 54 | trainedcategoryrelevance, 55 | trainedcoding, 56 | usercoding, 57 | usercoding_categories, 58 | trainedlocation 59 | ) 60 | 61 | def load_newssource(): 62 | newsource = pd.read_csv( 63 | "./cjp_tables/newsarticles_newssource.csv.gz", header=None, compression="gzip", low_memory=False 64 | ) 65 | newsource.columns = [ 66 | "source_id", 67 | "source_name", 68 | "short_name", 69 | "legacy_feed_id", 70 | ] 71 | print(f"news sources loaded. 
size: {newsource.shape}") 72 | return newsource 73 | 74 | 75 | def load_articles(): 76 | # Read CSV file of articles but exclude the original html (orig_html) column 77 | article = pd.read_csv( 78 | "./cjp_tables/newsarticles_article.csv.gz", header=None, usecols=[0,1,2,4,5,6,7,8,9,10], compression="gzip", low_memory=False 79 | ) 80 | article.columns = [ 81 | "id", 82 | "feedname", 83 | "url", 84 | "title", 85 | "bodytext", 86 | "relevant", 87 | "created", 88 | "last_modified", 89 | "news_source_id", 90 | "author", 91 | ] 92 | print(f"articles loaded. size: {article.shape}") 93 | return article 94 | 95 | def load_articles_nohtml(): 96 | article = pd.read_csv( 97 | "./cjp_tables/newsarticles_article.csv.gz", header=None, compression="gzip", low_memory=False 98 | ) 99 | article.columns = [ 100 | "id", 101 | "feedname", 102 | "url", 103 | "title", 104 | "bodytext", 105 | "relevant", 106 | "created", 107 | "last_modified", 108 | "news_source_id", 109 | "author", 110 | ] 111 | print(f"articles loaded. size: {article.shape}") 112 | return article 113 | 114 | 115 | def load_categories(): 116 | categories = pd.read_csv( 117 | "./cjp_tables/newsarticles_category.csv.gz", header=None, compression="gzip", low_memory=False 118 | ) 119 | categories.columns = ["id", "title", "abbreviation", "created", "active", "kind"] 120 | print(f"categories loaded. size: {categories.shape}") 121 | return categories 122 | 123 | 124 | def load_trainedcategoryrelevance(): 125 | trainedcategoryrelevance = pd.read_csv( 126 | "./cjp_tables/newsarticles_trainedcategoryrelevance.csv.gz", header=None, compression="gzip", low_memory=False 127 | ) 128 | trainedcategoryrelevance.columns = ["id", "relevance", "category_id", "coding_id"] 129 | print(f"trainedcategoryrelevance loaded. size: {trainedcategoryrelevance.shape}") 130 | return trainedcategoryrelevance 131 | 132 | 133 | def load_trainedcoding(): 134 | trainedcoding = pd.read_csv( 135 | "./cjp_tables/newsarticles_trainedcoding.csv.gz", 136 | header=None, 137 | compression="gzip", 138 | low_memory=False 139 | ) 140 | trainedcoding.columns = [ 141 | "id", 142 | "date", 143 | "model_info", 144 | "relevance", 145 | "article_id", 146 | "sentiment", 147 | "bin", 148 | "sentiment_processed", 149 | ] 150 | print(f"trainedcoding loaded. size: {trainedcoding.shape}") 151 | return trainedcoding 152 | 153 | 154 | def load_trainedlocation(): 155 | trainedlocation = pd.read_csv( 156 | "./cjp_tables/newsarticles_trainedlocation.csv.gz", 157 | header=None, 158 | compression="gzip", 159 | low_memory=False 160 | ) 161 | trainedlocation.columns = [ 162 | "id", 163 | "text", 164 | "latitude", 165 | "longitude", 166 | "coding_id", 167 | "confidence", 168 | "neighborhood", 169 | "is_best" 170 | ] 171 | print(f"trainedlocation loaded. size: {trainedlocation.shape}") 172 | return trainedlocation 173 | 174 | 175 | def load_usercoding(): 176 | usercoding = pd.read_csv( 177 | "./cjp_tables/newsarticles_usercoding.csv.gz", header=None, compression="gzip", low_memory=False 178 | ) 179 | usercoding.columns = [ 180 | "id", 181 | "date", 182 | "relevant", 183 | "article_id", 184 | "user_id", 185 | "locations", 186 | "sentiment", 187 | ] 188 | print(f"usercoding loaded. 
size: {usercoding.shape}") 189 | return usercoding 190 | 191 | 192 | def load_usercoding_categories(): 193 | usercoding_categories = pd.read_csv( 194 | "./cjp_tables/newsarticles_usercoding_categories.csv.gz", 195 | header=None, 196 | compression="gzip", 197 | low_memory=False 198 | ) 199 | usercoding_categories.columns = ["id", "usercoding_id", "category_id"] 200 | print(f"usercoding_categories loaded. size: {usercoding_categories.shape}") 201 | return usercoding_categories 202 | 203 | 204 | def load_trainedsentiment(): 205 | trainedsentiment = pd.read_csv( 206 | "./cjp_tables/newsarticles_trainedsentiment.csv.gz", 207 | header=None, 208 | compression="gzip", 209 | low_memory=False 210 | ) 211 | trainedsentiment.columns = [ 212 | "id", 213 | "date", 214 | "api_response", 215 | "coding_id", 216 | ] 217 | print(f"trainedsentiment loaded. size: {trainedsentiment.shape}") 218 | return trainedsentiment 219 | 220 | 221 | def load_trainedsentimententities(): 222 | trainedsentimententities = pd.read_csv( 223 | "./cjp_tables/newsarticles_trainedsentimententities.csv.gz", 224 | header=None, 225 | compression="gzip", 226 | low_memory=False 227 | ) 228 | trainedsentimententities.columns = [ 229 | "id", 230 | "index", 231 | "entity", 232 | "sentiment", 233 | "coding_id", 234 | "response_id", 235 | ] 236 | print(f"trainedsentimententities loaded. size: {trainedsentimententities.shape}") 237 | return trainedsentimententities 238 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_usercoding.csv: -------------------------------------------------------------------------------- 1 | 132,2017-04-30 23:51:50.887415+00,t,132,,[] 2 | 219,2017-04-30 23:51:51.342135+00,t,219,,[] 3 | 1728,2017-04-30 23:51:59.692187+00,t,1761,,[] 4 | 2227,2017-04-30 23:52:02.546656+00,t,2276,,[] 5 | 3789,2017-04-30 23:52:11.735746+00,t,3840,,[] 6 | 4069,2017-04-30 23:52:13.664501+00,t,4121,,[] 7 | 4146,2017-04-30 23:52:14.110165+00,t,4198,,[] 8 | 4233,2017-04-30 23:52:14.538316+00,t,4285,,[] 9 | 4302,2017-04-30 23:52:14.930339+00,t,4354,,[] 10 | 4856,2017-04-30 23:52:17.915424+00,t,4908,,[] 11 | 5770,2017-04-30 23:52:22.935989+00,t,5822,,[] 12 | 6327,2017-04-30 23:52:26.249772+00,t,6380,,[] 13 | 6398,2017-04-30 23:52:26.686142+00,t,6451,,[] 14 | 6480,2017-04-30 23:52:27.364435+00,t,6534,,[] 15 | 6776,2017-04-30 23:52:28.925421+00,t,6831,,[] 16 | 7060,2017-04-30 23:52:31.092362+00,t,7115,,[] 17 | 7107,2017-04-30 23:52:31.438787+00,t,7162,,[] 18 | 7254,2017-04-30 23:52:32.551403+00,t,7309,,[] 19 | 7362,2017-04-30 23:52:33.369139+00,t,7417,,[] 20 | 7710,2017-04-30 23:52:36.095998+00,t,7765,,[] 21 | 7713,2017-04-30 23:52:36.118787+00,t,7768,,[] 22 | 7956,2017-04-30 23:52:37.469579+00,t,8011,,[] 23 | 8373,2017-04-30 23:52:39.798142+00,t,8430,,[] 24 | 8769,2017-04-30 23:52:42.065576+00,t,8826,,[] 25 | 9304,2017-04-30 23:52:45.118601+00,t,9362,,[] 26 | 9932,2017-04-30 23:52:48.919135+00,t,9990,,[] 27 | 9987,2017-04-30 23:52:49.377954+00,t,10045,,[] 28 | 9997,2017-04-30 23:52:49.43844+00,t,10055,,[] 29 | 10301,2017-04-30 23:52:51.064391+00,t,10360,,[] 30 | 10604,2017-04-30 23:52:52.870625+00,t,10663,,[] 31 | 10874,2017-04-30 23:52:54.366605+00,t,10934,,[] 32 | 11159,2017-04-30 23:52:56.28323+00,t,11219,,[] 33 | 11619,2017-04-30 23:52:59.181713+00,t,11679,,[] 34 | 11819,2017-04-30 23:53:00.454832+00,t,11879,,[] 35 | 11847,2017-04-30 23:53:00.622677+00,t,11907,,[] 36 | 11896,2017-04-30 23:53:00.926895+00,t,11956,,[] 37 | 12822,2017-04-30 23:53:05.984266+00,t,12914,,[] 38 | 
13406,2017-04-30 23:53:09.270184+00,t,13498,,[] 39 | 15416,2017-04-30 23:53:19.617777+00,t,15531,,[] 40 | 16772,2017-04-30 23:53:27.433229+00,t,16887,,[] 41 | 17292,2017-04-30 23:53:30.668672+00,t,17407,,[] 42 | 17509,2017-04-30 23:53:32.07463+00,t,17624,,[] 43 | 17808,2017-04-30 23:53:33.953668+00,t,18171,,[] 44 | 18167,2017-04-30 23:53:35.977106+00,t,18534,,[] 45 | 18630,2017-04-30 23:53:38.531453+00,t,18999,,[] 46 | 18783,2017-04-30 23:53:39.49154+00,t,19152,,[] 47 | 21221,2017-04-30 23:53:53.693364+00,t,21593,,[] 48 | 21766,2017-04-30 23:53:57.044322+00,t,22138,,[] 49 | 21772,2017-04-30 23:53:57.081285+00,t,22144,,[] 50 | 22428,2017-04-30 23:54:00.861717+00,t,22800,,[] 51 | 23050,2017-04-30 23:54:05.848318+00,t,24019,,[] 52 | 25991,2017-04-30 23:54:23.585039+00,t,26962,,[] 53 | 27133,2017-04-30 23:54:30.188898+00,t,28104,,[] 54 | 28032,2017-04-30 23:54:35.285352+00,t,29003,,[] 55 | 28126,2017-04-30 23:54:35.815621+00,t,29097,,[] 56 | 31220,2017-04-30 23:54:52.834565+00,t,32192,,[] 57 | 33722,2017-04-30 23:55:06.745282+00,t,34694,,[] 58 | 33873,2017-04-30 23:55:07.777538+00,t,34846,,[] 59 | 33938,2017-04-30 23:55:08.330327+00,t,34911,,[] 60 | 34093,2017-04-30 23:55:09.340739+00,t,35066,,[] 61 | 35017,2017-04-30 23:55:15.075316+00,f,35990,,[] 62 | 37023,2017-04-30 23:55:26.855568+00,t,37996,,[] 63 | 38136,2017-04-30 23:55:33.215386+00,t,39114,,[] 64 | 39556,2017-04-30 23:55:41.143696+00,t,40534,,[] 65 | 40077,2017-04-30 23:55:44.202924+00,f,41056,,[] 66 | 40153,2017-04-30 23:55:44.732382+00,f,41132,,[] 67 | 40306,2017-04-30 23:55:45.526339+00,t,41285,,[] 68 | 40871,2017-04-30 23:55:48.76526+00,t,41851,,[] 69 | 41820,2017-04-30 23:55:53.945199+00,t,42800,,[] 70 | 41852,2017-04-30 23:55:54.10859+00,t,42832,,[] 71 | 43350,2017-04-30 23:56:02.26225+00,t,44332,,[] 72 | 44004,2017-04-30 23:56:06.142511+00,t,44986,,[] 73 | 44438,2017-04-30 23:56:08.611559+00,t,45420,,[] 74 | 46592,2017-04-30 23:56:20.186414+00,t,47574,,[] 75 | 47805,2017-04-30 23:56:26.71788+00,t,48789,,[] 76 | 53469,2017-04-30 23:56:56.080526+00,t,54456,,[] 77 | 55829,2017-04-30 23:57:08.615496+00,t,56816,,[] 78 | 57826,2017-04-30 23:57:19.575145+00,t,58813,,[] 79 | 58242,2017-04-30 23:57:21.829651+00,t,59229,,[] 80 | 58848,2017-04-30 23:57:25.128702+00,t,59836,,[] 81 | 59062,2017-04-30 23:57:26.194408+00,t,60050,,[] 82 | 59597,2017-04-30 23:57:28.999912+00,t,60585,,[] 83 | 60721,2017-04-30 23:57:35.250609+00,t,61709,,[] 84 | 60981,2017-04-30 23:57:36.692958+00,t,61971,,[] 85 | 72429,2017-04-30 23:58:42.287373+00,t,74059,,[] 86 | 74956,2017-04-30 23:59:00.479804+00,t,78269,,[] 87 | 75398,2017-04-30 23:59:02.800105+00,t,78711,,[] 88 | 75663,2017-04-30 23:59:04.153408+00,t,78976,,[] 89 | 76411,2017-04-30 23:59:08.242627+00,t,79724,,[] 90 | 77257,2017-04-30 23:59:13.056702+00,t,80874,,[] 91 | 78695,2017-04-30 23:59:22.060328+00,t,83560,,[] 92 | 80662,2017-04-30 23:59:35.556954+00,t,87395,,[] 93 | 81047,2017-04-30 23:59:37.738259+00,t,87780,,[] 94 | 81846,2017-04-30 23:59:41.760211+00,t,88579,,[] 95 | 82527,2017-04-30 23:59:45.566285+00,t,89261,,[] 96 | 82584,2017-04-30 23:59:45.8657+00,t,89318,,[] 97 | 82926,2017-04-30 23:59:47.529423+00,t,89660,,[] 98 | 83912,2017-04-30 23:59:52.299707+00,t,90646,,[] 99 | 87771,2017-05-01 00:00:11.788641+00,f,94509,,[] 100 | 87800,2017-05-01 00:00:11.919481+00,f,94538,,[] 101 | 90724,2017-05-01 00:00:27.111264+00,f,97498,,[] 102 | 91284,2017-05-01 00:00:30.207335+00,f,98058,,[] 103 | 92580,2017-05-01 00:00:37.140398+00,t,99361,,[] 104 | 94294,2017-05-01 00:00:46.24013+00,t,101077,,[] 105 | 
94603,2017-05-01 00:00:47.925523+00,t,101386,,[] 106 | 95282,2017-05-01 00:00:51.443114+00,t,102066,,[] 107 | 96332,2017-05-01 00:00:57.111733+00,t,103122,,[] 108 | 96361,2017-05-01 00:00:57.267301+00,f,103156,,[] 109 | 97506,2017-05-01 00:01:03.755725+00,t,104306,,[] 110 | 97740,2017-05-01 00:01:04.81288+00,t,104540,,[] 111 | 98644,2017-05-01 00:01:09.924047+00,t,105448,,[] 112 | 99538,2017-05-01 00:01:14.54655+00,t,106346,,[] 113 | 101082,2017-05-01 00:01:22.678444+00,t,107893,,[] 114 | 102047,2017-05-01 00:01:28.15472+00,t,108861,,[] 115 | 106009,2017-05-01 00:01:49.779743+00,t,112858,,[] 116 | 106027,2017-05-01 00:01:49.867794+00,f,112876,,[] 117 | 109564,2017-05-01 00:02:10.073368+00,t,116413,,[] 118 | 113250,2017-05-01 00:02:29.867255+00,t,120162,,[] 119 | 114030,2017-05-01 00:02:34.170748+00,t,120942,,[] 120 | 114749,2017-05-01 00:02:37.969701+00,t,121661,,[] 121 | 115899,2017-05-01 00:02:44.403655+00,f,122811,,[] 122 | 117328,2017-05-01 00:02:52.058585+00,t,124241,,[] 123 | 118240,2017-05-01 00:02:57.183845+00,t,125154,,[] 124 | 119322,2017-05-01 00:03:03.171412+00,t,126236,,[] 125 | 122057,2017-05-01 00:03:18.284443+00,t,128971,,[] 126 | 124718,2017-05-01 00:03:33.214867+00,t,131633,,[] 127 | 125488,2017-05-01 00:03:37.356293+00,t,132403,,[] 128 | 125760,2017-05-01 00:03:38.728316+00,t,132675,,[] 129 | 127428,2017-05-01 00:03:48.033591+00,t,134343,,[] 130 | 127697,2017-05-01 00:03:49.396559+00,f,134612,,[] 131 | 128087,2017-05-01 00:03:51.428787+00,t,135002,,[] 132 | 128637,2017-05-01 00:03:54.390846+00,t,135552,,[] 133 | 128662,2017-05-01 00:03:54.520546+00,t,135577,,[] 134 | 128857,2017-05-01 00:03:55.583598+00,f,135772,,[] 135 | 129483,2017-05-01 00:03:58.949281+00,t,136398,,[] 136 | 129899,2017-05-01 00:04:01.022955+00,f,136814,,[] 137 | 131048,2017-05-01 00:04:08.111101+00,t,137963,,[] 138 | 131765,2017-05-01 00:04:11.956763+00,t,138681,,[] 139 | 132243,2017-05-01 00:04:14.476838+00,t,139159,,[] 140 | 135388,2017-05-01 00:04:32.30874+00,t,142304,,[] 141 | 135705,2017-05-01 00:04:34.105896+00,t,142621,,[] 142 | 139290,2017-05-01 00:04:53.206682+00,t,146218,,[] 143 | 140867,2017-05-01 00:05:01.678874+00,t,147795,,[] 144 | 141314,2017-05-01 00:05:04.282004+00,t,148242,,[] 145 | 142222,2017-05-01 00:05:09.117977+00,t,149150,,[] 146 | 142937,2017-05-01 00:05:12.890425+00,t,149865,,[] 147 | 150134,2017-05-01 00:05:51.013016+00,t,157068,,[] 148 | 151573,2017-05-01 00:05:58.867502+00,t,158507,,[] 149 | 153541,2017-05-01 00:06:09.803195+00,t,160476,,[] 150 | 154021,2017-05-01 00:06:12.487757+00,t,160956,,[] 151 | 154234,2017-05-01 00:06:13.546425+00,f,161169,,[] 152 | 155785,2017-05-01 00:06:21.76145+00,t,162729,,[] 153 | 157346,2017-05-01 00:06:30.15392+00,t,164291,,[] 154 | 157684,2017-05-01 00:06:32.002844+00,t,164631,,[] 155 | 157999,2017-05-01 00:06:33.765279+00,t,164947,,[] 156 | 159958,2017-05-01 00:06:43.879207+00,f,166908,,[] 157 | 160087,2017-05-01 00:06:44.536764+00,f,167037,,[] 158 | 160137,2017-05-01 00:06:44.778934+00,t,167087,,[] 159 | 162892,2017-05-01 00:07:00.330939+00,t,170803,,[] 160 | 165333,2017-05-01 00:07:16.799013+00,t,175658,,[] 161 | 177862,2017-05-01 00:08:37.746581+00,t,201260,,[] 162 | 180249,2017-05-01 00:08:49.482782+00,t,203651,,[] 163 | 184791,2017-05-01 00:09:19.417519+00,t,213344,,[] 164 | 185415,2017-05-01 00:09:22.305448+00,t,213968,,[] 165 | 186861,2017-05-01 00:09:30.312094+00,t,215414,,[] 166 | 193258,2017-05-01 00:10:05.523489+00,t,224701,,[] 167 | 195838,2017-05-01 00:10:59.159411+00,t,250841,,[] 168 | 199329,2017-05-01 
00:12:11.686742+00,t,288705,,[] 169 | 200327,2017-05-01 00:12:20.586629+00,t,292352,,[] 170 | 203291,2017-05-31 19:39:42.124411+00,t,327488,133.0,[] 171 | 204359,2017-06-13 21:37:17.99267+00,t,265658,130.0,[] 172 | 204592,2017-06-15 01:06:02.665167+00,t,313702,134.0,[] 173 | 204900,2017-06-20 18:26:30.680508+00,t,290038,142.0,[] 174 | 204902,2017-06-20 18:29:50.615251+00,t,290036,142.0,[] 175 | 205755,2017-06-26 16:35:12.712312+00,t,281528,142.0,[] 176 | 205788,2017-06-26 16:47:08.644718+00,t,281628,142.0,[] 177 | 210278,2017-07-19 19:46:16.533487+00,t,326413,132.0,[] 178 | 211531,2017-07-31 22:08:27.499098+00,t,353037,2.0,"[{""start"":139,""end"":149,""text"":""South Side""},{""start"":150,""end"":161,""text"":""Fuller Park""},{""start"":605,""end"":635,""text"":""200 block of West 47th Street.""},{""start"":1462,""end"":1482,""text"":""Ashburn neighborhood""},{""start"":1447,""end"":1461,""text"":""Southwest Side""}]" 179 | 211597,2017-07-31 23:54:27.375507+00,t,285008,131.0,[] 180 | 212076,2017-08-02 01:36:33.116888+00,t,285119,131.0,"[{""start"":686,""end"":721,""text"":""Irving Park Road and Western Avenue""}]" 181 | 212124,2017-08-03 17:27:13.961423+00,t,354618,130.0,"[{""start"":324,""end"":333,""text"":""West Side""},{""start"":1773,""end"":1781,""text"":""Lawndale""},{""start"":3303,""end"":3335,""text"":""Polk Street and Francisco Avenue""}]" 182 | 212536,2017-08-16 04:47:42.564931+00,t,315827,130.0,"[{""start"":135,""end"":145,""text"":""South Loop""},{""start"":425,""end"":452,""text"":""1100 block of South Indiana""},{""start"":936,""end"":961,""text"":""South Side Auburn Gresham""},{""start"":993,""end"":1006,""text"":""Humboldt Park""}]" 183 | 221833,2017-10-27 18:04:39.114165+00,t,268139,33.0,[] 184 | 222644,2017-10-30 19:57:12.165916+00,t,267157,157.0,[] 185 | 222854,2017-10-30 23:45:51.357935+00,t,267325,130.0,"[{""start"":1329,""end"":1339,""text"":""Palos Park""},{""start"":477,""end"":487,""text"":""Bridgeview""},{""start"":1882,""end"":1907,""text"":""12700 block of 81st Court""}]" 186 | 222947,2017-10-31 16:33:02.206965+00,t,260241,33.0,[] 187 | 223183,2017-10-31 23:17:12.900255+00,t,242395,130.0,"[{""start"":218,""end"":228,""text"":""South Side""}]" 188 | 224330,2017-11-02 23:54:01.689147+00,t,17881,158.0,[] 189 | 225660,2017-11-07 20:32:34.455332+00,t,368055,157.0,[] 190 | 227740,2017-11-15 02:03:56.81339+00,t,315061,157.0,[] 191 | 228325,2017-11-21 17:01:18.233928+00,t,296467,157.0,[] 192 | 230878,2018-01-10 22:43:11.839972+00,t,303024,33.0,[] 193 | 233151,2018-01-31 17:08:47.581319+00,t,362825,33.0,[] 194 | 233177,2018-01-31 18:56:46.092939+00,t,317223,33.0,[] 195 | -------------------------------------------------------------------------------- /lib/tagnews/data/column_names.txt: -------------------------------------------------------------------------------- 1 | Table "public.newsarticles_newssource" 2 | Column | Type | Modifiers 3 | ----------------+------------------------+---------------------------------------------------------------------- 4 | id | integer | not null default nextval('newsarticles_newssource_id_seq'::regclass) 5 | name | character varying(256) | not null 6 | short_name | character varying(256) | not null 7 | legacy_feed_id | character varying(8) | not null 8 | Indexes: 9 | "newsarticles_newssource_pkey" PRIMARY KEY, btree (id) 10 | "newsarticles_newssource_3ee615f7" btree (legacy_feed_id) 11 | "newsarticles_newssource_legacy_feed_id_5367de32a6bdc03f_like" btree (legacy_feed_id varchar_pattern_ops) 12 | 
"newsarticles_newssource_short_name_1ff6619d20cb947d_like" btree (short_name varchar_pattern_ops) 13 | "newsarticles_newssource_short_name_1ff6619d20cb947d_uniq" btree (short_name) 14 | Referenced by: 15 | TABLE "newsarticles_article" CONSTRAINT "n_news_source_id_6ef491df45588361_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 16 | TABLE "newsarticles_scraperresult" CONSTRAINT "ne_news_source_id_e906324e3d2ac00_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 17 | 18 | Table "public.newsarticles_article" 19 | Column | Type | Modifiers 20 | ----------------+--------------------------+------------------------------------------------------------------- 21 | id | integer | not null default nextval('newsarticles_article_id_seq'::regclass) 22 | feedname | character varying(1) | 23 | url | character varying(1024) | not null 24 | orig_html | text | not null 25 | title | text | not null 26 | bodytext | text | not null 27 | relevant | boolean | 28 | created | timestamp with time zone | not null 29 | last_modified | timestamp with time zone | not null 30 | news_source_id | integer | 31 | author | character varying(1024) | not null 32 | Indexes: 33 | "newsarticles_article_pkey" PRIMARY KEY, btree (id) 34 | "newsarticles_article_url_key" UNIQUE CONSTRAINT, btree (url) 35 | "newsarticles_article_8f28a911" btree (news_source_id) 36 | "newsarticles_article_ba31968f" btree (feedname) 37 | "newsarticles_article_created" btree (created) 38 | "newsarticles_article_e2fa5388" btree (created) 39 | "newsarticles_article_f552707d" btree (relevant) 40 | "newsarticles_article_feedname" btree (feedname) 41 | "newsarticles_article_feedname_6f274b5fd8544257_like" btree (feedname varchar_pattern_ops) 42 | "newsarticles_article_feedname_like" btree (feedname varchar_pattern_ops) 43 | "newsarticles_article_relevant" btree (relevant) 44 | "newsarticles_article_url_3fe47845b28cdc08_like" btree (url varchar_pattern_ops) 45 | Foreign-key constraints: 46 | "n_news_source_id_6ef491df45588361_fk_newsarticles_newssource_id" FOREIGN KEY (news_source_id) REFERENCES newsarticles_newssource(id) DEFERRABLE INITIALLY DEFERRED 47 | Referenced by: 48 | TABLE "newsarticles_article_categories" CONSTRAINT "newsarti_article_id_438886c21ec59122_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 49 | TABLE "newsarticles_usercoding" CONSTRAINT "newsarti_article_id_54d685c1a8b57e2c_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 50 | TABLE "newsarticles_trainedcoding" CONSTRAINT "newsarticles_trained_article_id_5b9c0111_fk_newsartic" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 51 | 52 | Table "public.newsarticles_usercoding" 53 | Column | Type | Modifiers 54 | ------------+--------------------------+---------------------------------------------------------------------- 55 | id | integer | not null default nextval('newsarticles_usercoding_id_seq'::regclass) 56 | date | timestamp with time zone | not null 57 | relevant | boolean | not null 58 | article_id | integer | not null 59 | user_id | integer | 60 | locations | text | not null 61 | sentiment | integer | 62 | Indexes: 63 | "newsarticles_usercoding_pkey" PRIMARY KEY, btree (id) 64 | "newsarticles_usercoding_article_id_3535f524868d4ee3_uniq" UNIQUE 
CONSTRAINT, btree (article_id, user_id) 65 | "newsarticles_usercoding_article_id_key" UNIQUE CONSTRAINT, btree (article_id) 66 | "newsarticles_usercoding_e8701ad4" btree (user_id) 67 | Foreign-key constraints: 68 | "newsarti_article_id_54d685c1a8b57e2c_fk_newsarticles_article_id" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 69 | "newsarticles_usercodin_user_id_6f03990de1e1875c_fk_auth_user_id" FOREIGN KEY (user_id) REFERENCES auth_user(id) DEFERRABLE INITIALLY DEFERRED 70 | Referenced by: 71 | TABLE "newsarticles_usercoding_categories" CONSTRAINT "ne_usercoding_id_3ce766f5753b730e_fk_newsarticles_usercoding_id" FOREIGN KEY (usercoding_id) REFERENCES newsarticles_usercoding(id) DEFERRABLE INITIALLY DEFERRED 72 | 73 | Table "public.newsarticles_category" 74 | Column | Type | Modifiers 75 | --------------+--------------------------+-------------------------------------------------------------------- 76 | id | integer | not null default nextval('newsarticles_category_id_seq'::regclass) 77 | title | character varying(256) | not null 78 | abbreviation | character varying(5) | not null 79 | created | timestamp with time zone | not null 80 | active | boolean | not null 81 | kind | character varying(50) | not null 82 | Indexes: 83 | "newsarticles_category_pkey" PRIMARY KEY, btree (id) 84 | Referenced by: 85 | TABLE "newsarticles_usercoding_categories" CONSTRAINT "newsar_category_id_6f8bff226c05e06b_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 86 | TABLE "newsarticles_article_categories" CONSTRAINT "newsarti_category_id_5876ea9f7b91a1_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 87 | TABLE "newsarticles_trainedcategoryrelevance" CONSTRAINT "newsarticles_trained_category_id_d3c4a714_fk_newsartic" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 88 | 89 | Table "public.newsarticles_usercoding_categories" 90 | Column | Type | Modifiers 91 | ---------------+---------+--------------------------------------------------------------------------------- 92 | id | integer | not null default nextval('newsarticles_usercoding_categories_id_seq'::regclass) 93 | usercoding_id | integer | not null 94 | category_id | integer | not null 95 | Indexes: 96 | "newsarticles_usercoding_categories_pkey" PRIMARY KEY, btree (id) 97 | "newsarticles_usercoding_categorie_usercoding_id_category_id_key" UNIQUE CONSTRAINT, btree (usercoding_id, category_id) 98 | "newsarticles_usercoding_categories_3ca0ec33" btree (usercoding_id) 99 | "newsarticles_usercoding_categories_b583a629" btree (category_id) 100 | Foreign-key constraints: 101 | "ne_usercoding_id_3ce766f5753b730e_fk_newsarticles_usercoding_id" FOREIGN KEY (usercoding_id) REFERENCES newsarticles_usercoding(id) DEFERRABLE INITIALLY DEFERRED 102 | "newsar_category_id_6f8bff226c05e06b_fk_newsarticles_category_id" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 103 | 104 | Table "public.newsarticles_trainedcoding" 105 | Column | Type | Modifiers 106 | ------------+--------------------------+------------------------------------------------------------------------- 107 | id | integer | not null default nextval('newsarticles_trainedcoding_id_seq'::regclass) 108 | date | timestamp with time zone | not null 109 | model_info | text | not null 110 | relevance | double precision | not null 111 | article_id 
| integer | not null 112 | sentiment | double precision | 113 | Indexes: 114 | "newsarticles_trainedcoding_pkey" PRIMARY KEY, btree (id) 115 | "newsarticles_trainedcoding_article_id_key" UNIQUE CONSTRAINT, btree (article_id) 116 | Foreign-key constraints: 117 | "newsarticles_trained_article_id_5b9c0111_fk_newsartic" FOREIGN KEY (article_id) REFERENCES newsarticles_article(id) DEFERRABLE INITIALLY DEFERRED 118 | Referenced by: 119 | TABLE "newsarticles_trainedcategoryrelevance" CONSTRAINT "newsarticles_trained_coding_id_ad7cc027_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 120 | TABLE "newsarticles_trainedlocation" CONSTRAINT "newsarticles_trained_coding_id_d406a29f_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 121 | 122 | Table "public.newsarticles_trainedlocation" 123 | Column | Type | Modifiers 124 | --------------+------------------+--------------------------------------------------------------------------- 125 | id | integer | not null default nextval('newsarticles_trainedlocation_id_seq'::regclass) 126 | text | text | not null 127 | latitude | double precision | 128 | longitude | double precision | 129 | coding_id | integer | not null 130 | confidence | double precision | 131 | neighborhood | text | not null 132 | Indexes: 133 | "newsarticles_trainedlocation_pkey" PRIMARY KEY, btree (id) 134 | "newsarticles_trainedlocation_coding_id_d406a29f" btree (coding_id) 135 | Foreign-key constraints: 136 | "newsarticles_trained_coding_id_d406a29f_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 137 | 138 | Table "public.newsarticles_trainedcategoryrelevance" 139 | Column | Type | Modifiers 140 | -------------+------------------+------------------------------------------------------------------------------------ 141 | id | integer | not null default nextval('newsarticles_trainedcategoryrelevance_id_seq'::regclass) 142 | relevance | double precision | not null 143 | category_id | integer | not null 144 | coding_id | integer | not null 145 | Indexes: 146 | "newsarticles_trainedcategoryrelevance_pkey" PRIMARY KEY, btree (id) 147 | "newsarticles_trainedcategoryrelevance_category_id_d3c4a714" btree (category_id) 148 | "newsarticles_trainedcategoryrelevance_coding_id_ad7cc027" btree (coding_id) 149 | Foreign-key constraints: 150 | "newsarticles_trained_category_id_d3c4a714_fk_newsartic" FOREIGN KEY (category_id) REFERENCES newsarticles_category(id) DEFERRABLE INITIALLY DEFERRED 151 | "newsarticles_trained_coding_id_ad7cc027_fk_newsartic" FOREIGN KEY (coding_id) REFERENCES newsarticles_trainedcoding(id) DEFERRABLE INITIALLY DEFERRED 152 | 153 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Getting Started 4 | 5 | This project is developed with [uv](https://docs.astral.sh/uv/). 6 | Follow the [installation directions](https://docs.astral.sh/uv/getting-started/installation/) from uv's website. 7 | If Python is installed on your system uv will detect and use it. 8 | If not, uv will download Python automatically. 
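If you want to double-check your setup before continuing, running `uv --version` from a shell should print the installed version (this step is optional). Then clone the repository: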
9 | 10 | ``` 11 | git clone https://github.com/chicago-justice-project/article-tagging.git 12 | cd article-tagging 13 | ``` 14 | 15 | ### Get the required data 16 | 17 | Download the [Natural Language Toolkit (NLTK)](http://www.nltk.org/) data: 18 | 19 | ``` 20 | uv run python -c "import nltk; nltk.download('punkt_tab', '.venv/nltk_data')" 21 | uv run python -c "import nltk; nltk.download('wordnet', '.venv/nltk_data')" 22 | ``` 23 | 24 | For the geotagging model you will need the [GloVe](https://nlp.stanford.edu/projects/glove/) pre-trained word vectors: 25 | 26 | ``` 27 | curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip 28 | unzip glove.6B.zip -d lib/tagnews/data 29 | ``` 30 | 31 | You will also need the [Chicago Community Areas boundary data](https://data.cityofchicago.org/d/igwz-8jzy): 32 | 33 | ``` 34 | curl "https://data.cityofchicago.org/api/geospatial/igwz-8jzy?method=export&format=GeoJSON" \ 35 | -o "lib/tagnews/data/Boundaries - Community Areas (current).geojson" 36 | ``` 37 | 38 | The latest data dump from the [Chicago Justice Project](https://github.com/chicago-justice-project/chicago-justice) should be placed in `lib/tagnews/data`. 39 | If you do not have access to the production data you can use the test dataset included in the repo: 40 | 41 | ``` 42 | cp lib/tagnews/data/ci-data/*.csv lib/tagnews/data 43 | ``` 44 | 45 | ### Generate the models 46 | 47 | ``` 48 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 49 | uv run python -m tagnews.geoloc.models.lstm.save_model 50 | ``` 51 | 52 | ### Run tests 53 | 54 | ``` 55 | uv run pytest --cov-report term-missing --cov=tagnews 56 | ``` 57 | 58 | ### Run JupyterLab 59 | 60 | ``` 61 | uv run --with jupyter jupyter lab 62 | ``` 63 | 64 | ## Directory structure 65 | 66 | This project is structured as follows: 67 | 68 | ``` 69 | ├───lib 70 | │ ├───notebooks ............................ Jupyter/IPython notebooks 71 | │ └───tagnews .............................. Python package/source code 72 | │ ├───crimetype ........................ Code related to type-of-crime tagging 73 | │ │ └───models ....................... Filler directory 74 | │ │ └───binary_stemmed_logistic .. Code to train/save crimetype NLP model 75 | │ ├───data ............................. Put the data in here! 76 | │ │ └───ci-data ...................... A tiny subset of data used for testing 77 | │ ├───geoloc ........................... Code related to geocoding 78 | │ │ └───models ....................... Filler directory 79 | │ │ └───lstm ..................... Code *and data* to train/save geostring extractor 80 | │ │ └───saved ................ Where the geostring model is saved. 81 | │ ├───tests ............................ Code used to test this project 82 | │ └───utils ............................ Helper functions, mostly around data loading 83 | └───r_models ................................. R code, unused for a while, use with caution 84 | ``` 85 | 86 | How you want to contribute will dictate which parts you need to know about. 87 | 88 | ## What can I do? 89 | 90 | There are a couple of things you could do; each item listed here is expounded on further below.
91 | 92 | * Improve the type-of-crime model (article text -> type-of-crime tags) 93 | * Improve the geostring extractor model (article text -> list of location strings) 94 | * Improve the geocoding (list of location strings -> list of lat/longs) 95 | * Write more tests 96 | * Write documentation 97 | * Ways to help without coding 98 | 99 | ### The type-of-crime model 100 | 101 | #### What is it? 102 | 103 | The type-of-crime model builds a multi-label classifier that takes in the text of a news article and, for each type-of-crime tag, outputs a probability that the tag applies to the article. In other words, it tries to guess what kinds of crimes the news article discusses. 104 | 105 | The model code can be found in `lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py`. It's less than 100 lines; don't be afraid to read it! 106 | 107 | The model relies on NLTK as a tokenizer and builds a binary bag-of-words vectorizer with 40000 features. (We restricted it to 40000 features because performance did not decrease significantly and it made the model much smaller, which is useful when trying to publish to PyPI as a package.) The vectorized versions of the articles are then used as input to a separate logistic regression for each crime tag. 108 | 109 | #### How to train it? 110 | 111 | The `save_model.py` file can be run as a script to save the trained model: 112 | 113 | ``` 114 | uv run python -m tagnews.crimetype.models.binary_stemmed_logistic.save_model 115 | ``` 116 | 117 | The vectorizer is saved in the same directory as the code with a name of the form `vectorizer-TIME.pkl`, where `TIME` is a timestamp. The model is saved similarly, but with `model` instead of `vectorizer`. 118 | 119 | This code trains on the whole labeled dataset. During development, the `lib/tagnews/crimetype/benchmark.py` file was used to perform cross-validation. 120 | 121 | #### How to measure performance? 122 | 123 | We never defined a single number that could be used to decide if one model was better than another, even though that's usually a critical step. We generated FPRs/TPRs for all the crime categories and plotted those. The best way may be to fix an acceptable FPR at something like 5% or 10% and see what maximizes the mean TPR across a set of desired categories. In short, there's not a solid answer here, and refining this would be super helpful in its own right. 124 | 125 | #### How might it be improved? 126 | 127 | * Use a better vectorizer than bag-of-words, e.g. GloVe as used for the geostring model. 128 | * We briefly tried a naive Bayes classifier in place of the logistic regression and it didn't seem to improve performance, but naive Bayes is usually the baseline for these kinds of tasks. Could it be made to work better? 129 | * Add more examples of articles that have *no* tags. Right now we randomly sample 3000 such articles, but we could probably use more. This may help with an observed problem where some sports articles have a high chance of being about a crime according to the model (likely due to the high use of words like "shoot"). 130 | 131 | ### The geostring extractor model 132 | 133 | #### What is it? 134 | 135 | The geostring model assigns a word-by-word probability that each word is part of a "geostring". A "geostring" is a sequence of words that defines a location. Geostrings can be fairly precise street addresses, as in "the shooting happened at the *corner of 55th and Woodlawn*", or fuzzier locations such as a neighborhood name, a church name, etc.
The per-word probabilities can be thresholded, and we take each consecutive run of words above the threshold as a geostring inside the article. 136 | 137 | The model code can be found in `lib/tagnews/geoloc/models/lstm/save_model.py`. It's 150 lines of Python code, a good portion of which is trying to hit an external internet API. The Keras library is used extensively. 138 | 139 | The model relies on the pre-trained semantic word vectorizer GloVe to get a 50-dimensional feature vector for each word, and then a two-layer bidirectional LSTM is used to generate the probabilities. 140 | 141 | #### How to train it? 142 | 143 | The `save_model.py` file can be run as a script to save the trained model: 144 | 145 | ``` 146 | uv run python -m tagnews.geoloc.models.lstm.save_model 147 | ``` 148 | 149 | The model is saved under `lib/tagnews/geoloc/models/lstm/saved/weights-*.hdf5`. The code will run for a set number of training epochs (one epoch is one pass through all of the training examples), saving the weights after each epoch. 150 | 151 | #### How to measure performance? 152 | 153 | Download the validation data from https://geo-extract-tester.herokuapp.com/ (there is also training data available for downloading). Follow the instructions on that website to upload guesses, and the ROC curve will be shown for your model's predictions. If you have a higher AUC than the current high score, congratulations! Please submit a Pull Request! 154 | 155 | You can also upload your model's predictions via an API. There is code inside `lib/tagnews/geoloc/models/lstm/save_model.py` demonstrating this. 156 | 157 | #### How might it be improved? 158 | 159 | * Include "naive" models that do simple look-ups against Chicago street names. 160 | * Use a word vectorizer that handles out-of-vocabulary words better (perhaps `FastText`?). 161 | * Just use a character-level CNN? 162 | * Augment the training data by labeling more articles (see the "I want to contribute to Chicago Justice Project but I don’t want to work on this NLP stuff. What can I do?" section). 163 | 164 | ### The geocoding 165 | 166 | #### What is it? 167 | 168 | Geocoding here refers to the process of sending a geostring (e.g. "55th and Woodlawn") to an external service to retrieve a best-guess latitude/longitude pair for where that geostring refers to. 169 | 170 | Right now, the geocoding is done using an instance of Pelias hosted by CJP. 171 | 172 | The code can be found in `lib/tagnews/geoloc/tag.py`, in the `get_lat_longs_from_geostrings` function. 173 | 174 | #### How might it be improved? 175 | 176 | * Improve post-processing of geostrings (we do rudimentary things like append "Chicago, Illinois", but we could get more sophisticated). 177 | * Improve the inputs to it by improving the geostring model. 178 | * Improve the inputs by making a better post-processor of geostrings. 179 | * Improve the confidence score. 180 | 181 | #### What if it breaks? 182 | 183 | The last time the geocoding broke, it was because the service started checking for browser-like headers, so we updated our requests to send browser-like headers. Something like this may happen again, and unfortunately there's no real playbook here. 184 | 185 | The good news is that the geostrings will always be there, and if needed we can always re-process any geocoding that doesn't work. 186 | 187 | ## Testing 188 | 189 | ### The test suite 190 | 191 | You can find the tests in `lib/tagnews/tests/`. We use `pytest` as the test runner.
The test coverage isn't phenomenal, but it's not terrible either. We always welcome Pull Requests adding more and better tests! 192 | 193 | ### Running locally 194 | 195 | You need the data to run the tests. If you have the data, great! You should be able to run the tests. If you don't have the data, you can use the tiny subset of the data stored in `lib/tagnews/data/ci-data/`: 196 | 197 | ``` 198 | cp lib/tagnews/data/ci-data/*.csv lib/tagnews/data 199 | ``` 200 | 201 | Make sure you have downloaded the GloVe vectors (see Getting Started above). 202 | 203 | Beware that if you run the tests with the full dataset, it can take a _long_ time and use a _lot_ of memory. 204 | 205 | If you don't already have a type-of-crime or geostring model, you will need to train one (see above). 206 | 207 | Once that's completed, run: 208 | 209 | ``` 210 | uv run pytest --cov-report term-missing --cov=tagnews 211 | ``` 212 | 213 | ### Continuous Integration Testing 214 | 215 | We use GitHub Actions for continuous integration testing. Right now, this is run manually under Actions in GitHub. 216 | 217 | This is configured via the `.github/workflows/publish.yml` file at the top level of this project. 218 | 219 | ## Documentation 220 | 221 | ### How to write it? 222 | 223 | Write it in this very file! Or the README.md file! 224 | 225 | ### How to publish it? 226 | 227 | Documentation is not currently published. If you have interest in helping with this, submit a Pull Request! 228 | 229 | ## Publishing a new version to PyPI 230 | 231 | First, update the version string in `pyproject.toml`; start by bumping the version and making it a release candidate, e.g. `1.1.0rc1`. 232 | 233 | Second, make sure the saved models either match the previously published version exactly (by downloading the current release, extracting it, and copying the model file to where it needs to be), or are _meant_ to be updated. Make sure only the saved model you want exists in your project; delete all others. 234 | 235 | Then, use GitHub Actions to run the workflow using the `pypi` environment. This can also be tested first by publishing to TestPyPI. To do so, run the same GitHub Action manually but use the `testpypi` environment. 236 | 237 | Create a new Anaconda environment and install the new version in it for rudimentary testing. The Continuous Integration should take care of most of the rigorous testing; this is just to make sure everything is working. I usually run through the example at the top of the README (see also the smoke-test sketch at the end of this document). 238 | 239 | Once you are happy, remove the `rc*` suffix and publish as the actual version. You should then create a [release](https://github.com/chicago-justice-project/article-tagging/releases) on GitHub, attempting to log all the changes and attach the tarball created by `python setup.py sdist`. 240 | 241 | *Note: PyPI has a limit on the size of projects that can be uploaded, and PyPI was recently migrated to a new data warehouse. We originally had to request a size increase in [this issue](https://github.com/pypa/packaging-problems/issues/119).* 242 | 243 | ## I want to contribute to Chicago Justice Project but I don’t want to work on this NLP stuff. What can I do? 244 | 245 | You can help out [the team scraping articles/maintaining the volunteers' web interface](https://github.com/chicago-justice-project/chicago-justice). If that doesn't sound interesting either, we can always use more [volunteer taggers](http://chicagojustice.org/volunteer-for-cjp/). Or just show up Tuesday nights at Chi Hack Night and ask what you can do!
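## Appendix: a quick smoke-test sketch

As mentioned in the publishing section above, it is worth doing a rudimentary check that a freshly installed (or freshly trained) package actually works end to end. The snippet below is a minimal sketch of such a check built on the `CrimeTags` class from `lib/tagnews/crimetype/tag.py`. It assumes a trained type-of-crime model has already been saved (see "Generate the models" above), and the article text is made up purely for illustration.

```
from tagnews.crimetype.tag import CrimeTags

# Loads the most recently saved classifier/vectorizer pair
# (see load_model in lib/tagnews/crimetype/tag.py).
tagger = CrimeTags()

# Made-up article text, used only to exercise the API.
text = "A man was shot Tuesday night on the South Side, police said."

print(tagger.tagtext_proba(text).head())      # per-tag probabilities, largest first
print(tagger.tagtext(text, prob_thresh=0.5))  # tags with probability above 0.5
print(tagger.relevant(text))                  # True if any tag clears the relevance threshold
```

If this runs without errors and prints plausible tags, then model loading, vectorization, and prediction are all wired up correctly.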
246 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_usercoding_categories.csv: -------------------------------------------------------------------------------- 1 | 92,132,17 2 | 93,132,26 3 | 94,132,20 4 | 95,132,13 5 | 96,132,37 6 | 188,219,17 7 | 189,219,26 8 | 190,219,31 9 | 191,219,10 10 | 192,219,29 11 | 1557,1728,25 12 | 1558,1728,2 13 | 1559,1728,31 14 | 1560,1728,17 15 | 1956,2227,8 16 | 1957,2227,17 17 | 1958,2227,2 18 | 1959,2227,20 19 | 1960,2227,13 20 | 3667,3789,30 21 | 4014,4069,31 22 | 4126,4146,4 23 | 4127,4146,2 24 | 4128,4146,11 25 | 4129,4146,12 26 | 4130,4146,17 27 | 4219,4233,2 28 | 4220,4233,3 29 | 4221,4233,4 30 | 4222,4233,11 31 | 4223,4233,12 32 | 4224,4233,17 33 | 4225,4233,21 34 | 4345,4302,24 35 | 4346,4302,17 36 | 4347,4302,26 37 | 4348,4302,13 38 | 4993,4856,3 39 | 4994,4856,37 40 | 4995,4856,13 41 | 4996,4856,17 42 | 4997,4856,26 43 | 4998,4856,31 44 | 5924,5770,28 45 | 5925,5770,12 46 | 5926,5770,22 47 | 5927,5770,38 48 | 6620,6327,2 49 | 6621,6327,3 50 | 6622,6327,4 51 | 6623,6327,7 52 | 6624,6327,13 53 | 6625,6327,17 54 | 6626,6327,24 55 | 6627,6327,26 56 | 6628,6327,31 57 | 6771,6398,17 58 | 6772,6398,2 59 | 6773,6398,5 60 | 6774,6398,6 61 | 6775,6398,31 62 | 6846,6480,17 63 | 6847,6480,2 64 | 6848,6480,27 65 | 6849,6480,13 66 | 6850,6480,22 67 | 7256,6776,17 68 | 7257,6776,2 69 | 7258,6776,35 70 | 7259,6776,26 71 | 7260,6776,31 72 | 7930,7060,2 73 | 7931,7060,4 74 | 7932,7060,5 75 | 7933,7060,17 76 | 7934,7060,28 77 | 7935,7060,29 78 | 8069,7107,36 79 | 8472,7254,11 80 | 8473,7254,12 81 | 8474,7254,13 82 | 8475,7254,15 83 | 8476,7254,26 84 | 8477,7254,31 85 | 8747,7362,17 86 | 8748,7362,2 87 | 8749,7362,27 88 | 8750,7362,12 89 | 8751,7362,31 90 | 9696,7710,17 91 | 9697,7710,26 92 | 9698,7710,28 93 | 9703,7713,36 94 | 9704,7713,29 95 | 9705,7713,31 96 | 10009,7956,2 97 | 10010,7956,3 98 | 10011,7956,13 99 | 10012,7956,17 100 | 10013,7956,22 101 | 10014,7956,23 102 | 10408,8373,2 103 | 10409,8373,17 104 | 10410,8373,22 105 | 10411,8373,25 106 | 10412,8373,26 107 | 10413,8373,29 108 | 11018,8769,13 109 | 11019,8769,21 110 | 11020,8769,31 111 | 11523,9304,8 112 | 12295,9932,17 113 | 12296,9932,2 114 | 12297,9932,13 115 | 12298,9932,37 116 | 12384,9987,2 117 | 12385,9987,37 118 | 12386,9987,12 119 | 12387,9987,13 120 | 12388,9987,17 121 | 12389,9987,22 122 | 12390,9987,24 123 | 12404,9997,8 124 | 12405,9997,34 125 | 12406,9997,4 126 | 12407,9997,37 127 | 12791,10301,32 128 | 12792,10301,26 129 | 13137,10604,17 130 | 13138,10604,34 131 | 13139,10604,20 132 | 13140,10604,37 133 | 13141,10604,31 134 | 13412,10874,2 135 | 13413,10874,35 136 | 13414,10874,31 137 | 14046,11159,17 138 | 14047,11159,2 139 | 14048,11159,10 140 | 14779,11619,32 141 | 14780,11619,4 142 | 15204,11819,2 143 | 15205,11819,3 144 | 15206,11819,13 145 | 15207,11819,18 146 | 15208,11819,22 147 | 15209,11819,24 148 | 15255,11847,28 149 | 15256,11847,31 150 | 15321,11896,32 151 | 15322,11896,2 152 | 15323,11896,37 153 | 16089,12822,29 154 | 16448,13406,16 155 | 17681,15416,2 156 | 17682,15416,3 157 | 17683,15416,4 158 | 17684,15416,5 159 | 17685,15416,38 160 | 17686,15416,13 161 | 17687,15416,17 162 | 17688,15416,22 163 | 17689,15416,24 164 | 17690,15416,31 165 | 18898,16772,33 166 | 18899,16772,2 167 | 18900,16772,26 168 | 18901,16772,9 169 | 19360,17292,2 170 | 19361,17292,3 171 | 19362,17292,4 172 | 19363,17292,5 173 | 19364,17292,7 174 | 19365,17292,34 175 | 19366,17292,13 176 | 19367,17292,17 177 | 19368,17292,31 178 
| 19739,17509,17 179 | 19740,17509,2 180 | 19741,17509,26 181 | 19742,17509,13 182 | 19856,17808,18 183 | 19857,17808,2 184 | 20268,18167,17 185 | 20269,18167,2 186 | 20270,18167,13 187 | 20271,18167,26 188 | 20272,18167,5 189 | 20535,18630,17 190 | 20536,18630,2 191 | 20537,18630,4 192 | 20538,18630,13 193 | 20664,18783,17 194 | 20665,18783,2 195 | 20666,18783,31 196 | 22007,21221,34 197 | 22008,21221,3 198 | 22009,21221,4 199 | 22010,21221,37 200 | 22011,21221,7 201 | 22012,21221,13 202 | 22013,21221,26 203 | 22577,21766,32 204 | 22578,21766,17 205 | 22579,21766,26 206 | 22586,21772,32 207 | 22587,21772,17 208 | 22588,21772,26 209 | 23089,22428,24 210 | 23090,22428,17 211 | 23091,22428,2 212 | 23092,22428,13 213 | 23657,23050,26 214 | 26028,25991,25 215 | 26029,25991,18 216 | 26030,25991,2 217 | 26865,27133,33 218 | 26866,27133,34 219 | 26867,27133,3 220 | 26868,27133,4 221 | 26869,27133,5 222 | 26870,27133,8 223 | 26871,27133,2 224 | 26872,27133,22 225 | 26873,27133,26 226 | 27534,28032,27 227 | 27535,28032,2 228 | 27536,28032,11 229 | 27537,28032,12 230 | 27538,28032,38 231 | 27558,28126,27 232 | 27559,28126,2 233 | 27560,28126,11 234 | 27561,28126,12 235 | 27562,28126,38 236 | 29188,31220,17 237 | 29189,31220,2 238 | 29190,31220,26 239 | 29191,31220,13 240 | 30539,33722,21 241 | 30540,33722,31 242 | 30746,33873,21 243 | 30747,33873,31 244 | 30880,33938,18 245 | 30881,33938,2 246 | 30882,33938,21 247 | 31065,34093,30 248 | 32106,35017,33 249 | 32107,35017,26 250 | 33234,37023,33 251 | 33945,38136,34 252 | 33946,38136,2 253 | 33947,38136,13 254 | 33948,38136,17 255 | 33949,38136,20 256 | 33950,38136,22 257 | 33951,38136,26 258 | 33952,38136,31 259 | 34972,39556,24 260 | 34973,39556,2 261 | 34974,39556,26 262 | 34975,39556,13 263 | 35445,40077,25 264 | 35446,40077,35 265 | 35538,40153,25 266 | 35539,40153,15 267 | 35540,40153,31 268 | 35662,40306,9 269 | 35663,40306,26 270 | 35664,40306,13 271 | 35665,40306,15 272 | 36261,40871,1 273 | 36262,40871,26 274 | 36263,40871,3 275 | 36264,40871,4 276 | 36265,40871,17 277 | 36998,41820,2 278 | 36999,41820,13 279 | 37000,41820,15 280 | 37001,41820,17 281 | 37002,41820,18 282 | 37003,41820,24 283 | 37004,41820,27 284 | 37019,41852,24 285 | 37020,41852,17 286 | 37021,41852,2 287 | 37022,41852,13 288 | 38121,43350,2 289 | 38122,43350,3 290 | 38123,43350,4 291 | 38124,43350,14 292 | 38125,43350,17 293 | 38126,43350,22 294 | 38127,43350,31 295 | 38536,44004,2 296 | 38537,44004,37 297 | 38844,44438,1 298 | 38845,44438,2 299 | 38846,44438,31 300 | 38847,44438,4 301 | 38848,44438,17 302 | 40129,46592,17 303 | 40130,46592,2 304 | 40131,46592,11 305 | 40132,46592,4 306 | 40776,47805,1 307 | 40777,47805,2 308 | 40778,47805,37 309 | 40779,47805,9 310 | 40780,47805,13 311 | 40781,47805,17 312 | 40782,47805,26 313 | 42718,53469,2 314 | 42719,53469,4 315 | 42720,53469,5 316 | 42721,53469,6 317 | 42722,53469,9 318 | 42723,53469,17 319 | 43582,55829,2 320 | 43583,55829,22 321 | 43584,55829,31 322 | 44878,57826,2 323 | 44879,57826,37 324 | 44880,57826,10 325 | 44881,57826,34 326 | 44882,57826,17 327 | 44883,57826,20 328 | 44884,57826,22 329 | 44885,57826,31 330 | 45070,58242,6 331 | 45071,58242,22 332 | 45239,58848,2 333 | 45240,58848,26 334 | 45241,58848,38 335 | 45287,59062,2 336 | 45288,59062,23 337 | 45614,59597,17 338 | 45615,59597,2 339 | 45616,59597,11 340 | 45617,59597,12 341 | 45618,59597,21 342 | 46443,60721,34 343 | 46444,60721,5 344 | 46713,60981,2 345 | 46714,60981,11 346 | 46715,60981,12 347 | 46716,60981,21 348 | 53495,72429,16 349 | 53496,72429,13 
350 | 54842,74956,24 351 | 54843,74956,17 352 | 54844,74956,2 353 | 54845,74956,13 354 | 54846,74956,22 355 | 55056,75398,24 356 | 55057,75398,2 357 | 55058,75398,13 358 | 55059,75398,22 359 | 55202,75663,24 360 | 55203,75663,2 361 | 55204,75663,31 362 | 55443,76411,35 363 | 55855,77257,26 364 | 55856,77257,13 365 | 56439,78695,17 366 | 56440,78695,2 367 | 56441,78695,23 368 | 57674,80662,28 369 | 57675,80662,38 370 | 57952,81047,34 371 | 57953,81047,2 372 | 57954,81047,37 373 | 57955,81047,22 374 | 57956,81047,13 375 | 58417,81846,17 376 | 58418,81846,2 377 | 58419,81846,26 378 | 58420,81846,31 379 | 59053,82527,2 380 | 59054,82527,3 381 | 59055,82527,4 382 | 59056,82527,7 383 | 59057,82527,13 384 | 59058,82527,16 385 | 59059,82527,17 386 | 59060,82527,22 387 | 59061,82527,26 388 | 59118,82584,1 389 | 59119,82584,18 390 | 59120,82584,2 391 | 59298,82926,18 392 | 59299,82926,2 393 | 59730,83912,31 394 | 61701,87771,21 395 | 61702,87771,6 396 | 61712,87800,18 397 | 61713,87800,2 398 | 63545,90724,14 399 | 63853,91284,29 400 | 64460,92580,2 401 | 64461,92580,10 402 | 64462,92580,15 403 | 65238,94294,36 404 | 65239,94294,31 405 | 65337,94603,9 406 | 65590,95282,24 407 | 65591,95282,2 408 | 65592,95282,26 409 | 65593,95282,13 410 | 66169,96332,9 411 | 66170,96332,26 412 | 66192,96361,23 413 | 66811,97506,3 414 | 66812,97506,35 415 | 66813,97506,19 416 | 66814,97506,15 417 | 66866,97740,26 418 | 66867,97740,4 419 | 66868,97740,5 420 | 66869,97740,7 421 | 67402,98644,7 422 | 67403,98644,26 423 | 67404,98644,3 424 | 67405,98644,37 425 | 67406,98644,13 426 | 67722,99538,2 427 | 67723,99538,13 428 | 68485,101082,17 429 | 68486,101082,2 430 | 68487,101082,26 431 | 68488,101082,13 432 | 69076,102047,17 433 | 69077,102047,3 434 | 69078,102047,37 435 | 70987,106009,17 436 | 70988,106009,12 437 | 70989,106009,15 438 | 70993,106027,14 439 | 72798,109564,36 440 | 72799,109564,31 441 | 74755,113250,17 442 | 74756,113250,2 443 | 74757,113250,3 444 | 74758,113250,4 445 | 74759,113250,26 446 | 75448,114030,25 447 | 75449,114030,3 448 | 75450,114030,13 449 | 75451,114030,17 450 | 75872,114749,17 451 | 75873,114749,2 452 | 75874,114749,3 453 | 75875,114749,26 454 | 75876,114749,13 455 | 76825,115899,2 456 | 76826,115899,3 457 | 76827,115899,10 458 | 76828,115899,11 459 | 76829,115899,17 460 | 76830,115899,26 461 | 77700,117328,8 462 | 77701,117328,5 463 | 78344,118240,9 464 | 78345,118240,2 465 | 78346,118240,26 466 | 78347,118240,6 467 | 78348,118240,17 468 | 79074,119322,9 469 | 79075,119322,15 470 | 79076,119322,31 471 | 80971,122057,23 472 | 80972,122057,22 473 | 80973,122057,15 474 | 82526,124718,2 475 | 82527,124718,3 476 | 82528,124718,10 477 | 82529,124718,34 478 | 82530,124718,15 479 | 82531,124718,17 480 | 82532,124718,19 481 | 82533,124718,22 482 | 83029,125488,2 483 | 83030,125488,35 484 | 83031,125488,15 485 | 83032,125488,25 486 | 83033,125488,26 487 | 83034,125488,31 488 | 83166,125760,17 489 | 83167,125760,5 490 | 83168,125760,37 491 | 84413,127428,26 492 | 84414,127428,11 493 | 84554,127697,5 494 | 84555,127697,6 495 | 84727,128087,30 496 | 85049,128637,17 497 | 85050,128637,26 498 | 85051,128637,13 499 | 85052,128637,37 500 | 85058,128662,4 501 | 85059,128662,36 502 | 85060,128662,22 503 | 85061,128662,31 504 | 85197,128857,1 505 | 85198,128857,6 506 | 85419,129483,17 507 | 85420,129483,2 508 | 85421,129483,37 509 | 85662,129899,30 510 | 86270,131048,8 511 | 86271,131048,17 512 | 86272,131048,13 513 | 86273,131048,31 514 | 86726,131765,37 515 | 86727,131765,13 516 | 86728,131765,31 517 | 
86949,132243,15 518 | 86950,132243,2 519 | 86951,132243,3 520 | 86952,132243,31 521 | 89412,135388,34 522 | 89413,135388,37 523 | 89414,135388,13 524 | 89415,135388,2 525 | 89416,135388,15 526 | 89417,135388,17 527 | 89684,135705,34 528 | 89685,135705,37 529 | 89686,135705,2 530 | 89687,135705,17 531 | 89688,135705,19 532 | 89689,135705,26 533 | 91458,139290,16 534 | 91459,139290,5 535 | 91460,139290,21 536 | 92175,140867,2 537 | 92176,140867,12 538 | 92177,140867,22 539 | 92178,140867,31 540 | 92362,141314,33 541 | 92363,141314,34 542 | 92364,141314,19 543 | 92365,141314,9 544 | 92366,141314,31 545 | 92726,142222,2 546 | 92727,142222,3 547 | 92728,142222,31 548 | 93025,142937,17 549 | 93026,142937,3 550 | 93027,142937,14 551 | 96046,150134,13 552 | 96047,150134,22 553 | 96890,151573,17 554 | 96891,151573,2 555 | 96892,151573,19 556 | 96893,151573,37 557 | 97924,153541,17 558 | 97925,153541,10 559 | 97926,153541,6 560 | 98180,154021,17 561 | 98181,154021,34 562 | 98182,154021,19 563 | 98183,154021,37 564 | 98184,154021,31 565 | 98335,154234,36 566 | 98935,155785,13 567 | 98936,155785,37 568 | 99813,157346,25 569 | 99814,157346,10 570 | 99815,157346,20 571 | 99816,157346,6 572 | 99817,157346,17 573 | 99993,157684,24 574 | 99994,157684,17 575 | 99995,157684,2 576 | 99996,157684,26 577 | 99997,157684,13 578 | 100190,157999,2 579 | 100191,157999,27 580 | 100192,157999,13 581 | 101008,159958,33 582 | 101009,159958,15 583 | 101121,160087,14 584 | 101142,160137,34 585 | 101143,160137,2 586 | 101144,160137,37 587 | 101145,160137,23 588 | 101815,162892,26 589 | 101816,162892,35 590 | 101817,162892,4 591 | 102512,165333,2 592 | 102513,165333,3 593 | 102514,165333,6 594 | 102515,165333,9 595 | 102516,165333,17 596 | 102517,165333,31 597 | 104832,177862,2 598 | 104833,177862,10 599 | 105683,180249,2 600 | 105684,180249,3 601 | 105685,180249,4 602 | 105686,180249,12 603 | 105687,180249,13 604 | 105688,180249,17 605 | 106200,184791,2 606 | 106201,184791,3 607 | 106202,184791,36 608 | 106343,185415,17 609 | 106344,185415,26 610 | 106345,185415,13 611 | 106346,185415,7 612 | 106712,186861,17 613 | 106713,186861,2 614 | 106714,186861,27 615 | 106715,186861,13 616 | 106716,186861,15 617 | 107697,193258,2 618 | 107698,193258,11 619 | 107699,193258,4 620 | 107700,193258,5 621 | 107701,193258,13 622 | 108098,195838,17 623 | 108099,195838,2 624 | 108100,195838,11 625 | 108880,199329,24 626 | 108881,199329,22 627 | 109121,200327,35 628 | 109606,203291,17 629 | 109607,203291,3 630 | 109608,203291,30 631 | 109943,204359,27 632 | 109944,204359,12 633 | 109945,204359,21 634 | 109946,204359,39 635 | 110046,204592,2 636 | 110047,204592,37 637 | 110188,204900,40 638 | 110189,204900,27 639 | 110190,204900,39 640 | 110194,204902,11 641 | 110195,204902,12 642 | 110196,204902,13 643 | 110197,204902,17 644 | 110198,204902,26 645 | 110199,204902,28 646 | 110583,205755,34 647 | 110632,205788,16 648 | 110633,205788,22 649 | 112637,210278,24 650 | 112638,210278,17 651 | 112639,210278,26 652 | 112640,210278,13 653 | 112641,210278,15 654 | 113538,211531,2 655 | 113539,211531,35 656 | 113540,211531,6 657 | 113588,211597,2 658 | 113589,211597,27 659 | 113590,211597,21 660 | 113591,211597,39 661 | 113861,212076,2 662 | 113862,212076,27 663 | 113863,212076,12 664 | 113864,212076,38 665 | 113913,212124,2 666 | 113914,212124,3 667 | 113915,212124,39 668 | 113916,212124,11 669 | 113917,212124,12 670 | 113918,212124,27 671 | 114202,212536,2 672 | 114203,212536,35 673 | 114204,212536,26 674 | 114205,212536,5 675 | 116228,221833,17 676 | 
116229,221833,5 677 | 116372,222644,28 678 | 116373,222644,13 679 | 116437,222854,3 680 | 116438,222854,8 681 | 116439,222854,15 682 | 116440,222854,17 683 | 116441,222854,26 684 | 116442,222854,31 685 | 116471,222947,24 686 | 116472,222947,26 687 | 116473,222947,13 688 | 116474,222947,15 689 | 116540,223183,2 690 | 116541,223183,11 691 | 116542,223183,12 692 | 116543,223183,37 693 | 116544,223183,38 694 | 116891,224330,2 695 | 116892,224330,3 696 | 116893,224330,4 697 | 116894,224330,40 698 | 116895,224330,12 699 | 116896,224330,26 700 | 117414,225660,27 701 | 117415,225660,12 702 | 117416,225660,39 703 | 118134,227740,25 704 | 118135,227740,26 705 | 118136,227740,17 706 | 118389,228325,2 707 | 118390,228325,37 708 | 118391,228325,10 709 | 118392,228325,17 710 | 118393,228325,22 711 | 118394,228325,31 712 | 119209,230878,33 713 | 119210,230878,15 714 | 119808,233151,17 715 | 119809,233151,10 716 | 119810,233151,15 717 | 119817,233177,34 718 | -------------------------------------------------------------------------------- /lib/tagnews/geoloc/tag.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import glob 4 | import json 5 | import os 6 | import re 7 | import time 8 | from collections import namedtuple 9 | from contextlib import ExitStack, redirect_stderr 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import requests 14 | from shapely.geometry import shape, Point 15 | 16 | from tagnews.utils.neighborhoods import neighborhoods 17 | from .. import utils 18 | 19 | with ExitStack() as stack: 20 | null_stream = open(os.devnull, "w") 21 | stack.enter_context(null_stream) 22 | stack.enter_context(redirect_stderr(null_stream)) 23 | import keras 24 | 25 | """ 26 | Contains the GeoCoder class that allows extracting geostrings from articles and geocoding them. 27 | """ 28 | 29 | MODEL_LOCATION = os.path.join( 30 | os.path.split(__file__)[0], os.path.join("models", "lstm", "saved") 31 | ) 32 | 33 | COMMUNITY_AREAS_FILE = os.path.join( 34 | os.path.split(__file__)[0], 35 | "..", 36 | "data", 37 | "Boundaries - Community Areas (current).geojson", 38 | ) 39 | 40 | 41 | def post_process(geostring): 42 | """ 43 | Post process the geostring in a way that makes it more amenable to 44 | geocoding by the current geocoding provider GISgraphy. 45 | 46 | Inputs 47 | ------ 48 | geostring : str 49 | The geostring to post process 50 | 51 | Returns 52 | ------- 53 | processed_geostring : str 54 | """ 55 | # Merge multiple whitespaces into one 56 | geostring = " ".join(geostring.split()) 57 | 58 | # gisgraphy struggles with things like "55th and Woodlawn". 59 | # replace "... and..." 60 | # with two zeros. 61 | # \100 does not work correctly so we need to add a separator. 62 | geostring = re.sub( 63 | r"([0-9]+)(th|rd|st) and", r"\1<__internal_separator__>00 and", geostring 64 | ) 65 | geostring = geostring.replace("<__internal_separator__>", "") 66 | 67 | # remove stopwords, only if they are internal, i.e. 68 | # the geostring doesn't start with "block ...".
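    # Illustrative sketch of the intended behavior (these example inputs are not from
    # the original source): given the regex rewrite above and the stopword removal below,
    #   post_process("55th and Woodlawn")        -> "5500 Woodlawn"
    #   post_process("1300 block of W. Halsted") -> "1300 W. Halsted"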
69 | for stopword in ["block", "of", "and"]: 70 | geostring = geostring.replace(" {} ".format(stopword), " ") 71 | 72 | return geostring 73 | 74 | 75 | _base_geocoder_url = ( 76 | "http://ec2-34-228-58-223.compute-1.amazonaws.com" ":4000/v1/search?text={}" 77 | ) 78 | 79 | GeocodeResults = namedtuple( 80 | "GeocodeResults", 81 | [ 82 | "coords_raw", 83 | "full_responses_raw", 84 | "scores_raw", 85 | "coords_post", 86 | "full_responses_post", 87 | "scores_post", 88 | ], 89 | ) 90 | 91 | 92 | def get_lat_longs_from_geostrings( 93 | geostring_list, 94 | post_process_f=None, 95 | sleep_secs=0, 96 | geocoder_url_formatter=_base_geocoder_url, 97 | ): 98 | """ 99 | Geo-code each geostring in `geostring_list` into lat/long values. 100 | Also return the full response from the geocoding service. 101 | 102 | Inputs 103 | ------ 104 | geostring_list : list of strings 105 | The list of geostrings to geocode into lat/longs. 106 | post_process_f : function 107 | The results are returned for both the raw geostrings being 108 | passed to the geocoder, and the results of 109 | `post_process_f(geostring)` being passed to the geocoder. 110 | sleep_secs : float 111 | How long to sleep between successive requests, in seconds. 112 | geocoder_url_formatter : str 113 | A string with a "{}" in it where the text should be input, e.g. 114 | "http://our-pelias.biz:4000/v1/search?text={}". 115 | 116 | Returns 117 | ------- 118 | GeocodeResults : namedtuple 119 | A named tuple with the following fields: 120 | coords_raw : pandas.DataFrame 121 | The length `n` DataFrame of lat/long values. Values are NaN 122 | if the geocoder returned no results. 123 | full_responses_raw : list 124 | The length `n` list of the full responses from the geocoding 125 | service. 126 | scores_raw : numpy.array 127 | Numpy array of the confidence scores of the responses. 128 | coords_post : pandas.DataFrame 129 | The length `n` DataFrame of lat/long values. Values are NaN 130 | if the geocoder returned no results. 131 | full_responses_post : list 132 | The length `n` list of the full responses of the post-processed 133 | geostrings. 134 | scores_post : numpy.array 135 | Numpy array of the confidence scores of the responses. 136 | """ 137 | if post_process_f is None: 138 | post_process_f = post_process 139 | 140 | def _geocode(lst): 141 | full_responses = [] 142 | for addr_str in lst: 143 | try: 144 | g = json.loads( 145 | requests.get(geocoder_url_formatter.format(addr_str)).text 146 | ) 147 | except Exception: 148 | g = {} 149 | full_responses.append(g) 150 | time.sleep(sleep_secs) 151 | 152 | def _get_latlong(g): 153 | try: 154 | return g["features"][0]["geometry"]["coordinates"] 155 | except (KeyError, IndexError): 156 | return [np.nan, np.nan] 157 | 158 | def _get_confidence(g): 159 | try: 160 | return g["features"][0]["properties"]["confidence"] 161 | except (KeyError, IndexError): 162 | return np.nan 163 | 164 | coords = pd.DataFrame( 165 | [_get_latlong(g) for g in full_responses], columns=["long", "lat"] 166 | ) 167 | coords = coords[["lat", "long"]] # it makes me feel better, OK? 
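        # Descriptive note: the geocoder responses are GeoJSON-style (e.g. from a
        # Pelias-like service), so each "coordinates" entry is [longitude, latitude];
        # the DataFrame is therefore built with columns ["long", "lat"] and reordered
        # to ["lat", "long"] just above.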
168 | scores = np.asarray([_get_confidence(g) for g in full_responses]).astype(np.float32) 169 | 170 | return full_responses, coords, scores 171 | 172 | full_responses_raw, coords_raw, scores_raw = _geocode(geostring_list) 173 | 174 | full_responses_post, coords_post, scores_post = _geocode( 175 | [post_process_f(geo_s) for geo_s in geostring_list] 176 | ) 177 | 178 | return GeocodeResults( 179 | coords_raw=coords_raw, 180 | full_responses_raw=full_responses_raw, 181 | scores_raw=scores_raw, 182 | coords_post=coords_post, 183 | full_responses_post=full_responses_post, 184 | scores_post=scores_post, 185 | ) 186 | 187 | 188 | def load_model(location=MODEL_LOCATION): 189 | """ 190 | Load a model from the given folder `location`. 191 | There should be at least one weights file matching "weights*.hdf5" 192 | inside the folder. 193 | 194 | The file with the most recent timestamp is loaded. 195 | """ 196 | models = glob.glob(os.path.join(location, "weights*.hdf5")) 197 | if not models: 198 | raise RuntimeError( 199 | ( 200 | "No models to load. Run" 201 | ' "python -m tagnews.geoloc.models.' 202 | 'lstm.save_model"' 203 | ) 204 | ) 205 | 206 | model = keras.models.load_model(models[-1]) 207 | 208 | return model 209 | 210 | 211 | class GeoCoder: 212 | def __init__(self): 213 | self.model = load_model() 214 | self.glove = utils.load_vectorizer.load_glove( 215 | os.path.join(os.path.split(__file__)[0], "../data/glove.6B.50d.txt") 216 | ) 217 | with open(COMMUNITY_AREAS_FILE) as f: 218 | d = json.load(f) 219 | self.com_areas = { 220 | f["properties"]["community"]: shape(f["geometry"]) 221 | for f in d["features"] 222 | } 223 | 224 | def pre_process(self, s): 225 | """ 226 | Takes in a string which is the text of an article and returns the tuple 227 | `(words, data)` where `words` is the list of words found and `data` 228 | is the 3D numpy array that contains the numeric data that can be used 229 | by the trained model. 230 | 231 | Inputs 232 | ------ 233 | s : str 234 | Article text. 235 | 236 | Returns 237 | ------- 238 | words : list of strings 239 | The words found in the article. 240 | data : 3D numpy.array 241 | Has shape (1, N, M) where N is the number of words and M 242 | is the size of the word vectors, currently M is 51. 243 | """ 244 | words = s.split() # split along white space. 245 | data = pd.concat( 246 | [ 247 | pd.DataFrame([[w[0].isupper()] if w else [False] for w in words]), 248 | (self.glove.reindex(words).fillna(0).reset_index(drop=True)), 249 | ], 250 | axis="columns", 251 | ) 252 | data = np.asarray(data).astype(np.float32) 253 | return words, np.expand_dims(data, axis=0) 254 | 255 | def extract_geostring_probs(self, s): 256 | """ 257 | Extract the probability that each word in s is part of a geostring. 258 | 259 | Inputs 260 | ------ 261 | s : str 262 | Article text. 263 | 264 | Returns 265 | ------- 266 | words : list of strings 267 | The words found in the article. 268 | probs : 1D numpy.array 269 | Has shape (N,) where N is the number of words. 270 | """ 271 | if not s.strip(): 272 | return [[], np.zeros((0,), dtype=np.float32)] 273 | words, data = self.pre_process(s) 274 | probs = self.model.predict(data)[0][:, 1] 275 | return words, probs 276 | 277 | def extract_geostrings(self, s, prob_thresh=0.5): 278 | """ 279 | Extract the geostrings from the article text. 280 | 281 | Inputs 282 | ------ 283 | s : str 284 | Article text.
285 | prob_thresh : float, 0 <= prob_thresh <= 1 286 | The threshold on probability above which words will be 287 | considered as part of a geostring. 288 | DEFAULT: 0.5 289 | 290 | Returns 291 | ------- 292 | geostrings : list of lists of strings 293 | The extracted geostrings from the article text; each word 294 | is kept separate in the list. 295 | Example: [['1300', 'W.', 'Halsted'], ['Ohio']] 296 | probstrings : list of 1D numpy.arrays 297 | """ 298 | words, probs = self.extract_geostring_probs(s) 299 | above_thresh = probs >= prob_thresh 300 | 301 | words = ["filler"] + words + ["filler"] 302 | probs = np.append(0, np.append(probs, 0)) 303 | 304 | above_thresh = np.concatenate([[False], above_thresh, [False]]).astype(np.int32) 305 | switch_ons = np.where(np.diff(above_thresh) == 1)[0] + 1 306 | switch_offs = np.where(np.diff(above_thresh) == -1)[0] + 1 307 | 308 | geostrings = [] 309 | probstrings = [] 310 | for on, off in zip(switch_ons, switch_offs): 311 | geostrings.append(words[on:off]) 312 | probstrings.append(probs[on:off]) 313 | 314 | return geostrings, probstrings 315 | 316 | @staticmethod 317 | def lat_longs_from_geostring_lists(geostring_lists, **kwargs): 318 | """ 319 | Get the latitude/longitude pairs from a list of geostrings as 320 | returned by `extract_geostrings`. Note that `extract_geostrings` 321 | returns a list of lists of words. 322 | 323 | Inputs 324 | ------ 325 | geostring_lists : List[List[str]] 326 | A length-N list of list of strings, as returned by 327 | `extract_geostrings`. 328 | Example: [['5500', 'S.', 'Woodlawn'], ['1700', 'S.', 'Halsted']] 329 | **kwargs : other parameters passed to `get_lat_longs_from_geostrings` 330 | 331 | Returns 332 | ------- 333 | coords : pandas.DataFrame 334 | A pandas DataFrame with columns "lat" and "long". Values are 335 | NaN if the geocoder returned no results. 336 | scores : numpy.array 337 | 1D, length-N numpy array of the scores, higher indicates more 338 | confidence. This is our best guess after massaging the scores 339 | returned by the geocoder, and should not be taken as any sort 340 | of absolute rule. 341 | """ 342 | out = get_lat_longs_from_geostrings( 343 | [" ".join(gl) for gl in geostring_lists], **kwargs 344 | ) 345 | 346 | return out.coords_post, out.scores_post 347 | 348 | def community_area_from_coords(self, coords): 349 | """ 350 | Get the community area name that the coordinate lies in. 351 | 352 | Parameters 353 | ---------- 354 | coords : pandas.DataFrame 355 | A pandas dataframe with columns "lat" and "long". 356 | 357 | Returns 358 | ------- 359 | com_areas : List 360 | A list of community areas, one corresponding to each 361 | row of coords. An empty string indicates that the coord 362 | did not belong to any of the community areas.
363 | """ 364 | out = [] 365 | for _, coord in coords.iterrows(): 366 | p = Point(coord["long"], coord["lat"]) 367 | for com_name, com_shape in self.com_areas.items(): 368 | if com_shape.contains(p): 369 | out.append(com_name) 370 | break 371 | else: 372 | out.append("") 373 | return out 374 | 375 | def best_geostring(self, extracted_strs_and_probs: tuple): 376 | """ 377 | 378 | Parameters 379 | ---------- 380 | extracted_strs_and_probs : 2-tuple 381 | A 2-tuple of two lists containing a list of extracted geostrings at index zero 382 | and a list of extracted geostring probabilities at index one 383 | 384 | Returns 385 | ------- 386 | 2-tuple of one geostring of the best geostring 387 | OR False 388 | """ 389 | consider = [[], []] 390 | for geostring, probs in zip( 391 | extracted_strs_and_probs[0], extracted_strs_and_probs[1] 392 | ): 393 | is_neighborhood = False 394 | for neighborhood in neighborhoods: 395 | if neighborhood.lower() in " ".join(geostring).lower(): 396 | is_neighborhood = True 397 | if is_neighborhood or len(geostring) >= 3: 398 | consider[0].append((geostring)) 399 | consider[1].append((probs)) 400 | if consider[0]: 401 | avgs = [sum(i) / len(i) for i in consider[1]] 402 | max_index = avgs.index(max(avgs)) 403 | return consider[0][max_index] 404 | else: 405 | return '' 406 | 407 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_trainedcoding.csv: -------------------------------------------------------------------------------- 1 | 453893,2018-04-06 04:23:02.331286+00,tagnews 1.0.2,0.9999999493514471,40534 2 | 454415,2018-04-06 04:27:10.231305+00,tagnews 1.0.2,0.9896225820938951,41056 3 | 454491,2018-04-06 04:27:44.699546+00,tagnews 1.0.2,0.9168645188301731,41132 4 | 454644,2018-04-06 04:28:51.487924+00,tagnews 1.0.2,0.999932514377795,41285 5 | 455210,2018-04-06 04:33:12.303175+00,tagnews 1.0.2,0.999287893890405,41851 6 | 456158,2018-04-06 04:40:15.107791+00,tagnews 1.0.2,0.9999899196988399,42800 7 | 456190,2018-04-06 04:40:24.03267+00,tagnews 1.0.2,0.999887094530515,42832 8 | 457690,2018-04-06 04:51:33.089617+00,tagnews 1.0.2,0.9999088677496021,44332 9 | 458343,2018-04-06 04:56:30.545353+00,tagnews 1.0.2,0.907531415964436,44986 10 | 458774,2018-04-06 04:59:43.82406+00,tagnews 1.0.2,0.9979253520065241,45420 11 | 460927,2018-04-06 05:18:24.462887+00,tagnews 1.0.2,0.9100118116350899,47574 12 | 462142,2018-04-06 05:28:22.308611+00,tagnews 1.0.2,0.9999995557910791,48789 13 | 467811,2018-04-06 06:19:54.057142+00,tagnews 1.0.2,0.997743130725998,54456 14 | 470170,2018-04-06 06:40:19.891585+00,tagnews 1.0.2,0.9999775872018041,56816 15 | 472166,2018-04-06 06:56:48.795115+00,tagnews 1.0.2,0.999768276509597,58813 16 | 472582,2018-04-06 07:00:34.682456+00,tagnews 1.0.2,0.8882503668315859,59229 17 | 473190,2018-04-06 07:07:26.558123+00,tagnews 1.0.2,0.9998427982345229,59836 18 | 473404,2018-04-06 07:09:30.134+00,tagnews 1.0.2,0.9929430356020791,60050 19 | 473939,2018-04-06 07:13:31.574314+00,tagnews 1.0.2,0.9964149984724949,60585 20 | 475062,2018-04-06 07:24:31.885005+00,tagnews 1.0.2,0.9367940282996899,61709 21 | 475324,2018-04-06 07:26:42.040181+00,tagnews 1.0.2,0.994860947249206,61971 22 | 487373,2018-04-06 09:16:36.261811+00,tagnews 1.0.2,0.9908202992658509,74059 23 | 491582,2018-04-06 09:53:08.206784+00,tagnews 1.0.2,0.9988898360762579,78269 24 | 492024,2018-04-06 09:57:05.231426+00,tagnews 1.0.2,0.999973199132512,78711 25 | 492289,2018-04-06 09:59:24.69061+00,tagnews 1.0.2,0.999808315805894,78976 26 | 
493040,2018-04-06 10:08:21.531175+00,tagnews 1.0.2,0.9999004512399681,79724 27 | 494190,2018-04-06 10:17:59.220825+00,tagnews 1.0.2,0.999999980935647,80874 28 | 496875,2018-04-06 10:41:33.208945+00,tagnews 1.0.2,0.999483423538713,83560 29 | 500625,2018-04-06 11:15:45.916888+00,tagnews 1.0.2,0.999156888223627,87395 30 | 501009,2018-04-06 11:19:24.342627+00,tagnews 1.0.2,0.999933181042262,87780 31 | 501808,2018-04-06 11:25:46.986691+00,tagnews 1.0.2,0.995632505923119,88579 32 | 502490,2018-04-06 11:31:56.199454+00,tagnews 1.0.2,0.999987595846957,89261 33 | 502547,2018-04-06 11:32:23.489778+00,tagnews 1.0.2,0.9965366249994351,89318 34 | 502889,2018-04-06 11:35:09.913288+00,tagnews 1.0.2,0.9524503271232819,89660 35 | 503875,2018-04-06 11:44:09.105877+00,tagnews 1.0.2,0.99984092412567,90646 36 | 507757,2018-04-06 12:20:35.009847+00,tagnews 1.0.2,0.985260566422226,94509 37 | 507786,2018-04-06 12:20:48.815289+00,tagnews 1.0.2,0.9996901103399721,94538 38 | 510712,2018-04-06 12:46:44.622491+00,tagnews 1.0.2,0.971614759823332,97498 39 | 511271,2018-04-06 12:51:43.420295+00,tagnews 1.0.2,0.9922923576575211,98058 40 | 512592,2018-04-06 13:05:25.884409+00,tagnews 1.0.2,0.99993577514431,99361 41 | 514308,2018-04-06 13:20:56.85264+00,tagnews 1.0.2,0.9911938024182759,101077 42 | 514617,2018-04-06 13:23:49.111337+00,tagnews 1.0.2,0.9138501467819742,101386 43 | 515297,2018-04-06 13:29:21.0853+00,tagnews 1.0.2,0.9999860022949809,102066 44 | 516352,2018-04-06 13:38:53.543665+00,tagnews 1.0.2,0.994323934760144,103122 45 | 516386,2018-04-06 13:39:15.999084+00,tagnews 1.0.2,0.999935627754646,103156 46 | 517535,2018-04-06 13:48:50.789743+00,tagnews 1.0.2,0.9991068685300749,104306 47 | 517769,2018-04-06 13:51:15.925762+00,tagnews 1.0.2,0.98407007504748,104540 48 | 518677,2018-04-06 13:59:46.477226+00,tagnews 1.0.2,0.9999780090434369,105448 49 | 519589,2018-04-06 14:10:03.884068+00,tagnews 1.0.2,0.999966351973624,106346 50 | 521136,2018-04-06 14:23:57.373069+00,tagnews 1.0.2,0.9967965743400659,107893 51 | 522104,2018-04-06 14:32:30.666413+00,tagnews 1.0.2,0.9826979416411509,108861 52 | 526095,2018-04-06 15:10:55.746265+00,tagnews 1.0.2,0.8428486532137309,112858 53 | 526113,2018-04-06 15:11:06.186507+00,tagnews 1.0.2,0.99871130956743,112876 54 | 529649,2018-04-06 15:42:59.30025+00,tagnews 1.0.2,0.9947046965727321,116413 55 | 533365,2018-04-06 16:19:16.735275+00,tagnews 1.0.2,0.9792654478542471,120162 56 | 534145,2018-04-06 16:26:44.38019+00,tagnews 1.0.2,0.973129736132725,120942 57 | 534864,2018-04-06 16:33:07.889769+00,tagnews 1.0.2,0.9999966310233621,121661 58 | 537443,2018-04-06 16:57:04.395494+00,tagnews 1.0.2,0.95064963726574,124241 59 | 536014,2018-04-06 16:44:00.951708+00,tagnews 1.0.2,0.99998367863464,122811 60 | 538379,2018-04-06 17:07:12.966202+00,tagnews 1.0.2,0.9987915520879151,125154 61 | 539462,2018-04-06 17:17:50.055981+00,tagnews 1.0.2,0.863883951028128,126236 62 | 542197,2018-04-06 17:42:44.372711+00,tagnews 1.0.2,0.9865234365113491,128971 63 | 544884,2018-04-06 18:09:45.995975+00,tagnews 1.0.2,0.999995294977297,131633 64 | 545652,2018-04-06 18:16:40.182721+00,tagnews 1.0.2,0.9977864966673121,132403 65 | 545924,2018-04-06 18:19:22.821435+00,tagnews 1.0.2,0.999978634170278,132675 66 | 547592,2018-04-06 18:34:40.982291+00,tagnews 1.0.2,0.9654272379126729,134343 67 | 547861,2018-04-06 18:37:15.278881+00,tagnews 1.0.2,0.9797029097256109,134612 68 | 548251,2018-04-06 18:41:09.706316+00,tagnews 1.0.2,0.959968104909423,135002 69 | 548801,2018-04-06 18:45:50.652815+00,tagnews 1.0.2,0.9999578705512691,135552 
70 | 548826,2018-04-06 18:46:05.682786+00,tagnews 1.0.2,0.9812507013257259,135577 71 | 549021,2018-04-06 18:47:54.53721+00,tagnews 1.0.2,0.9874391977978849,135772 72 | 549647,2018-04-06 18:54:08.913032+00,tagnews 1.0.2,0.9957128218479301,136398 73 | 550063,2018-04-06 18:58:16.794197+00,tagnews 1.0.2,0.9411575466816129,136814 74 | 551242,2018-04-06 19:11:19.077545+00,tagnews 1.0.2,0.999855649439592,137963 75 | 551960,2018-04-06 19:17:43.937509+00,tagnews 1.0.2,0.992348301676597,138681 76 | 552438,2018-04-06 19:22:19.283931+00,tagnews 1.0.2,0.9293200935215621,139159 77 | 555583,2018-04-06 19:51:29.329114+00,tagnews 1.0.2,0.9983273264004301,142304 78 | 555900,2018-04-06 19:54:30.206493+00,tagnews 1.0.2,0.9996439141829279,142621 79 | 559515,2018-04-06 20:29:30.512525+00,tagnews 1.0.2,0.998004769621765,146218 80 | 561091,2018-04-06 20:44:26.450771+00,tagnews 1.0.2,0.9978320764347309,147795 81 | 561538,2018-04-06 20:48:30.220341+00,tagnews 1.0.2,0.9999984941589128,148242 82 | 562446,2018-04-06 20:57:10.686435+00,tagnews 1.0.2,0.967552833211205,149150 83 | 563189,2018-04-06 21:05:39.600147+00,tagnews 1.0.2,0.9862481476081439,149865 84 | 570417,2018-04-06 22:15:15.316543+00,tagnews 1.0.2,0.788684471902453,157068 85 | 571856,2018-04-06 22:28:26.990609+00,tagnews 1.0.2,0.9995254014306991,158507 86 | 573824,2018-04-06 22:46:12.837637+00,tagnews 1.0.2,0.981930328102285,160476 87 | 574304,2018-04-06 22:50:16.968366+00,tagnews 1.0.2,0.9966144524368841,160956 88 | 574517,2018-04-06 22:52:01.449551+00,tagnews 1.0.2,0.9485902236238009,161169 89 | 576100,2018-04-06 23:09:01.193554+00,tagnews 1.0.2,0.988857692303698,162729 90 | 577661,2018-04-06 23:22:51.826114+00,tagnews 1.0.2,0.974163605880917,164291 91 | 578001,2018-04-06 23:26:09.611614+00,tagnews 1.0.2,0.9999607688401649,164631 92 | 578317,2018-04-06 23:29:04.555426+00,tagnews 1.0.2,0.999911923675472,164947 93 | 580278,2018-04-06 23:47:26.485186+00,tagnews 1.0.2,0.9886499410462121,166908 94 | 580407,2018-04-06 23:48:44.515143+00,tagnews 1.0.2,0.9804183485201959,167037 95 | 580457,2018-04-06 23:49:16.947558+00,tagnews 1.0.2,0.9435893034373599,167087 96 | 584188,2018-04-07 00:27:36.648565+00,tagnews 1.0.2,0.999452789511913,170803 97 | 589047,2018-04-07 01:17:40.407635+00,tagnews 1.0.2,0.9997032201925992,175658 98 | 614671,2018-04-07 05:47:41.381881+00,tagnews 1.0.2,0.923683116129813,201260 99 | 617068,2018-04-07 06:13:34.906861+00,tagnews 1.0.2,0.9999999998184541,203651 100 | 626743,2018-04-07 07:53:33.372862+00,tagnews 1.0.2,0.999307402755212,213344 101 | 627366,2018-04-07 08:00:05.556935+00,tagnews 1.0.2,0.999233766009999,213968 102 | 628811,2018-04-07 08:16:27.001801+00,tagnews 1.0.2,0.9998885006610941,215414 103 | 638085,2018-04-07 09:52:01.276691+00,tagnews 1.0.2,0.999991081666442,224701 104 | 655730,2018-04-07 12:52:48.982083+00,tagnews 1.0.2,0.611761421637548,242395 105 | 664187,2018-04-07 14:18:16.932903+00,tagnews 1.0.2,0.940635779357718,250841 106 | 673593,2018-04-07 15:51:48.004491+00,tagnews 1.0.2,0.999999440852851,260241 107 | 679017,2018-04-07 16:46:03.567665+00,tagnews 1.0.2,0.97342543252198,265658 108 | 703432,2018-04-07 20:45:09.373865+00,tagnews 1.0.2,0.9439736876444,290036 109 | 703434,2018-04-07 20:45:10.40857+00,tagnews 1.0.2,0.971157345809923,290038 110 | 680518,2018-04-07 17:02:42.151431+00,tagnews 1.0.2,0.9044098048439742,267157 111 | 680698,2018-04-07 17:04:51.132479+00,tagnews 1.0.2,0.998803081336373,267325 112 | 681512,2018-04-07 17:13:30.161124+00,tagnews 1.0.2,0.995728484527155,268139 113 | 694916,2018-04-07 
19:23:10.986639+00,tagnews 1.0.2,0.9998013135720291,281528 114 | 695016,2018-04-07 19:24:14.881732+00,tagnews 1.0.2,0.9999974846780271,281628 115 | 698396,2018-04-07 19:55:28.491522+00,tagnews 1.0.2,0.99964667989408,285008 116 | 698507,2018-04-07 19:56:34.685989+00,tagnews 1.0.2,0.9927044974262328,285119 117 | 702101,2018-04-07 20:32:51.635439+00,tagnews 1.0.2,0.987630525694864,288705 118 | 705758,2018-04-07 21:09:12.118101+00,tagnews 1.0.2,0.9508571504826291,292352 119 | 709872,2018-04-07 21:47:14.228435+00,tagnews 1.0.2,0.9979933013970591,296467 120 | 716443,2018-04-07 22:47:09.648682+00,tagnews 1.0.2,0.326910330888475,303024 121 | 727147,2018-04-08 00:23:07.83915+00,tagnews 1.0.2,0.981861782740461,313702 122 | 728505,2018-04-08 00:34:41.867748+00,tagnews 1.0.2,0.16869949219008698,315061 123 | 729271,2018-04-08 00:41:36.337504+00,tagnews 1.0.2,0.995228266194327,315827 124 | 730663,2018-04-08 00:53:40.816585+00,tagnews 1.0.2,0.9999152063211558,317223 125 | 739867,2018-04-08 02:15:18.435313+00,tagnews 1.0.2,0.999999999114922,326413 126 | 740942,2018-04-08 02:24:43.982541+00,tagnews 1.0.2,0.8448991642964859,327488 127 | 766495,2018-04-08 06:11:50.514599+00,tagnews 1.0.2,0.9943096817014441,353037 128 | 768075,2018-04-08 06:26:17.501704+00,tagnews 1.0.2,0.9997591877520999,354618 129 | 776203,2018-04-08 07:40:14.890831+00,tagnews 1.0.2,0.999374242538893,362825 130 | 781435,2018-04-08 08:29:14.581644+00,tagnews 1.0.2,0.9823034105854179,368055 131 | 413550,2018-04-06 01:11:07.73102+00,tagnews 1.0.2,0.9998575366841641,132 132 | 413637,2018-04-06 01:11:19.713638+00,tagnews 1.0.2,0.9363483455726559,219 133 | 415147,2018-04-06 01:17:00.942544+00,tagnews 1.0.2,0.95221771381613,1761 134 | 415662,2018-04-06 01:18:12.646466+00,tagnews 1.0.2,0.9023008565400741,2276 135 | 417225,2018-04-06 01:21:51.371533+00,tagnews 1.0.2,0.9625021379247131,3840 136 | 417506,2018-04-06 01:22:32.058913+00,tagnews 1.0.2,0.918737580752234,4121 137 | 417583,2018-04-06 01:22:43.567975+00,tagnews 1.0.2,0.99972004364052,4198 138 | 417670,2018-04-06 01:22:54.964258+00,tagnews 1.0.2,0.99835708444398,4285 139 | 417739,2018-04-06 01:23:05.351858+00,tagnews 1.0.2,0.9999997197983299,4354 140 | 418293,2018-04-06 01:24:22.771378+00,tagnews 1.0.2,0.9999878349356909,4908 141 | 419207,2018-04-06 01:26:36.866343+00,tagnews 1.0.2,0.9999012082107359,5822 142 | 419765,2018-04-06 01:28:03.48474+00,tagnews 1.0.2,0.9999715841932209,6380 143 | 419836,2018-04-06 01:28:16.467132+00,tagnews 1.0.2,0.9947189203452851,6451 144 | 419919,2018-04-06 01:28:27.687662+00,tagnews 1.0.2,0.999960548834855,6534 145 | 420216,2018-04-06 01:29:11.211941+00,tagnews 1.0.2,0.9442981731740959,6831 146 | 420500,2018-04-06 01:29:49.04032+00,tagnews 1.0.2,0.980764656949998,7115 147 | 420547,2018-04-06 01:29:55.363319+00,tagnews 1.0.2,0.9832552983445559,7162 148 | 420694,2018-04-06 01:30:19.057729+00,tagnews 1.0.2,0.9881275134573192,7309 149 | 420802,2018-04-06 01:30:33.855733+00,tagnews 1.0.2,0.9999239993581641,7417 150 | 421150,2018-04-06 01:31:23.423822+00,tagnews 1.0.2,0.968428807713687,7765 151 | 421153,2018-04-06 01:31:23.826251+00,tagnews 1.0.2,0.99881334561032,7768 152 | 421396,2018-04-06 01:31:57.144215+00,tagnews 1.0.2,0.999992699275227,8011 153 | 421815,2018-04-06 01:32:57.282279+00,tagnews 1.0.2,0.972751737259355,8430 154 | 422211,2018-04-06 01:33:55.110953+00,tagnews 1.0.2,0.966741584957682,8826 155 | 422747,2018-04-06 01:35:13.558814+00,tagnews 1.0.2,0.94497039979141,9362 156 | 423375,2018-04-06 01:36:42.42577+00,tagnews 1.0.2,0.8450758451966,9990 157 | 
423430,2018-04-06 01:36:49.088615+00,tagnews 1.0.2,0.9999214414211991,10045 158 | 423440,2018-04-06 01:36:50.919187+00,tagnews 1.0.2,0.997539950456985,10055 159 | 423745,2018-04-06 01:37:31.853592+00,tagnews 1.0.2,0.99391700724742,10360 160 | 424048,2018-04-06 01:38:15.090671+00,tagnews 1.0.2,0.9772771440389341,10663 161 | 424319,2018-04-06 01:38:52.743871+00,tagnews 1.0.2,0.9654367904633449,10934 162 | 424604,2018-04-06 01:39:38.331158+00,tagnews 1.0.2,0.999998508945548,11219 163 | 425064,2018-04-06 01:40:45.68868+00,tagnews 1.0.2,0.962874638291766,11679 164 | 425264,2018-04-06 01:41:14.548616+00,tagnews 1.0.2,0.9983590464239879,11879 165 | 425292,2018-04-06 01:41:17.894869+00,tagnews 1.0.2,0.78292962101689,11907 166 | 425341,2018-04-06 01:41:23.704755+00,tagnews 1.0.2,0.9404141647615001,11956 167 | 426267,2018-04-06 01:43:26.075762+00,tagnews 1.0.2,0.9884139618494581,12914 168 | 426851,2018-04-06 01:44:40.313328+00,tagnews 1.0.2,0.8884907339904599,13498 169 | 428876,2018-04-06 02:54:22.545312+00,tagnews 1.0.2,0.9933536233887491,15531 170 | 430230,2018-04-06 02:57:09.02851+00,tagnews 1.0.2,0.99993935178435,16887 171 | 430749,2018-04-06 02:58:14.953914+00,tagnews 1.0.2,0.999997576502179,17407 172 | 430966,2018-04-06 02:58:42.411204+00,tagnews 1.0.2,0.999797196071687,17624 173 | 431223,2018-04-06 02:59:15.301091+00,tagnews 1.0.2,0.9999047187628759,17881 174 | 431512,2018-04-06 02:59:50.802354+00,tagnews 1.0.2,0.9993146756468541,18171 175 | 431877,2018-04-06 03:00:47.693256+00,tagnews 1.0.2,0.997268248405229,18534 176 | 432356,2018-04-06 03:01:49.769789+00,tagnews 1.0.2,0.999713222538032,18999 177 | 432510,2018-04-06 03:02:08.648613+00,tagnews 1.0.2,0.977858386569711,19152 178 | 434949,2018-04-06 03:07:09.800331+00,tagnews 1.0.2,0.9999855795547621,21593 179 | 435494,2018-04-06 03:08:15.407483+00,tagnews 1.0.2,0.9458754625382351,22138 180 | 435500,2018-04-06 03:08:15.939637+00,tagnews 1.0.2,0.996230356277238,22144 181 | 436155,2018-04-06 03:09:37.308522+00,tagnews 1.0.2,0.9999984987696869,22800 182 | 437372,2018-04-06 03:12:06.851803+00,tagnews 1.0.2,0.727250099092021,24019 183 | 440314,2018-04-06 03:18:14.179475+00,tagnews 1.0.2,0.985039327167572,26962 184 | 441456,2018-04-06 03:20:34.515571+00,tagnews 1.0.2,0.9999556382799301,28104 185 | 442355,2018-04-06 03:22:28.605809+00,tagnews 1.0.2,0.998576721748682,29003 186 | 442449,2018-04-06 03:22:40.799598+00,tagnews 1.0.2,0.993966600112061,29097 187 | 445543,2018-04-06 03:28:44.805305+00,tagnews 1.0.2,0.99982631322319,32192 188 | 448043,2018-04-06 03:38:48.768498+00,tagnews 1.0.2,0.7778902633735899,34694 189 | 448194,2018-04-06 03:39:57.436842+00,tagnews 1.0.2,0.901220353889572,34846 190 | 448259,2018-04-06 03:40:11.057364+00,tagnews 1.0.2,0.969840685449547,34911 191 | 448414,2018-04-06 03:41:03.00998+00,tagnews 1.0.2,0.917062525553955,35066 192 | 449338,2018-04-06 03:48:20.438671+00,tagnews 1.0.2,0.9951635402008809,35990 193 | 451353,2018-04-06 04:04:49.279498+00,tagnews 1.0.2,0.9687636729776951,37996 194 | 452473,2018-04-06 04:12:59.601278+00,tagnews 1.0.2,0.9990444458982649,39114 195 | -------------------------------------------------------------------------------- /lib/notebooks/keras-glove-testing-api-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/kevin/Documents/github/article-tagging/lib\n" 13 
| ] 14 | } 15 | ], 16 | "source": [ 17 | "cd .." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "Using TensorFlow backend.\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import os\n", 35 | "import tagnews\n", 36 | "import pandas as pd\n", 37 | "from keras.models import Sequential\n", 38 | "from keras.layers import LSTM, Dense, TimeDistributed\n", 39 | "from keras.utils import to_categorical\n", 40 | "from keras.callbacks import ModelCheckpoint\n", 41 | "import numpy as np\n", 42 | "import json\n", 43 | "import requests\n", 44 | "import keras" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "with open('tagnews/data/training.txt', encoding='utf-8') as f:\n", 63 | " our_training_data = f.read()\n", 64 | " \n", 65 | "training_df = pd.DataFrame([x.split() for x in our_training_data.split('\\n')],\n", 66 | " columns=['word', 'tag'])\n", 67 | "training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)\n", 68 | "training_df['all_tags'] = 'NA'\n", 69 | "\n", 70 | "# If you want to join our data w/ kaggle data, you can do this.\n", 71 | "# ner = tagnews.load_ner_data('tagnews/data/')\n", 72 | "# pd.concat([training_df, ner]).reset_index(drop=True)\n", 73 | "\n", 74 | "# If you just want to use our data, you can do this.\n", 75 | "ner = training_df\n", 76 | "\n", 77 | "ner = ner[['word', 'all_tags', 'tag']]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "ner = pd.concat([ner,\n", 87 | " pd.DataFrame(ner['word'].str[0].str.isupper().values),\n", 88 | " pd.DataFrame(glove.loc[ner['word'].str.lower()].values)],\n", 89 | " axis='columns')\n", 90 | "ner.fillna(value=0.0, inplace=True)\n", 91 | "\n", 92 | "data_dim = 51\n", 93 | "timesteps = 25 # only during training, testing can take arbitrary length.\n", 94 | "num_classes = 2\n", 95 | "\n", 96 | "train_val_split = int(19 * ner.shape[0] / 20.)\n", 97 | "\n", 98 | "ner_train_idxs = range(0, train_val_split - timesteps, timesteps)\n", 99 | "x_train = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 100 | " for i in ner_train_idxs])\n", 101 | "y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 102 | " for i in ner_train_idxs])\n", 103 | "\n", 104 | "ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)\n", 105 | "x_val = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 106 | " for i in ner_val_idxs])\n", 107 | "y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 108 | " for i in ner_val_idxs])" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "____________________________________________________________________________________________________\n", 121 | "Layer (type) Output Shape Param # \n", 122 | "====================================================================================================\n", 123 | "lstm_1 (LSTM) (None, None, 32) 10752 \n", 124 | 
"____________________________________________________________________________________________________\n", 125 | "lstm_2 (LSTM) (None, None, 8) 1312 \n", 126 | "____________________________________________________________________________________________________\n", 127 | "time_distributed_1 (TimeDistributed) (None, None, 2) 18 \n", 128 | "====================================================================================================\n", 129 | "Total params: 12,082\n", 130 | "Trainable params: 12,082\n", 131 | "Non-trainable params: 0\n", 132 | "____________________________________________________________________________________________________\n", 133 | "None\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "model = Sequential()\n", 139 | "model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))\n", 140 | "model.add(LSTM(8, return_sequences=True))\n", 141 | "model.add(TimeDistributed(Dense(2, activation='softmax')))\n", 142 | "model.compile(loss='categorical_crossentropy',\n", 143 | " optimizer='adam',\n", 144 | " metrics=['categorical_accuracy'])\n", 145 | "print(model.summary(100))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "os.makedirs('tmp', exist_ok=True)\n", 157 | "checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',\n", 158 | " monitor='val_categorical_accuracy',\n", 159 | " verbose=1,\n", 160 | " save_best_only=True)\n", 161 | "\n", 162 | "class OurAUC(keras.callbacks.Callback):\n", 163 | " def on_epoch_end(self, epoch, logs={}):\n", 164 | " # Go to https://geo-extract-tester.herokuapp.com/ and download\n", 165 | " # the validation data (validation.txt).\n", 166 | " with open('validation.txt', encoding='utf-8') as f:\n", 167 | " s = f.read()\n", 168 | "\n", 169 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 170 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True)],\n", 171 | " axis='columns')\n", 172 | " glove_time_size = 100\n", 173 | " preds_batched = []\n", 174 | " i = 0\n", 175 | " while gloved_data[i:i+glove_time_size].size:\n", 176 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 177 | " axis=0))[0][:,1])\n", 178 | " i += glove_time_size\n", 179 | "\n", 180 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 181 | " for prob in [p for pred in preds_batched for p in pred]:\n", 182 | " f.write(str(prob) + '\\n')\n", 183 | "\n", 184 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:\n", 185 | " url = 'https://geo-extract-tester.herokuapp.com/api/score'\n", 186 | " r = requests.post(url, files={'file': f})\n", 187 | " try:\n", 188 | " print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))\n", 189 | " except KeyError:\n", 190 | " raise ValueError('Problem retrieving AUC from API. 
Is your validation set up to date?')\n", 191 | "\n", 192 | "our_auc = OurAUC()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Train on 2467 samples, validate on 129 samples\n", 205 | "Epoch 1/20\n", 206 | "Epoch 00000: val_categorical_accuracy improved from -inf to 0.93054, saving model to ./tmp/weights-00.hdf5\n", 207 | "AUC: 0.88599\n", 208 | "27s - loss: 0.3390 - categorical_accuracy: 0.9053 - val_loss: 0.2362 - val_categorical_accuracy: 0.9305\n", 209 | "Epoch 2/20\n", 210 | "Epoch 00001: val_categorical_accuracy did not improve\n", 211 | "AUC: 0.93386\n", 212 | "26s - loss: 0.2037 - categorical_accuracy: 0.9177 - val_loss: 0.1728 - val_categorical_accuracy: 0.9271\n", 213 | "Epoch 3/20\n", 214 | "Epoch 00002: val_categorical_accuracy did not improve\n", 215 | "AUC: 0.94096\n", 216 | "26s - loss: 0.1584 - categorical_accuracy: 0.9369 - val_loss: 0.1627 - val_categorical_accuracy: 0.9253\n", 217 | "Epoch 4/20\n", 218 | "Epoch 00003: val_categorical_accuracy did not improve\n", 219 | "AUC: 0.94627\n", 220 | "26s - loss: 0.1458 - categorical_accuracy: 0.9429 - val_loss: 0.1583 - val_categorical_accuracy: 0.9243\n", 221 | "Epoch 5/20\n", 222 | "Epoch 00004: val_categorical_accuracy did not improve\n", 223 | "AUC: 0.94879\n", 224 | "27s - loss: 0.1399 - categorical_accuracy: 0.9448 - val_loss: 0.1532 - val_categorical_accuracy: 0.9262\n", 225 | "Epoch 6/20\n", 226 | "Epoch 00005: val_categorical_accuracy did not improve\n", 227 | "AUC: 0.95070\n", 228 | "26s - loss: 0.1351 - categorical_accuracy: 0.9465 - val_loss: 0.1526 - val_categorical_accuracy: 0.9287\n", 229 | "Epoch 7/20\n", 230 | "Epoch 00006: val_categorical_accuracy did not improve\n", 231 | "AUC: 0.95202\n", 232 | "26s - loss: 0.1326 - categorical_accuracy: 0.9467 - val_loss: 0.1512 - val_categorical_accuracy: 0.9281\n", 233 | "Epoch 8/20\n", 234 | "Epoch 00007: val_categorical_accuracy did not improve\n", 235 | "AUC: 0.95270\n", 236 | "27s - loss: 0.1301 - categorical_accuracy: 0.9488 - val_loss: 0.1527 - val_categorical_accuracy: 0.9281\n", 237 | "Epoch 9/20\n", 238 | "Epoch 00008: val_categorical_accuracy did not improve\n", 239 | "AUC: 0.95297\n", 240 | "27s - loss: 0.1276 - categorical_accuracy: 0.9493 - val_loss: 0.1465 - val_categorical_accuracy: 0.9274\n", 241 | "Epoch 10/20\n", 242 | "Epoch 00009: val_categorical_accuracy did not improve\n", 243 | "AUC: 0.95275\n", 244 | "28s - loss: 0.1255 - categorical_accuracy: 0.9493 - val_loss: 0.1444 - val_categorical_accuracy: 0.9287\n", 245 | "Epoch 11/20\n", 246 | "Epoch 00010: val_categorical_accuracy did not improve\n", 247 | "AUC: 0.95273\n", 248 | "27s - loss: 0.1241 - categorical_accuracy: 0.9496 - val_loss: 0.1439 - val_categorical_accuracy: 0.9281\n", 249 | "Epoch 12/20\n", 250 | "Epoch 00011: val_categorical_accuracy did not improve\n", 251 | "AUC: 0.95465\n", 252 | "27s - loss: 0.1231 - categorical_accuracy: 0.9498 - val_loss: 0.1443 - val_categorical_accuracy: 0.9268\n", 253 | "Epoch 13/20\n", 254 | "Epoch 00012: val_categorical_accuracy did not improve\n", 255 | "AUC: 0.95379\n", 256 | "27s - loss: 0.1211 - categorical_accuracy: 0.9507 - val_loss: 0.1492 - val_categorical_accuracy: 0.9284\n", 257 | "Epoch 14/20\n", 258 | "Epoch 00013: val_categorical_accuracy did not improve\n", 259 | "AUC: 0.95501\n", 260 | "27s - loss: 0.1195 - categorical_accuracy: 0.9510 - val_loss: 0.1436 - val_categorical_accuracy: 
0.9274\n", 261 | "Epoch 15/20\n", 262 | "Epoch 00014: val_categorical_accuracy did not improve\n", 263 | "AUC: 0.95443\n", 264 | "27s - loss: 0.1170 - categorical_accuracy: 0.9527 - val_loss: 0.1405 - val_categorical_accuracy: 0.9290\n", 265 | "Epoch 16/20\n", 266 | "Epoch 00015: val_categorical_accuracy did not improve\n", 267 | "AUC: 0.95387\n", 268 | "26s - loss: 0.1151 - categorical_accuracy: 0.9536 - val_loss: 0.1395 - val_categorical_accuracy: 0.9281\n", 269 | "Epoch 17/20\n", 270 | "Epoch 00016: val_categorical_accuracy did not improve\n", 271 | "AUC: 0.95428\n", 272 | "27s - loss: 0.1135 - categorical_accuracy: 0.9538 - val_loss: 0.1402 - val_categorical_accuracy: 0.9278\n", 273 | "Epoch 18/20\n", 274 | "Epoch 00017: val_categorical_accuracy did not improve\n", 275 | "AUC: 0.95323\n", 276 | "27s - loss: 0.1120 - categorical_accuracy: 0.9546 - val_loss: 0.1450 - val_categorical_accuracy: 0.9287\n", 277 | "Epoch 19/20\n", 278 | "Epoch 00018: val_categorical_accuracy improved from 0.93054 to 0.93240, saving model to ./tmp/weights-18.hdf5\n", 279 | "AUC: 0.95366\n", 280 | "27s - loss: 0.1107 - categorical_accuracy: 0.9557 - val_loss: 0.1386 - val_categorical_accuracy: 0.9324\n", 281 | "Epoch 20/20\n", 282 | "Epoch 00019: val_categorical_accuracy improved from 0.93240 to 0.93240, saving model to ./tmp/weights-19.hdf5\n", 283 | "AUC: 0.95260\n", 284 | "27s - loss: 0.1078 - categorical_accuracy: 0.9570 - val_loss: 0.1414 - val_categorical_accuracy: 0.9324\n" 285 | ] 286 | }, 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "" 291 | ] 292 | }, 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "model.fit(x_train, y_train,\n", 300 | " epochs=20,\n", 301 | " validation_data=(x_val, y_val),\n", 302 | " callbacks=[checkpointer, our_auc],\n", 303 | " verbose=2)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# Go to https://geo-extract-tester.herokuapp.com/ and download\n", 313 | "# the validation data (validation.txt).\n", 314 | "with open('validation.txt', encoding='utf-8') as f:\n", 315 | " s = f.read()\n", 316 | "\n", 317 | "gloved_data = glove.loc[[w for w in s.split('\\n') if w]].fillna(0)\n", 318 | "glove_time_size = 100\n", 319 | "preds_batched = []\n", 320 | "i = 0\n", 321 | "while gloved_data[i:i+glove_time_size].size:\n", 322 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size], axis=0))[0][:,1])\n", 323 | " i += glove_time_size\n", 324 | "\n", 325 | "preds = [p for pred in preds_batched for p in pred]\n", 326 | "\n", 327 | "print('\\n'.join(['{:>15} {:>9.4f}'.format(w, p) for (w, p) in zip(words, preds)][400:500]))\n", 328 | " \n", 329 | "with open('guesses.txt', 'w') as f:\n", 330 | " for prob in [p for pred in preds_batched for p in pred]:\n", 331 | " f.write(str(prob) + '\\n')\n", 332 | "\n", 333 | "# Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!" 
334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.6.1" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 2 367 | } 368 | -------------------------------------------------------------------------------- /lib/tagnews/data/ci-data/newsarticles_trainedlocation.csv: -------------------------------------------------------------------------------- 1 | 2285,Marquez-Connerly,,,413550 2 | 2286,7500 block of Ridge Avenue,,,413550 3 | 2287,"500 block of Elmwood Avenue,",,,413550 4 | 3352,Southwest Side Little Village,,,415147 5 | 3353,"3300 block of West 27th Street,",,,415147 6 | 3708,5700 block of North Christiana,,,415662 7 | 5640,Marquette,,,417739 8 | 5641,and Marquette,,,417739 9 | 5642,of 79th and Marquette. Much,,,417739 10 | 5643,Fernando Chavez’s,,,417739 11 | 7077,West,,,419207 12 | 7078,District,,,419207 13 | 7516,1700 block of North Rockwell,,,419765 14 | 7610,Side,,,419836 15 | 7611,3800 block of North Freemont,,,419836 16 | 7612,Harbor,,,419836 17 | 7699,South Side Englewood neighborhood.,,,419919 18 | 7700,5800 block of South Elizabeth Street,,,419919 19 | 7701,5800 block of South Racine,,,419919 20 | 7702,59th Street,,,419919 21 | 7947,2400 block of West,,,420216 22 | 8846,2400 block of North Lincoln,,,421150 23 | 8847,"Felix Hotel, 111 W. Huron. She",,,421150 24 | 8848,The,,,421150 25 | 9022,Northwest Side. The trio,,,421396 26 | 9023,3400 block of North Lawndale,,,421396 27 | 9024,4800 block of South Lake,,,421396 28 | 9025,3700 block of South,,,421396 29 | 9026,3300 block of West,,,421396 30 | 9701,County Courthouse,,,422211 31 | 10619,Monday,,,423375 32 | 10620,6700 to 6900 blocks of South Justine. Anyone,,,423375 33 | 10652,"Chicago,",,,423430 34 | 10655,Street,,,423440 35 | 10924,County,,,423745 36 | 10925,200 block of Sherry Lane. A,,,423745 37 | 11175,block of North,,,424048 38 | 11176,"bakery,",,,424048 39 | 11177,"1000 block of North Boulevard,",,,424048 40 | 11178,"1010 North Blvd,",,,424048 41 | 11380,WIRE December,,,424319 42 | 11381,Gresham District,,,424319 43 | 11382,Halsted St.,,,424319 44 | 11383,Chevrolet Caprice,,,424319 45 | 11384,79th and Peoria,,,424319 46 | 11856,2100 block of North Milwaukee Avenue.,,,424604 47 | 11857,"2100 block of North Rockwell Street,",,,424604 48 | 12384,Circuit,,,425064 49 | 12589,7100 block of South Normal Avenue.,,,425292 50 | 12590,7100 block of South Normal,,,425292 51 | 12635,2700 block of South Vernon Avenue.,,,425341 52 | 13250,2700,,,426267 53 | 16959,Lawn,,,430230 54 | 16960,"63rd Street,",,,430230 55 | 16961,3600 block of South 52nd Court,,,430230 56 | 16962,west,,,430230 57 | 16963,63rd St.,,,430230 58 | 16964,"Street apartment,",,,430230 59 | 16965,Street. Neighbors,,,430230 60 | 16966,Investigators,,,430230 61 | 16967,Street,,,430230 62 | 16968,Midway Lounge,,,430230 63 | 16969,"Street, reading",,,430230 64 | 17494,leg. 
The,,,430749 65 | 17495,6400 block of South Morgan Street,,,430749 66 | 17496,6000 block of,,,430749 67 | 17497,gallery,,,430749 68 | 17704,ne ighborhood,,,430966 69 | 17705,"South Side, police. The",,,430966 70 | 17706,1400 block of East 75th Street,,,430966 71 | 18145,"Area,",,,431512 72 | 18146,south,,,431512 73 | 18147,west,,,431512 74 | 18148,114th,,,431512 75 | 18571,of South,,,431877 76 | 18572,8400 block of South Kingston Avenue.,,,431877 77 | 22421,intersection of East 79th Street,,,436155 78 | 22422,Street bus. She,,,436155 79 | 23381,Her,,,437372 80 | 23382,North Broadway,,,437372 81 | 27583,4400 block of South Hermitage,,,441456 82 | 28555,Roseland neigh borhood. Police,,,442355 83 | 28556,Street,,,442355 84 | 28589,Michigan,,,442449 85 | 31881,South East End.,,,445543 86 | 31882,South Dobson. Police,,,445543 87 | 35730,S. Ahrens,,,449338 88 | 37603,West Farragut,,,451353 89 | 37604,"W. Berwyn,",,,451353 90 | 38654,Whitehall Hotel ][2],,,452473 91 | 38655,"100 block of East Delaware Place,",,,452473 92 | 38656,"floor,""",,,452473 93 | 40168,of 91st Street and South,,,453893 94 | 40169,South Side,,,453893 95 | 40170,9000 block of South,,,453893 96 | 40171,4900 block of West Ferdinand Street,,,453893 97 | 40172,3600 block of West Diversey Ave,,,453893 98 | 40173,"8000 block of South Manistee Avenue,",,,453893 99 | 40174,"1900 block of South Marshall Boulevard,",,,453893 100 | 40175,1100 block of North Pulaski Road,,,453893 101 | 40176,2300 block of South Rockwell,,,453893 102 | 40177,"7800 block of South Paulina Street,",,,453893 103 | 40178,2300 block of South Washtenaw Avenue,,,453893 104 | 40179,"Street,",,,453893 105 | 40694,"Street,",,,454491 106 | 42410,Side,,,456158 107 | 42411,Morgan Park,,,456158 108 | 42412,Southwest Side. The,,,456158 109 | 42413,2200 block of West Barry,,,456158 110 | 42414,Lake View neighborhood.,,,456158 111 | 42415,of,,,456158 112 | 42416,19th,,,456158 113 | 42451,4500 block of South Paulina Street.,,,456190 114 | 42452,"arm,",,,456190 115 | 43893,5600 block of West Grand,,,457690 116 | 43894,When I,,,457690 117 | 44375,South Side,,,458343 118 | 44376,Bank,,,458343 119 | 44377,9400 block of South Ashland Avenue,,,458343 120 | 44795,Thorobreds,,,458774 121 | 44796,"floor,",,,458774 122 | 47308,South Deering neighborhood,,,462142 123 | 47309,"city’s Far South Side,",,,462142 124 | 47310,"9800 block of South Merrill Avenue,",,,462142 125 | 47311,South,,,462142 126 | 47312,2200 block of East 97th Street,,,462142 127 | 47313,9400 block of South Rhodes Avenue,,,462142 128 | 47314,8600 block of South Kingston,,,462142 129 | 47315,block of North,,,462142 130 | 47316,"5600 block of West Washington Boulevard,",,,462142 131 | 47317,Garfield Park neighborhood. Claude Snulligan,,,462142 132 | 47318,0,,,462142 133 | 47319,"100 block of South Pulaski Road,",,,462142 134 | 47320,block of South Drake,,,462142 135 | 47321,South Deering neighborhood. 
Terrance Wright,,,462142 136 | 47322,99th Street,,,462142 137 | 47323,"5000 block of South Carpenter Street,",,,462142 138 | 47324,"1300 block of West Estes Avenue,",,,462142 139 | 52727,"Side Wednesday night,",,,470170 140 | 52728,Street,,,470170 141 | 52729,block of East 147th Street,,,470170 142 | 54460,"4400 block of North Sheridan Road,",,,472166 143 | 54461,When,,,472166 144 | 54462,block of West Lawrence,,,472166 145 | 54719,"100 block of East Cass Street,",,,472582 146 | 54720,Hospital,,,472582 147 | 54721,Lenox;,,,472582 148 | 54722,"apartment,",,,472582 149 | 55052,West Pullman,,,473190 150 | 55053,"Far South Side,",,,473190 151 | 55054,of South Morgan,,,473190 152 | 56448,block of Pfingston Road,,,475062 153 | 56449,2200 block of Central Street,,,475062 154 | 69421,P Stone,,,491582 155 | 69422,7800 block of South Kingston,,,491582 156 | 69711,block of South Escanaba,,,492024 157 | 69712,8000 block of South Escanaba,,,492024 158 | 69713,10900,,,492024 159 | 69714,South Racine,,,492024 160 | 69987,"time, Pedraza said. The Harrison",,,492289 161 | 69988,The,,,492289 162 | 70500,E. Boughton Road.,,,493040 163 | 70501,1725 W. Boughton,,,493040 164 | 70502,400 block of East Briarcliff Road,,,493040 165 | 70503,DUI,,,493040 166 | 70504,block of Woodcreek,,,493040 167 | 70505,Cache Road,,,493040 168 | 70506,400 block of New Avenue,,,493040 169 | 70507,600 block of Jordan Avenue,,,493040 170 | 70508,1700 block of William Drive,,,493040 171 | 71384,"6500 block of South Maryland,",,,494190 172 | 71385,Maryland,,,494190 173 | 71386,"Sedell Brown,",,,494190 174 | 73400,9700 block of South Greenwood Avenue,,,496875 175 | 76891,110 block of South Michigan,,,500625 176 | 77309,1100 block of Pleasant of,,,501009 177 | 77310,block of Superior Street,,,501009 178 | 77311,6400 block of Roosevelt Road,,,501009 179 | 77312,400 block of Augusta Boulevard,,,501009 180 | 77313,1000 block of South Elmwood Avenue,,,501009 181 | 77314,1100 block of South Humphrey Avenue,,,501009 182 | 77315,"100 block of North Humphrey Avenue,",,,501009 183 | 77316,1000 block of Woodbine between,,,501009 184 | 77317,600 block of Highland Avenue. •A,,,501009 185 | 77318,720 W. North,,,501009 186 | 77319,400 block of Washington between,,,501009 187 | 77320,800 block of Home Avenue,,,501009 188 | 77321,5900 block of Chicago Avenue,,,501009 189 | 77322,400 block of North Humphrey Avenue.,,,501009 190 | 77323,block of Lake,,,501009 191 | 77324,8000 block of South Drexel,,,501009 192 | 78941,3700 block of West 119th Street,,,502490 193 | 78942,Friday,,,502490 194 | 78943,"3700 block of West 119th Street,",,,502490 195 | 78944,8700 block of South Burley Avenue,,,502490 196 | 78945,Pontrelli,,,502490 197 | 79020,"Boston Marathon,",,,502547 198 | 79021,Three,,,502547 199 | 79022,Boston Marathon,,,502547 200 | 79299,"Boston Marathon,",,,502889 201 | 79300,Chic,,,502889 202 | 79301,Marathon,,,502889 203 | 83804,Assembly,,,507757 204 | 83805,south suburbs. 
They,,,507757 205 | 88363,6400 block of North Albany Avenue,,,512592 206 | 90167,west,,,514617 207 | 90607,"6700 block of South Evans Avenue,",,,515297 208 | 90608,Police,,,515297 209 | 90609,Their,,,515297 210 | 90610,65th and Maryland,,,515297 211 | 90611,"5700 block of South Washtenaw Avenue,",,,515297 212 | 93343,block of 175th Street,,,517535 213 | 93344,of,,,517535 214 | 93529,Meadows,,,517769 215 | 93530,Cook County Circuit,,,517769 216 | 94346,1200 block of North Mayfield,,,518677 217 | 95295,Side,,,519589 218 | 95296,intersection of West 69th Street and South,,,519589 219 | 95297,9700 block of South Merrion Avenue,,,519589 220 | 95298,"2200 block of East 75th Street,",,,519589 221 | 95299,"600 block of East 79th Street,",,,519589 222 | 96653,"Side Wednesday afternoon,",,,521136 223 | 96654,and Loomis streets,,,521136 224 | 97904,1800 block of North Damen,,,522104 225 | 108439,"3900 block of North Long,",,,533365 226 | 109412,A Cook,,,534145 227 | 109413,Corro,,,534145 228 | 109414,"4800 block of Oakton Street,",,,534145 229 | 109415,Corro,,,534145 230 | 109416,105 of,,,534145 231 | 109417,Skokie,,,534145 232 | 109994,"Roseland neighborhood,",,,534864 233 | 109995,10500 block of South LaSalle Street,,,534864 234 | 109996,"back,",,,534864 235 | 113173,forest,,,538379 236 | 113174,PT Cruiser,,,538379 237 | 113175,Cap Sauers Holdings Nature,,,538379 238 | 113176,Breit,,,538379 239 | 114033,block of North,,,539462 240 | 118406,"Austin neighborhood,",,,544884 241 | 118407,700 block of North Parkside,,,544884 242 | 118408,2090,,,544884 243 | 119130,Loop. San,,,545652 244 | 119131,The intersection,,,545652 245 | 119132,Yojimbo's,,,545652 246 | 119133,"Larrabee,",,,545652 247 | 119134,cyclist's path. Townsend's,,,545652 248 | 119135,"18th District lockup,",,,545652 249 | 119136,Clybourn-Larrabee intersection,,,545652 250 | 119137,Clybourn-Larrabee intersection,,,545652 251 | 119138,d esignate,,,545652 252 | 119139,street,,,545652 253 | 119140,Honorary Bobby,,,545652 254 | 119141,east coast. Before unveiling,,,545652 255 | 119219,He,,,545924 256 | 121737,1600 block of South,,,548801 257 | 121738,4200 block of West Lake Street,,,548801 258 | 121739,"West Side,",,,548801 259 | 121740,4200 block of South Fifth,,,548801 260 | 121741,"200 block of North Karlov Avenue,",,,548801 261 | 122020,No,,,549021 262 | 122459,South Halsted,,,549647 263 | 122460,Auburn-Gresham,,,549647 264 | 122461,Tuesday,,,549647 265 | 123640,southwest,,,551242 266 | 123641,"arm,",,,551242 267 | 123642,Will County,,,551242 268 | 123643,block of Francis Street,,,551242 269 | 123644,"Gerald Chamberlain Jr.,",,,551242 270 | 123645,1300 block of Englewood Avenue,,,551242 271 | 124322,Bank of America branch located,,,551960 272 | 124323,18460 Governors Highway,,,551960 273 | 124324,approximately,,,551960 274 | 124325,183rd Street,,,551960 275 | 124326,183rd Street. Parker,,,551960 276 | 124626,"Court House,",,,552438 277 | 127597,2700 block of N. Mango. 
Police,,,555583 278 | 127598,2500 block of N.,,,555583 279 | 127853,The,,,555900 280 | 130841,"County board,",,,559515 281 | 131998,9000 block of South,,,561091 282 | 131999,9100 block of South Bishop,,,561091 283 | 132253,Bluebird,,,561538 284 | 133136,200 block of West Diversey Parkway.,,,563189 285 | 137732,block of South,,,571856 286 | 137733,4300 block of North Sheridan Road,,,571856 287 | 137734,"3500 block of North Broadway,",,,571856 288 | 137735,"5500 block of North Clark,",,,571856 289 | 140874,South Laflin,,,576100 290 | 140875,West Side,,,576100 291 | 142014,Calumet Heights neighborhood,,,578001 292 | 142015,Chicago's South Side.,,,578001 293 | 142016,9300 block of South Stony,,,578001 294 | 142372,Austin,,,578317 295 | 142373,5400 block of West Madison Street,,,578317 296 | 142374,Austin,,,578317 297 | 142375,That,,,578317 298 | 143657,"School,",,,580278 299 | 143905,Police,,,580407 300 | 143925,south side of,,,580457 301 | 146364,"3700 block of 83rd Place,",,,584188 302 | 146365,Two,,,584188 303 | 149490,"8200 block of South Whipple Street,",,,589047 304 | 149491,"7900 block of South California Avenue,",,,589047 305 | 163511,7400 block of South South,,,614671 306 | 163512,"Street,",,,614671 307 | 165137,430,,,617068 308 | 172159,Austin neighborhood. Demetrius Bronson,,,627366 309 | 172160,900 block of Lorel Avenue.,,,627366 310 | 172161,DiBella,,,627366 311 | 172162,"900 block of Lorel,",,,627366 312 | 173206,West Englewood,,,628811 313 | 173207,South Side,,,628811 314 | 173208,1600 block of West 71st Street,,,628811 315 | 173209,South Chicago neighborhood,,,628811 316 | 179094,Pulaski Road. The,,,638085 317 | 212069,5500 block of South Hermitage,,,673593 318 | 212070,Lower West Side. The,,,673593 319 | 212071,block of South Western,,,673593 320 | 212072,Humboldt Park,,,673593 321 | 212073,West Side. A,,,673593 322 | 212074,1100 block of North Ridgeway,,,673593 323 | 212075,South Shore neighborhood. Officers,,,673593 324 | 212076,6900 block of South Clyde,,,673593 325 | 212077,2200 block of South Lawndale,,,673593 326 | 212078,West Side. Hector Badillo,,,673593 327 | 212079,700 block of North California Avenue,,,673593 328 | 212080,"400 block of North Trumbull,",,,673593 329 | 212081,Southwest Side Brighton Park,,,673593 330 | 212082,"2600 block of West 39th,",,,673593 331 | 212083,Woodlawn,,,673593 332 | 212084,South Side. The,,,673593 333 | 212085,6200 block of South Drexel,,,673593 334 | 212086,South Side. The,,,673593 335 | 212087,block of West 79th,,,673593 336 | 212088,Trumbull Park neighborhood,,,673593 337 | 212089,Far South Side. A,,,673593 338 | 212090,block of South Yates,,,673593 339 | 212091,South Side. At,,,673593 340 | 219217,West Garfield Park neighborhood. The,,,680518 341 | 219218,Kostner,,,680518 342 | 219219,"4300 block of West Adams,",,,680518 343 | 219305,block of 81st Court,,,680698 344 | 219306,Palos,,,680698 345 | 219307,Department,,,680698 346 | 219308,"Correction,",,,680698 347 | 234322,north,,,694916 348 | 234323,Drive,,,694916 349 | 234324,Park,,,694916 350 | 234400,3500 block of Wonder Lake,,,695016 351 | 242709,A Chicago,,,703432 352 | 242710,Side,,,703432 353 | 244808,Jefferson Park,,,705758 354 | 244809,Northwest Side. Kyle Brandon,,,705758 355 | 244810,5000 block of North Long,,,705758 356 | 248279,Wrigleyville,,,709872 357 | 248280,N orth Side. 
Jarqueese O’Brian,,,709872 358 | 248281,3700 block of North Fremont,,,709872 359 | 248282,South Side,,,709872 360 | 248283,11500 block of South Throop,,,709872 361 | 253595,block of South Calumet Avenue,,,716443 362 | 253596,block of South Prairie Avenue,,,716443 363 | 253597,10400 block of South Indiana Avenue,,,716443 364 | 253598,10500 block of South Forest Avenue,,,716443 365 | 253599,10400 block of South calumet Avenue,,,716443 366 | 253600,800 block of East 103rd Street •,,,716443 367 | 253601,500 block of East 105th Street •,,,716443 368 | 253602,400 block of East 107th Street The,,,716443 369 | 262599,800 block of N. Michigan Ave.,,,727147 370 | 262600,Gold Coast,,,727147 371 | 262601,1700 block of W. Wabansia Ave.,,,727147 372 | 262602,700 block of W. Hubbard St.,,,727147 373 | 262603,3500 block of N. Clark St.,,,727147 374 | 262604,1900 block of N. Lincoln Ave.,,,727147 375 | 262605,Lincoln,,,727147 376 | 262606,700 block of N. Armour St.,,,727147 377 | 262607,1300 block of N. Bosworth Ave.,,,727147 378 | 263732,west,,,728505 379 | 263733,of Amesbury Road,,,728505 380 | 264502,South Loop. Samantha Salazar,,,729271 381 | 264503,1100 block of South Indiana,,,729271 382 | 264504,South Side Auburn Gresham neighborhood.,,,729271 383 | 264505,County Jail,,,729271 384 | 265724,3900 block of North Ashland Avenue,,,730663 385 | 265725,6200 block of North Western Avenue,,,730663 386 | 265726,1900 block of West Peterson Avenue •,,,730663 387 | 265727,4400 block of North Broadway •,,,730663 388 | 265728,4800 block of North Broadway •,,,730663 389 | 265729,2800 block of North Broadway No,,,730663 390 | 273333,1600 block of South St. Louis,,,739867 391 | 273334,1800 block of West 87,,,739867 392 | 273335,St. On,,,739867 393 | 273336,1800 block of W. 87,,,739867 394 | 273337,"St,",,,739867 395 | 273338,1200 block of W. 79 th,,,739867 396 | 273339,block of S. Marshell,,,739867 397 | 273340,approximately 2:28,,,739867 398 | 273341,7700 block of S. Cottage Grove,,,739867 399 | 273342,1100 block of North Lockwood,,,739867 400 | 273343,1300 block of South Millard,,,739867 401 | 273344,7100 block of South State,,,739867 402 | 273345,4200 block of West Addison,,,739867 403 | 273346,6600 block of South Capmbell. He,,,739867 404 | 273347,1300 block of South Throop. A,,,739867 405 | 273348,400 block of East,,,739867 406 | 273349,Street.,,,739867 407 | 273350,6600 block of South Cottage Grove. The,,,739867 408 | 273351,"6800 block of South Crandon,",,,739867 409 | 273352,He,,,739867 410 | 273353,1600 block of South St. Louis. He,,,739867 411 | 273354,A,,,739867 412 | 273355,block of South Wallace,,,739867 413 | 273356,approximately,,,739867 414 | 273357,5800 block of South King Drive,,,739867 415 | 273358,block of West,,,739867 416 | 273359,4500 block of South Pulaski. One,,,739867 417 | 273360,street. An,,,739867 418 | 297211,South Side Fuller Park,,,766495 419 | 297212,"200 block of West 47th Street. 
Initially,",,,766495 420 | 305403,South May Street,,,776203 421 | -------------------------------------------------------------------------------- /lib/notebooks/keras-glove-with-street-names-better.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/Users/josh/Documents/chihack/article-tagging/lib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "cd '/Users/josh/Documents/chihack/article-tagging/lib'" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 30 | " return f(*args, **kwds)\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from numpy.random import seed\n", 36 | "seed(1)\n", 37 | "from tensorflow import set_random_seed\n", 38 | "set_random_seed(2)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "import tagnews\n", 51 | "import pandas as pd\n", 52 | "from keras.models import Sequential\n", 53 | "from keras.layers import LSTM, Dense, TimeDistributed\n", 54 | "from keras.utils import to_categorical\n", 55 | "from keras.callbacks import ModelCheckpoint\n", 56 | "import numpy as np\n", 57 | "import json\n", 58 | "import requests\n", 59 | "import keras\n", 60 | "import shutil" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(400000, 50)" 83 | ] 84 | }, 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "glove.shape" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "glove.loc['address_vec'] = glove.loc[['street', 'avenue', 'place', 'road', 'block', 'main', 'city', 'west', 'east', 'north', 'south']].mean()\n", 103 | "glove.loc['neighborhood_vec'] = glove.loc[['neighborhood', 'burrough', 'community', 'area']].mean()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 7, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "with open('tagnews/data/Chicago_Street_Names.csv') as street_names:\n", 115 | " streets = street_names.read().splitlines()[1:]\n", 116 | "streets = [i.lower() for i in streets]\n", 117 | "\n", 118 | "with open('tagnews/data/chicago_neighborhoods.csv') as neighborhoods:\n", 119 | " hoods = neighborhoods.read().splitlines()\n", 120 | "hoods = list(set([j.lower().replace('\\\"', '') for j in hoods]))\n", 121 | "\n", 122 | "for name in streets:\n", 123 | " glove.loc[name] = glove.loc['address_vec']\n", 124 | "for hood in hoods:\n", 125 | " glove.loc[hood] = glove.loc['neighborhood_vec']" 126 | 
] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 8, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "with open('tagnews/data/training.txt', encoding='utf-8') as f:\n", 137 | " our_training_data = f.read()\n", 138 | " \n", 139 | "training_df = pd.DataFrame([x.split() for x in our_training_data.split('\\n') if x],\n", 140 | " columns=['word', 'tag'])\n", 141 | "\n", 142 | "training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)\n", 143 | "training_df['all_tags'] = 'NA'\n", 144 | "\n", 145 | "# If you want to join our data w/ kaggle data, you can do this.\n", 146 | "# ner = tagnews.load_ner_data('tagnews/data/')\n", 147 | "# pd.concat([training_df, ner]).reset_index(drop=True)\n", 148 | "\n", 149 | "# If you just want to use our data, you can do this.\n", 150 | "ner = training_df\n", 151 | "\n", 152 | "ner = ner[['word', 'all_tags', 'tag']]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "# pd.DataFrame(glove.loc[ner.loc[ner['word'] == 'Woodlawn']['word'].str.lower()].values)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 10, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ner = pd.concat([ner,\n", 175 | " pd.DataFrame(ner['word'].str[0].str.isupper().values),\n", 176 | " pd.DataFrame(glove.loc[ner['word'].str.lower()].values),\n", 177 | " pd.DataFrame(ner['word'].str.isnumeric().values),\n", 178 | " pd.DataFrame(ner['word'].str.len().values)],\n", 179 | " axis='columns')\n", 180 | "ner.fillna(value=0.0, inplace=True)\n", 181 | "\n", 182 | "data_dim = 53\n", 183 | "timesteps = 25 # only during training, testing can take arbitrary length.\n", 184 | "num_classes = 2\n", 185 | "\n", 186 | "train_val_split = int(19 * ner.shape[0] / 20.)\n", 187 | "\n", 188 | "ner_train_idxs = range(0, train_val_split - timesteps, timesteps)\n", 189 | "x_train = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 190 | " for i in ner_train_idxs])\n", 191 | "y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 192 | " for i in ner_train_idxs])\n", 193 | "\n", 194 | "ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)\n", 195 | "x_val = np.array([ner.iloc[i:i+timesteps, 3:].values\n", 196 | " for i in ner_val_idxs])\n", 197 | "y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)\n", 198 | " for i in ner_val_idxs])" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 11, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "____________________________________________________________________________________________________\n", 211 | "Layer (type) Output Shape Param # \n", 212 | "====================================================================================================\n", 213 | "lstm_1 (LSTM) (None, None, 32) 11008 \n", 214 | "____________________________________________________________________________________________________\n", 215 | "lstm_2 (LSTM) (None, None, 8) 1312 \n", 216 | "____________________________________________________________________________________________________\n", 217 | "time_distributed_1 (TimeDistributed) (None, None, 2) 18 \n", 218 | "====================================================================================================\n", 219 | 
"Total params: 12,338\n", 220 | "Trainable params: 12,338\n", 221 | "Non-trainable params: 0\n", 222 | "____________________________________________________________________________________________________\n", 223 | "None\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "model = Sequential()\n", 229 | "model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))\n", 230 | "model.add(LSTM(8, return_sequences=True))\n", 231 | "model.add(TimeDistributed(Dense(2, activation='softmax')))\n", 232 | "model.compile(loss='categorical_crossentropy',\n", 233 | " optimizer='adam',\n", 234 | " metrics=['categorical_accuracy'])\n", 235 | "print(model.summary(100))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "os.makedirs('tmp', exist_ok=True)\n", 247 | "checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',\n", 248 | " monitor='val_categorical_accuracy',\n", 249 | " verbose=1,\n", 250 | " save_best_only=True)\n", 251 | "\n", 252 | "class OurAUC(keras.callbacks.Callback):\n", 253 | " def on_epoch_end(self, epoch, logs={}):\n", 254 | " # Go to https://geo-extract-tester.herokuapp.com/ and download\n", 255 | " # the validation data (validation.txt).\n", 256 | " '''with open('validation.txt', encoding='utf-8') as f:\n", 257 | " s = f.read()\n", 258 | "\n", 259 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 260 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True)],\n", 261 | " axis='columns')\n", 262 | " glove_time_size = 100\n", 263 | " preds_batched = []\n", 264 | " i = 0\n", 265 | " while gloved_data[i:i+glove_time_size].size:\n", 266 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 267 | " axis=0))[0][:,1])\n", 268 | " i += glove_time_size\n", 269 | "\n", 270 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 271 | " for prob in [p for pred in preds_batched for p in pred]:\n", 272 | " f.write(str(prob) + '\\n')'''\n", 273 | "\n", 274 | " with open('validation.txt', encoding='utf-8') as f:\n", 275 | " s = f.read()\n", 276 | "\n", 277 | " gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\\n') if w]),\n", 278 | " glove.loc[[w for w in s.split('\\n') if w]].fillna(0).reset_index(drop=True),\n", 279 | " pd.DataFrame([[w[0].isnumeric()] for w in s.split('\\n') if w]),\n", 280 | " pd.DataFrame([[len(w[0])] for w in s.split('\\n') if w])],\n", 281 | " axis='columns')\n", 282 | " glove_time_size = 100\n", 283 | " preds_batched = []\n", 284 | " i = 0\n", 285 | " while gloved_data[i:i+glove_time_size].size:\n", 286 | " preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],\n", 287 | " axis=0))[0][:,1])\n", 288 | " i += glove_time_size\n", 289 | "\n", 290 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:\n", 291 | " for prob in [p for pred in preds_batched for p in pred]:\n", 292 | " f.write(str(prob) + '\\n')\n", 293 | "\n", 294 | " with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:\n", 295 | " url = 'https://geo-extract-tester.herokuapp.com/api/score'\n", 296 | " r = requests.post(url, files={'file': f})\n", 297 | " try:\n", 298 | " print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))\n", 299 | " except KeyError:\n", 300 | " raise ValueError('Problem retrieving AUC from API. 
Is your validation set up to date?')\n", 301 | "\n", 302 | "our_auc = OurAUC()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Train on 6271 samples, validate on 330 samples\n", 315 | "Epoch 1/20\n", 316 | "Epoch 00001: val_categorical_accuracy improved from -inf to 0.97697, saving model to ./tmp/weights-01.hdf5\n", 317 | "AUC: 0.92249\n", 318 | " - 60s - loss: 0.1762 - categorical_accuracy: 0.9638 - val_loss: 0.0828 - val_categorical_accuracy: 0.9770\n", 319 | "Epoch 2/20\n", 320 | "Epoch 00002: val_categorical_accuracy improved from 0.97697 to 0.97867, saving model to ./tmp/weights-02.hdf5\n", 321 | "AUC: 0.94973\n", 322 | " - 58s - loss: 0.0841 - categorical_accuracy: 0.9666 - val_loss: 0.0573 - val_categorical_accuracy: 0.9787\n", 323 | "Epoch 3/20\n", 324 | "Epoch 00003: val_categorical_accuracy improved from 0.97867 to 0.98048, saving model to ./tmp/weights-03.hdf5\n", 325 | "AUC: 0.95489\n", 326 | " - 59s - loss: 0.0708 - categorical_accuracy: 0.9734 - val_loss: 0.0526 - val_categorical_accuracy: 0.9805\n", 327 | "Epoch 4/20\n", 328 | "Epoch 00004: val_categorical_accuracy did not improve\n", 329 | "AUC: 0.95848\n", 330 | " - 59s - loss: 0.0666 - categorical_accuracy: 0.9751 - val_loss: 0.0506 - val_categorical_accuracy: 0.9792\n", 331 | "Epoch 5/20\n", 332 | "Epoch 00005: val_categorical_accuracy did not improve\n", 333 | "AUC: 0.95974\n", 334 | " - 61s - loss: 0.0639 - categorical_accuracy: 0.9760 - val_loss: 0.0487 - val_categorical_accuracy: 0.9805\n", 335 | "Epoch 6/20\n", 336 | "Epoch 00006: val_categorical_accuracy did not improve\n", 337 | "AUC: 0.96138\n", 338 | " - 63s - loss: 0.0623 - categorical_accuracy: 0.9765 - val_loss: 0.0483 - val_categorical_accuracy: 0.9804\n", 339 | "Epoch 7/20\n", 340 | "Epoch 00007: val_categorical_accuracy improved from 0.98048 to 0.98109, saving model to ./tmp/weights-07.hdf5\n", 341 | "AUC: 0.95850\n", 342 | " - 61s - loss: 0.0609 - categorical_accuracy: 0.9769 - val_loss: 0.0490 - val_categorical_accuracy: 0.9811\n", 343 | "Epoch 8/20\n", 344 | "Epoch 00008: val_categorical_accuracy did not improve\n", 345 | "AUC: 0.96303\n", 346 | " - 63s - loss: 0.0598 - categorical_accuracy: 0.9772 - val_loss: 0.0462 - val_categorical_accuracy: 0.9807\n", 347 | "Epoch 9/20\n", 348 | "Epoch 00009: val_categorical_accuracy did not improve\n", 349 | "AUC: 0.96292\n", 350 | " - 62s - loss: 0.0589 - categorical_accuracy: 0.9774 - val_loss: 0.0468 - val_categorical_accuracy: 0.9808\n", 351 | "Epoch 10/20\n", 352 | "Epoch 00010: val_categorical_accuracy did not improve\n", 353 | "AUC: 0.96326\n", 354 | " - 59s - loss: 0.0581 - categorical_accuracy: 0.9774 - val_loss: 0.0462 - val_categorical_accuracy: 0.9806\n", 355 | "Epoch 11/20\n", 356 | "Epoch 00011: val_categorical_accuracy did not improve\n", 357 | "AUC: 0.96347\n", 358 | " - 63s - loss: 0.0569 - categorical_accuracy: 0.9778 - val_loss: 0.0456 - val_categorical_accuracy: 0.9800\n", 359 | "Epoch 12/20\n", 360 | "Epoch 00012: val_categorical_accuracy did not improve\n", 361 | "AUC: 0.96203\n", 362 | " - 60s - loss: 0.0563 - categorical_accuracy: 0.9781 - val_loss: 0.0449 - val_categorical_accuracy: 0.9802\n", 363 | "Epoch 13/20\n", 364 | "Epoch 00013: val_categorical_accuracy did not improve\n", 365 | "AUC: 0.96189\n", 366 | " - 61s - loss: 0.0553 - categorical_accuracy: 0.9784 - val_loss: 0.0458 - val_categorical_accuracy: 0.9808\n", 367 | "Epoch 
14/20\n", 368 | "Epoch 00014: val_categorical_accuracy did not improve\n", 369 | "AUC: 0.95982\n", 370 | " - 60s - loss: 0.0544 - categorical_accuracy: 0.9784 - val_loss: 0.0457 - val_categorical_accuracy: 0.9810\n", 371 | "Epoch 15/20\n", 372 | "Epoch 00015: val_categorical_accuracy did not improve\n", 373 | "AUC: 0.96014\n", 374 | " - 64s - loss: 0.0536 - categorical_accuracy: 0.9788 - val_loss: 0.0465 - val_categorical_accuracy: 0.9806\n", 375 | "Epoch 16/20\n", 376 | "Epoch 00016: val_categorical_accuracy did not improve\n", 377 | "AUC: 0.96055\n", 378 | " - 62s - loss: 0.0529 - categorical_accuracy: 0.9790 - val_loss: 0.0462 - val_categorical_accuracy: 0.9808\n", 379 | "Epoch 17/20\n", 380 | "Epoch 00017: val_categorical_accuracy did not improve\n", 381 | "AUC: 0.96207\n", 382 | " - 63s - loss: 0.0522 - categorical_accuracy: 0.9793 - val_loss: 0.0464 - val_categorical_accuracy: 0.9802\n", 383 | "Epoch 18/20\n", 384 | "Epoch 00018: val_categorical_accuracy improved from 0.98109 to 0.98145, saving model to ./tmp/weights-18.hdf5\n", 385 | "AUC: 0.96180\n", 386 | " - 64s - loss: 0.0511 - categorical_accuracy: 0.9798 - val_loss: 0.0459 - val_categorical_accuracy: 0.9815\n", 387 | "Epoch 19/20\n", 388 | "Epoch 00019: val_categorical_accuracy did not improve\n", 389 | "AUC: 0.95842\n", 390 | " - 59s - loss: 0.0508 - categorical_accuracy: 0.9803 - val_loss: 0.0470 - val_categorical_accuracy: 0.9804\n", 391 | "Epoch 20/20\n", 392 | "Epoch 00020: val_categorical_accuracy did not improve\n", 393 | "AUC: 0.95720\n", 394 | " - 61s - loss: 0.0498 - categorical_accuracy: 0.9802 - val_loss: 0.0467 - val_categorical_accuracy: 0.9810\n" 395 | ] 396 | }, 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "" 401 | ] 402 | }, 403 | "execution_count": 13, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "model.fit(x_train, y_train,\n", 410 | " epochs=20,\n", 411 | " validation_data=(x_val, y_val),\n", 412 | " callbacks=[checkpointer, our_auc],\n", 413 | " verbose=2)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [] 433 | } 434 | ], 435 | "metadata": { 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.6.3" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 2 456 | } 457 | -------------------------------------------------------------------------------- /lib/notebooks/geo-string-result-explorations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "import json\n", 11 | "sys.path.append('..')\n", 12 | "\n", 13 | "import tagnews\n", 14 | "import folium" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": 
"stream", 25 | "text": [ 26 | "..\\tagnews\\utils\\load_data.py:185: RuntimeWarning: 1 location strings were not found in the bodytext.\n", 27 | " RuntimeWarning)\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "df = tagnews.load_data()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "crimetags = tagnews.CrimeTags()\n", 42 | "geoextractor = tagnews.GeoCoder()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Article ID: 205277\n", 55 | "south: 6.251640619617706\n", 56 | "Ind.,: 3.4382634669318946\n", 57 | "1800 block of East 222nd Place: 1.1430729818468413\n", 58 | "1700 block of West 220th Place,: 1.5479177721231407\n" 59 | ] 60 | }, 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
" 65 | ], 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "while True:\n", 77 | " random_sample = df.sample(1)\n", 78 | " article_text = random_sample['bodytext'].iloc[0]\n", 79 | " if crimetags.tagtext_proba(article_text).max() < .15:\n", 80 | " continue\n", 81 | " # print(article_text)\n", 82 | " geostrings = [' '.join(gs) for gs in geoextractor.extract_geostrings(article_text, prob_thresh=0.5)]\n", 83 | " geocode_results = tagnews.get_lat_longs_from_geostrings(geostrings)\n", 84 | " lat_longs_raw = geocode_results.lat_longs_raw\n", 85 | " lat_longs_post = geocode_results.lat_longs_post\n", 86 | " \n", 87 | " raw_scores = []\n", 88 | " for gr in geocode_results.full_responses_raw:\n", 89 | " try:\n", 90 | " raw_scores.append(json.loads(gr.response.content)['result'][0]['score'])\n", 91 | " except:\n", 92 | " raw_scores.append(None)\n", 93 | " post_scores = []\n", 94 | " for gr in geocode_results.full_responses_post:\n", 95 | " try:\n", 96 | " post_scores.append(json.loads(gr.response.content)['result'][0]['score'])\n", 97 | " except:\n", 98 | " post_scores.append(None)\n", 99 | "\n", 100 | " if not geostrings:\n", 101 | " continue\n", 102 | " \n", 103 | " print('Article ID: {}'.format(random_sample.index[0]))\n", 104 | "\n", 105 | " m = folium.Map(location=[41.87871, -87.6298])\n", 106 | "\n", 107 | " for geostring, lat_long_raw, lat_long_post, raw_score, post_score in zip(geostrings, lat_longs_raw, lat_longs_post, raw_scores, post_scores):\n", 108 | " if lat_long_raw is None:\n", 109 | " print(' Unable to code raw \"{}\"'.format(geostring))\n", 110 | " else:\n", 111 | " folium.Marker(lat_long_raw, popup=geostring + ' ; RAW ; {}'.format(raw_score)).add_to(m)\n", 112 | " \n", 113 | " if lat_long_post is None:\n", 114 | " print(' Unable to code post-processed \"{}\"'.format(geostring))\n", 115 | " else:\n", 116 | " folium.Marker(lat_long_post, popup=geostring + ' ; POST ; {}'.format(post_score)).add_to(m)\n", 117 | " \n", 118 | " try:\n", 119 | " print('{}: {}'.format(geostring, raw_score / post_score))\n", 120 | " except:\n", 121 | " print('{}: {}'.format(geostring, 'N/A'))\n", 122 | " break\n", 123 | "\n", 124 | "m" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "**CHICAGO (STMW) --** A 23-year-old man who was shot in south suburban Sauk Vill\n", 137 | "age died early Thursday.\n", 138 | "\n", 139 | "Manuel G. Montoya was pronounced dead at 1:19 a.m. at St. Margaret Hospital in\n", 140 | "Dyer, Ind., a Lake County coroner’s office statement said.\n", 141 | "\n", 142 | "He was shot in the 1800 block of East 222nd Place in Sauk Village, but police\n", 143 | "and a representative at the coroner’s office could not say when the shooting\n", 144 | "happened.\n", 145 | "\n", 146 | "He died of a gunshot wound, and his death was ruled a homicide, the coroner’s\n", 147 | "office said.\n", 148 | "\n", 149 | "Montoya lived in the 1700 block of West 220th Place, about half a mile\n", 150 | "northeast of the shooting.\n", 151 | "\n", 152 | "Sauk Village police could not provide further details early Thursday.\n", 153 | "\n", 154 | "_(Source: Sun-Times Media Wire (C) Chicago Sun-Times 2015. All Rights\n", 155 | "Reserved. 
This material may not be published, broadcast, rewritten, or\n", 156 | "redistributed.)_\n", 157 | "\n", 158 | "![][1]\n", 159 | "\n", 160 | " [1]: http://pixel.wp.com/b.gif?host=chicago.cbslocal.com&blog=15116062&post=6\n", 161 | "49158&subd=cbschicago&ref=&feed=1\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "print(article_text)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.6.1" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | --------------------------------------------------------------------------------
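The two notebooks above walk through the project's full pipeline: score an article's crime-type probabilities, pull location strings out of the text with the LSTM geostring extractor, then geocode those strings. Below is a minimal stand-alone sketch of that same flow as a plain script. It is not part of the repository; it assumes the tagnews package plus its trained model files and GloVe vectors are installed locally, the article text is purely illustrative, and the API calls mirror the ones used in geo-string-result-explorations.ipynb (CrimeTags, GeoCoder, extract_geostrings, get_lat_longs_from_geostrings), so exact signatures may differ across versions.

# Sketch only: end-to-end tagnews usage distilled from the notebooks above.
import tagnews

# Illustrative article snippet (any news-article body text would do).
article_text = (
    "A man was shot in the 1800 block of East 222nd Place in Sauk Village "
    "early Thursday, police said."
)

crimetags = tagnews.CrimeTags()     # crime-type classifier
geoextractor = tagnews.GeoCoder()   # LSTM geostring extractor + geocoder helpers

# Highest crime-type probability for the article, as in the notebook's
# `crimetags.tagtext_proba(article_text).max()` filter.
print("max crime-type probability:", crimetags.tagtext_proba(article_text).max())

# extract_geostrings returns lists of tokens the LSTM tagged as location
# strings; join each back into a single string.
geostrings = [
    " ".join(gs)
    for gs in geoextractor.extract_geostrings(article_text, prob_thresh=0.5)
]

# Geocode the extracted strings; lat_longs_post holds the post-processed
# coordinates, with None where geocoding failed.
results = tagnews.get_lat_longs_from_geostrings(geostrings)
for geostring, lat_long in zip(geostrings, results.lat_longs_post):
    print(geostring, "->", lat_long)

Running this prints one coordinate pair (or None) per extracted geostring, which is essentially the information the folium map in geo-string-result-explorations.ipynb visualizes with markers.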