├── .bowerrc ├── .gitignore ├── .ipynb_checkpoints ├── GetMuniBondsTest-checkpoint.ipynb ├── TableParser-checkpoint.ipynb ├── TableParser2-checkpoint.ipynb ├── TableParser3-checkpoint.ipynb └── TableParser4-checkpoint.ipynb ├── LICENSE ├── README.md ├── backend.py ├── bower.json ├── bulk_processing ├── IRR_estimate.ipynb ├── bulk_proc.sh └── tf_calc.ipynb ├── data_query.py ├── deprecated ├── GetMuniBondData.cfg ├── GetMuniBondData.py ├── GetMuniBondsTest.ipynb ├── Procfile ├── TableParser.ipynb ├── TableParser.py ├── TableParser2.ipynb ├── TableParser2.py ├── TableParser3.ipynb ├── TableParser3.py ├── deploy_notebook.sh ├── manifest.yml ├── pdf2text_bulk.ipynb └── tableparser.py ├── design ├── 1_Home.jpg ├── 2_Show.jpg ├── 3_Browse.jpg ├── browse_similar_data_tables_feature.png └── screenshot_show_example.png ├── prototyping.ipynb ├── requirements.txt ├── server.py ├── static ├── TabulaRazr_Logo.png ├── center_for_municipal_finance_logo.png ├── css │ ├── main.css │ ├── source │ │ ├── index.html │ │ └── table-images │ │ │ ├── back.png │ │ │ ├── blurry.jpg │ │ │ ├── botleft.png │ │ │ ├── botright.png │ │ │ ├── gradback.png │ │ │ ├── gradhead.png │ │ │ ├── gradhover.png │ │ │ ├── header.jpg │ │ │ ├── left.png │ │ │ ├── pattern-head.png │ │ │ ├── pattern.gif │ │ │ ├── pattern.png │ │ │ ├── patternb-head.png │ │ │ ├── patternb.png │ │ │ └── right.png │ └── style.css ├── filters │ ├── funds.json │ └── maturity_schedule.json ├── scrutiny.png └── xirr_calculator.png ├── templates ├── filtered_project.html ├── index.html ├── inspector.html ├── view_filtered.html └── viewer.html └── xirr_calc.py /.bowerrc: -------------------------------------------------------------------------------- 1 | { 2 | "directory": "static/bower_components" 3 | } 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cf 2 | nohup.out 3 | guni.log 4 | static/ug/* 5 | .ipynb_checkpoints/ 6 | bulk_processing/.ipynb_checkpoints/ 7 | prototyping.py 8 | *.pyc 9 | static/bower_components/ 10 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/GetMuniBondsTest-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "skipping pdf/ER862677-ER674128-ER1075876.pdf, already exists.\n", 15 | "skipping pdf/EP849915-EP657701-EP1059361.pdf, already exists.\n", 16 | "skipping pdf/ER866175-ER676833-ER1078611.pdf, already exists.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "from __future__ import print_function\n", 22 | "import re\n", 23 | "import os\n", 24 | "import codecs\n", 25 | "import string\n", 26 | "\n", 27 | "#Convert all pdfs\n", 28 | "files = os.listdir('pdf')\n", 29 | "for i,f in enumerate(files):\n", 30 | "\n", 31 | " pdf_path = os.path.join('pdf', f)\n", 32 | " txt_path = os.path.join('txt', f+'.txt')\n", 33 | " \n", 34 | " if not os.path.isfile(txt_path):\n", 35 | " #Layout preservation crucial to maintain clues about tabular data\n", 36 | " cmd = \"pdftotext -layout %s %s\" % (pdf_path, txt_path)\n", 37 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 38 | " os.system(cmd)\n", 39 | " else:\n", 40 | " print ('skipping %s, already exists.' 
% (pdf_path, ))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 147, 46 | "metadata": { 47 | "collapsed": false, 48 | "scrolled": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 56 | "HEADER 1 $95,885,000\n", 57 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 58 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 59 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 60 | "HEADER 1 $6,645,000\n", 61 | "HEADER 2 CITY OF PALM SPRINGS\n", 62 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 63 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 64 | "HEADER 1 $19,560,000\n", 65 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 66 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 92-1\n", 67 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 68 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 69 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 70 | "TOTAL SOURCES $22,302,178.29\n", 71 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 72 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 73 | "TOTAL USES $22,302,178.29\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "#Existing Version\n", 79 | "for file in os.listdir('txt'):\n", 80 | " \n", 81 | " print (\"--------\" + file + \"--------\")\n", 82 | " \n", 83 | " printline = 0\n", 84 | " linesleft = 0\n", 85 | " blanklines = 0\n", 86 | " \n", 87 | " topfound = 0\n", 88 | " headerline = 0 \n", 89 | " \n", 90 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 91 | " for i, line in enumerate(f):\n", 92 | "\n", 93 | " strippedline = line.upper().strip()\n", 94 | "\n", 95 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 96 | " headerline = 1\n", 97 | " topfound = 1\n", 98 | "\n", 99 | " if 1 <= headerline <= 3:\n", 100 | " caption = \"HEADER \" + str(headerline)\n", 101 | " value = strippedline\n", 102 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 103 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 104 | " headerline = headerline + 1\n", 105 | " continue\n", 106 | "\n", 107 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 108 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 109 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 110 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 111 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 112 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 113 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\":\n", 114 | " printline = 1\n", 115 | " linesleft = 25\n", 116 | "\n", 117 | " if printline == 1:\n", 118 | " dollar_amount_regex = re.compile(\"[\\$]{0,1}[\\s]{0,6}[0-9,]{0,15}(\\.[0-9]{1,2})$\")\n", 119 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 120 | " if dollar_amount_match:\n", 121 | " caption = strippedline[:dollar_amount_match.start(0)].strip()\n", 122 | " value = strippedline[dollar_amount_match.start(0):].strip()\n", 123 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 124 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 125 | " if len(line.strip()) < 5 and linesleft < 10:\n", 126 | " blanklines = blanklines + 1\n", 127 | " linesleft = linesleft - 1\n", 128 | "\n", 129 | " if linesleft == 0:\n", 130 | " 
printline = 0" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "#Issues:\n", 142 | "## Doesn't pick up caption in EP1059361 --> add USES OF FUNDS but then no SOURCES OF PAYMENTS\n", 143 | "## Doesn't pick up line items in ER1075876 --> match sequences of .... to indicate tables as well, plus be more lenient with cents values\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 154, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 158 | "HEADER 1 $95,885,000\n", 159 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 160 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 161 | "PRINCIPAL AMOUNT $ 95,885,000\n", 162 | "BOND PREMIUM 12,984,339\n", 163 | "OTHER AVAILABLE FUNDS(1) 6,600,643 \n", 164 | "TOTAL SOURCES $115,469,982\n", 165 | "DEPOSIT TO ACQUISITION FUND $ 41,000,000\n", 166 | "RETIREMENT OF WATER REVENUE ANTICIPATION NOTES(2) 14,000,000\n", 167 | "DEPOSIT TO ESCROW FUND FOR REFUNDED 2008 BONDS 52,742,691\n", 168 | "DISCHARGE OF STATE LOAN 7,096,550 \n", 169 | "COSTS OF ISSUANCE(3) 630,741 \n", 170 | "TOTAL USES $115,469,982\n", 171 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 172 | "HEADER 1 $6,645,000\n", 173 | "HEADER 2 CITY OF PALM SPRINGS\n", 174 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 175 | "TRANSFER TO ESCROW BANK $6,086,693.08\n", 176 | "RESERVE FUND (1) 274,331.25\n", 177 | "COSTS OF ISSUANCE FUND (2) 152,404.72\n", 178 | "TOTAL USES $6,513,429.05\n", 179 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 180 | "HEADER 1 $19,560,000\n", 181 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 182 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 
92-1\n", 183 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 184 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 185 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 186 | "TOTAL SOURCES $22,302,178.29\n", 187 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 188 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 189 | "TOTAL USES $22,302,178.29\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "#New Version\n", 195 | "for file in os.listdir('txt'):\n", 196 | " \n", 197 | " print (\"--------\" + file + \"--------\")\n", 198 | " \n", 199 | " printline = 0\n", 200 | " linesleft = 0\n", 201 | " blanklines = 0\n", 202 | " \n", 203 | " topfound = 0\n", 204 | " headerline = 0 \n", 205 | " \n", 206 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 207 | " for i, line in enumerate(f):\n", 208 | "\n", 209 | " \n", 210 | " strippedline = line.upper().strip()\n", 211 | "\n", 212 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 213 | " headerline = 1\n", 214 | " topfound = 1\n", 215 | "\n", 216 | " if 1 <= headerline <= 3:\n", 217 | " caption = \"HEADER \" + str(headerline)\n", 218 | " value = strippedline\n", 219 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 220 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 221 | " headerline = headerline + 1\n", 222 | " continue\n", 223 | "\n", 224 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 225 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 226 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 227 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 228 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 229 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 230 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 231 | " or strippedline == \"ESTIMATED USES OF FUNDS\": #New\n", 232 | " printline = 1\n", 233 | " linesleft = 25\n", 234 | " #print (\"#### line:\", i, \"to\", i+linesleft)\n", 235 | "\n", 236 | " if printline == 1:\n", 237 | " #Include a minimum of preceding dots or whitespace\n", 238 | " #Group 1 = preceding whitespace\n", 239 | " #Group 2 = Dollar value\n", 240 | " #Group 3 = $Cents value if existing\n", 241 | " dollar_amount_regex = ur\"([\\.]{4,}|[\\s]{4,})[\\s]*\" + \\\n", 242 | " ur\"([\\$]{0,1}[\\s]{0,6}[0-9,]{2,15})(\\.[0-9]{1,2})?$\"\n", 243 | " dollar_amount_regex = re.compile(dollar_amount_regex)\n", 244 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 245 | " \n", 246 | " #Check whether we found something tabular and a dollar value\n", 247 | " if dollar_amount_match and dollar_amount_match.group(2):\n", 248 | " caption = strippedline[:dollar_amount_match.start(1)].strip()\n", 249 | " value = strippedline[dollar_amount_match.start(2):].strip()\n", 250 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 251 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 252 | " if len(line.strip()) < 5 and linesleft < 10:\n", 253 | " blanklines = blanklines + 1\n", 254 | " linesleft = linesleft - 1\n", 255 | "\n", 256 | " if linesleft == 0:\n", 257 | " printline = 0" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 150, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | 
"--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 272 | "27 - issuance of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 273 | "\n", 274 | "229 - ESTIMATED SOURCES AND USES OF FUNDS ................................................................................... 7\n", 275 | "\n", 276 | "370 - of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 277 | "\n", 278 | "653 - ESTIMATED SOURCES AND USES OF FUNDS\n", 279 | "\n", 280 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 281 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 282 | "223 - ESTIMATED SOURCES AND USES OF FUNDS .................................................................................. 13 \n", 283 | "\n", 284 | "429 - Bonds. See “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 285 | "\n", 286 | "715 - “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 287 | "\n", 288 | "983 - ESTIMATED SOURCES AND USES OF FUNDS\n", 289 | "\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "#Some exploration\n", 295 | "max_distance_below = 25\n", 296 | "max_distance_above = 5\n", 297 | "context_identifier = u\"SOURCES AND USES OF FUNDS|SOURCES AND USES OF FUNDS*|ESTIMATED SOURCES AND USES OF FUNDS|\" + \\\n", 298 | " \"ESTIMATED SOURCES AND USES OF FUNDS*|SOURCES AND USES OF FUNDS(1)|\" + \\\n", 299 | " \"ESTIMATED SOURCES AND USES OF FUNDS(1)|PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\"\n", 300 | "context_identifier = context_identifier.split(u\"|\")\n", 301 | "\n", 302 | "for file in os.listdir('txt'):\n", 303 | " \n", 304 | " print (\"--------\" + file + \"--------\")\n", 305 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 306 | " for i, line in enumerate(f):\n", 307 | " \n", 308 | " #Print Candidates\n", 309 | " id_found = reduce(lambda x,y: x or y, ( (id in line) for id in context_identifier ))\n", 310 | " if id_found:\n", 311 | " print(i, '-', line)\n", 312 | " " 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 2", 328 | "language": "python", 329 | "name": "python2" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 2 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython2", 341 | "version": "2.7.6" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 0 346 | } 347 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/TableParser4-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#TabulaRazr - specific to calculate - TABLE Parser\n", 12 | "#Infers a table with arbitrary number of columns from reoccuring patterns in text lines\n", 13 | "#(c) Alexander Hirner 2016, no redistribution without permission\n", 14 | "#Contributions: ____ (refactoring), UI styling (), ....\n", 15 | "\n", 16 | "\n", 17 | "#Main assumptions Table identificatin:\n", 18 | "#1) each row is either in one line or not a row at all\n", 19 | "#2) each column features at least one number (=dollar amount)\n", 20 | "#2a) 
each column features at least one date-like string [for time-series only]\n", 21 | "#3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] \n", 22 | "#4) each column is separated by more than x consecutive whitespace indicators (e.g. ' ' or '..')\n", 23 | "\n", 24 | "#Feature List Todo:\n", 25 | "#1) Acknowledge footnotes / make lower meta-data available\n", 26 | "#2) make delimiter length smartly dependent on number of columns (possible iterative approach)\n", 27 | "#3) improve captioning: expand non canonical values in tables [DONE] .. but not to the extent how types match up --> use this to further\n", 28 | "## delineate between caption and headers\n", 29 | "#4) UI: parameterize extraction on the show page on the fly\n", 30 | "#5) deeper type inference on token level: type complex [DONE], subtype header (centered, capitalized), \n", 31 | "## subtype page nr., type free flow [DONE, need paragraph]\n", 32 | "#5a) re\n", 33 | "#6) Respect negative values with potential '-' for numerical values\n", 34 | "#7)\n", 35 | "#8) classify tables with keywords (Muni Bonds) and unsupervised clustering (Hackathon)\n", 36 | "#9) Restructure folder and URI around MD5 hash (http://stackoverflow.com/questions/24570066/calculate-md5-from-werkzeug-datastructures-filestorage-without-saving-the-object)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from backend import *\n", 48 | "import os" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import json\n", 60 | "from flask import Flask, request, redirect, url_for, send_from_directory\n", 61 | "from werkzeug import secure_filename\n", 62 | "from flask import jsonify, render_template, make_response\n", 63 | "\n", 64 | "import matplotlib.pyplot as plt" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "TITLE = \"TabulaRazr (XIRR for muni_bonds)\"\n", 76 | "\n", 77 | "scripts = []\n", 78 | "css = [\n", 79 | " \"./bower_components/bootstrap/dist/css/bootstrap.min.css\",\n", 80 | " \"./css/main.css\",\n", 81 | " \"./css/style.css\"\n", 82 | "]\n", 83 | "\n", 84 | "\n", 85 | "UPLOAD_FOLDER = './static/ug'\n", 86 | "ALLOWED_EXTENSIONS = set(['txt', 'pdf'])\n", 87 | "\n", 88 | "TITLE = \"TabulaRazr\"\n", 89 | "\n", 90 | "app = Flask(__name__)\n", 91 | "app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER\n", 92 | "\n", 93 | "def get_extension(filename):\n", 94 | " return '.' 
in filename and \\\n",
95 | " filename.rsplit('.', 1)[1] \n",
96 | "\n",
97 | "def allowed_file(filename):\n",
98 | " return get_extension(filename) in ALLOWED_EXTENSIONS\n",
99 | "\n",
100 | "@app.route('/', methods=['GET', 'POST'])\n",
101 | "def upload_file():\n",
102 | "\n",
103 | " if request.method == 'POST':\n",
104 | " \n",
105 | " file = request.files['file']\n",
106 | " project = request.form['project']\n",
107 | " \n",
108 | " if file and allowed_file(file.filename):\n",
109 | " filename = secure_filename(file.filename)\n",
110 | " extension = get_extension(file.filename)\n",
111 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
112 | " \n",
113 | " file.save(os.path.join(app.config['UPLOAD_FOLDER'], project, filename))\n",
114 | " \n",
115 | " if extension == \"pdf\":\n",
116 | " txt_path = path+'.txt'\n",
117 | " filename += '.txt' \n",
118 | " if not os.path.isfile(txt_path):\n",
119 | " #Layout preservation crucial to preserve clues about tabular data\n",
120 | " cmd = \"pdftotext -enc UTF-8 -layout %s %s \" % (path, txt_path)\n",
121 | " os.system(cmd) \n",
122 | "\n",
123 | " return redirect(url_for('analyze', filename=filename, project=project))\n",
124 | "\n",
125 | " return render_template('index.html',\n",
126 | " title=TITLE ,\n",
127 | " css=css)\n",
128 | "\n",
129 | "@app.route('/analyze/<filename>', methods=['GET', 'POST'])\n",
130 | "def analyze(filename): \n",
131 | "\n",
132 | " project = request.args.get('project')\n",
133 | " txt_path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
134 | " \n",
135 | " if not os.path.isfile(txt_path):\n",
136 | " return {'error' : txt_path+' not found' }\n",
137 | " \n",
138 | " tables = return_tables(txt_path)\n",
139 | " \n",
140 | " #Export tables\n",
141 | " with codecs.open(txt_path + '.tables.json', 'w', \"utf-8\") as file:\n",
142 | " json.dump(tables, file)\n",
143 | "\n",
144 | " #Export chart\n",
145 | " lines_per_page = 80\n",
146 | " nr_data_rows = []\n",
147 | " #for t in tables.values():\n",
148 | " # print t\n",
149 | " for key, t in tables.iteritems():\n",
150 | " e = t['end_line']\n",
151 | " b = t['begin_line']\n",
152 | " for l in range(b, e):\n",
153 | " page = l / lines_per_page\n",
154 | " if len(nr_data_rows) <= page:\n",
155 | " nr_data_rows += ([0]*(page-len(nr_data_rows)+1))\n",
156 | " nr_data_rows[page] += 1\n",
157 | " dr = pd.DataFrame()\n",
158 | " dr['value'] = nr_data_rows\n",
159 | " dr['page'] = range(0, len(dr))\n",
160 | " \n",
161 | " #plot the row density\n",
162 | " chart = filename+\".png\"\n",
163 | " fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(8,3) ) # create figure & 1 axis\n",
164 | " ax.set_xlabel('page nr.')\n",
165 | " ax.set_ylabel('number of data rows')\n",
166 | " ax.set_title('Distribution of Rows with Data')\n",
167 | " ax.plot(dr['page'], dr['value'], )\n",
168 | " fig.savefig(txt_path + '.png') # save the figure to file\n",
169 | " plt.close(fig) # close the figure\n",
170 | "\n",
171 | " if request.method == 'POST':\n",
172 | " return json.dumps(tables)\n",
173 | " \n",
174 | " return redirect(url_for('uploaded_file', filename=filename, project=project))\n",
175 | " \n",
176 | "\n",
177 | "@app.route('/show/<filename>')\n",
178 | "def uploaded_file(filename):\n",
179 | "\n",
180 | " project = request.args.get('project') \n",
181 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
182 | " \n",
183 | " tables_path = path + '.tables.json'\n",
184 | " chart_path = path+\".png\"\n",
185 | " \n",
186 | " if not os.path.isfile(tables_path):\n",
187 | " analyze(path)\n",
188 | "\n",
189 | " with codecs.open(tables_path) as file:\n",
190 | " tables = json.load(file) \n",
191 | "\n",
192 | " #Create HTML\n",
193 | " notices = ['Extraction Results for ' + filename, 'Ordered by lines'] \n",
194 | " dfs = (table_to_df(table).to_html() for table in tables.values())\n",
195 | " headers = []\n",
196 | " for t in tables.values():\n",
197 | " if 'header' in t:\n",
198 | " headers.append(t['header'])\n",
199 | " else:\n",
200 | " headers.append('-')\n",
201 | " meta_data = [{'begin_line' : t['begin_line'], 'end_line' : t['end_line']} for t in tables.values()]\n",
202 | "\n",
203 | " return render_template('viewer.html',\n",
204 | " title=TITLE + ' - ' + filename,\n",
205 | " base_scripts=scripts, filename=filename, project=project,\n",
206 | " css=css, notices = notices, tables = dfs, headers=headers, meta_data=meta_data, chart=chart_path)\n",
207 | "\n",
208 | "@app.route('/inspector/<filename>')\n",
209 | "def inspector(filename):\n",
210 | " extension = 'txt'\n",
211 | " path = os.path.join(app.config['UPLOAD_FOLDER'], extension, filename)\n",
212 | " begin_line = int(request.args.get('data_begin'))\n",
213 | " end_line = int(request.args.get('data_end'))\n",
214 | " margin_top = config[\"meta_info_lines_above\"]\n",
215 | " margin_bottom = margin_top\n",
216 | " \n",
217 | " notices = ['showing data lines from %i to %i with %i meta-lines above and below' % (begin_line, end_line, margin_top)]\n",
218 | " with codecs.open(path, \"r\", \"utf-8\") as file:\n",
219 | " lines = [l.encode('utf-8') for l in file][begin_line - margin_top:end_line + margin_bottom]\n",
220 | " top_lines = lines[:margin_top]\n",
221 | " table_lines = lines[margin_top:margin_top+end_line-begin_line]\n",
222 | " bottom_lines = lines[margin_top+end_line-begin_line:]\n",
223 | " \n",
224 | " offset = begin_line-margin_top\n",
225 | " table_id = begin_line\n",
226 | " \n",
227 | " return render_template('inspector.html',\n",
228 | " title=TITLE,\n",
229 | " base_scripts=scripts, css=css, notices = notices, filename=filename, top_lines=top_lines, \n",
230 | " table_lines=table_lines, bottom_lines=bottom_lines, offset=offset, table_id=begin_line)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "name": "stderr",
242 | "output_type": "stream",
243 | "text": [
244 | "ERROR:__main__:Exception on / [POST]\n",
245 | "Traceback (most recent call last):\n",
246 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1817, in wsgi_app\n",
247 | " response = self.full_dispatch_request()\n",
248 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1477, in full_dispatch_request\n",
249 | " rv = self.handle_user_exception(e)\n",
250 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1381, in handle_user_exception\n",
251 | " reraise(exc_type, exc_value, tb)\n",
252 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1475, in full_dispatch_request\n",
253 | " rv = self.dispatch_request()\n",
254 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1461, in dispatch_request\n",
255 | " return self.view_functions[rule.endpoint](**req.view_args)\n",
256 | " File \"\", line 36, in upload_file\n",
257 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
258 | "NameError: global name 'os' is not defined\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "def run_from_ipython():\n",
264 | " try:\n",
265 | " __IPYTHON__\n",
266 | " return True\n",
267 | " except NameError:\n",
268 | " return False\n",
269 | "\n",
270 | "if run_from_ipython():\n",
271 | " app.run(host='0.0.0.0', port = 8080)\n",
272 | "else:\n",
273 | " PORT = int(os.getenv('PORT', 8080))\n",
274 | " app.run(debug=True, host='0.0.0.0', port = PORT)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {
281 | "collapsed": true
282 | },
283 | "outputs": [],
284 | "source": []
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "Python 2",
290 | "language": "python",
291 | "name": "python2"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 2
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython2",
303 | "version": "2.7.10"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 0
308 | }
309 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # TabulaRazr
2 | **Extract and browse tabular data from legacy financial documents with ease**.
3 | 
4 | This repository is a partial release from prior work and the Top 5 submission at [DeveloperWeek 2016](http://accelerate.im/projects/362) ([video presentation](https://www.youtube.com/watch?v=Snqul2fJT5c)). The more elaborate version builds semantic links between tables to efficiently compare deals and aggregate otherwise disconnected knowledge from a large collection of documents.
5 | 
6 | Issues, forks and heavy usage welcome. Distributed under AGPL v3.
7 | 
8 | # Usage
9 | After uploading a `.txt` or `.pdf` document, all identified tables are presented as well as where they occur in the document.
10 | ![View on Document](/../xirr-specific/design/screenshot_show_example.png?raw=true "Municipal Bond of Flint")
11 | The screenshot shows a bond used to construct **public buildings in Jurupa's school district**, Riverside County.
12 | Additional information, such as inferred data types and positional features of table cells, is cached in `.json` files on the local filesystem.
13 | 
14 | Once the data is structured and annotated, it is relatively easy to automatically calculate domain-specific key figures. This customized version includes an experimental calculation of the [internal rate of return](http://www.investopedia.com/terms/i/irr.asp) for Municipal Bonds (a sketch of the computation is given at the end of this README). Often, auxiliary information such as unemployment rates is surfaced, which again can be used as a basis to aggregate hidden knowledge.
15 | 
16 | # Setup and run
17 | 
18 | ### Initial setup and run
19 | 
20 | npm install -g bower
21 | pip install -r requirements.txt
22 | bower install
23 | python server.py
24 | 
25 | 
26 | 
27 | Navigate to `http://localhost:7081` and upload an example document (see below).
28 | You may set the PORT environment variable to use a port other than 7081.
29 | 
30 | ### Updating
31 | 
32 | git pull
33 | pip install -r requirements.txt
34 | bower install
35 | 
36 | 
37 | 
38 | # Folder structure
39 | - /templates ... Jinja2 html templates
40 | - /static ... all stylesheets and media go there
41 | - /static/ug/ ... user uploaded data and analysis files (graphs, json)
42 | 
43 | # Example documents
44 | 
45 | One running instance with Municipal Bonds and other document categories lives at: http://tabularazr.eastus.cloudapp.azure.com:7081
46 | 
47 | | Document | Category |
48 | |----------|---------:|
49 | |**Municipal Bond of the City of Flint:** [Debt Service Schedule](http://tabularazr.eastus.cloudapp.azure.com:7081/show/muni_bonds/ER544111-ER421289-ER823264.pdf.txt#1581)|Municipal Bond|
50 | |**Deep Learning Paper:** [Empirical Findings](http://tabularazr.eastus.cloudapp.azure.com:7081/show/_other/sentence_entailment_attention_LSTM.pdf.txt)|other|
51 | |**Annual Report Bosch 2014:** [Sales Figures](http://tabularazr.eastus.cloudapp.azure.com:7081/show/business_reports/Bosch_Annual_Report_2014_Financial_Report.pdf.txt#2238)|Business Report|
52 | |**Annual Report Oakland:** [Income per Sector from 2006 to 2010](http://tabularazr.eastus.cloudapp.azure.com:7081/show/muni_bonds/ER544111-ER421289-ER823264.pdf.txt#3533)|(Business) Report|
53 | |**EY's Biotech Report 2015:** [Europe's Top IPOs in 2014](http://tabularazr.eastus.cloudapp.azure.com:7081/show/business_reports/EY-beyond-borders-2015.pdf.txt#2946)|Business Report|
54 | 
55 | # Other documents
56 | Choose any financial document, research paper or annual report to upload yourself, or browse these sources.
57 | 
58 | ### Example pdfs from public data (municipal bonds, audit reports, financial reviews)
59 | 
60 | - http://emma.msrb.org/EP753324-ER508056-ER910760.pdf
61 | - http://emma.msrb.org/EP407966-EP321048-EP717328.pdf
62 | - http://emma.msrb.org/ER544111-ER421289-ER823264.pdf (very high cost of issuance)
63 | - http://emma.msrb.org/MS132788-MS108096-MD209140.pdf (1997 bond issue)
64 | 
65 | #### Works with XIRR calculation feature
66 | 
67 | These documents can be successfully processed by the XIRR feature:
68 | 
69 | - http://emma.msrb.org/ER588705-ER457598-ER860368.pdf
70 | 
71 | ### Other documents that may be of interest:
72 | 
73 | - https://treas-secure.state.mi.us/LAFDocSearch/tl41R01.aspx?&lu_id=1349&doc_yr=2015&doc_code=AUD (2015 Audit)
74 | - https://treas-secure.state.mi.us/LAFDocSearch/tl41R01.aspx?&lu_id=1349&doc_yr=2014&doc_code=AUD (2014 Audit)
75 | - http://www.michigan.gov/documents/treasury/Flint-ReviewTeamReport-11-7-11_417437_7.pdf (Review Team Report used to determine that the city faced a financial emergency)
-------------------------------------------------------------------------------- /backend.py: --------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import re
4 | 
5 | import codecs
6 | import string
7 | 
8 | from collections import Counter, OrderedDict
9 | 
10 | config = { "min_delimiter_length" : 3, "min_columns": 2, "min_consecutive_rows" : 3, "max_grace_rows" : 4,
11 | "caption_assign_tolerance" : 15.0, "meta_info_lines_above" : 10, "count_ws_header" : False,
12 | "threshold_caption_extension" : 0.45, "number_compatibility" : 0.5,
13 | "header_good_candidate_length" : 3, "complex_leftover_threshold" : 3, "min_canonical_rows" : 0.1,
14 | "min_fuzzy_ratio" : 0.75 }
15 | 
16 | import numpy as np
17 | import pandas as pd
18 | 
19 | ### Tokenize and Tag
20 | 
21 | #Regex tester online: https://regex101.com
22 | #Contrast with Basic table parsing capabilities of http://docs.astropy.org/en/latest/io/ascii/index.html
23 | 
24 | tokenize_pattern = ur"[.]{%i,}|[\ \$]{%i,}|" % ((config['min_delimiter_length'],)*2)
25 | tokenize_pattern = ur"[.\ \$]{%i,}" % (config['min_delimiter_length'],)
26 | footnote_indicator = ur"[^,_!a-zA-Z0-9.]"
27 | 
28 | column_pattern = OrderedDict()
29 | #column_pattern['large_num'] = ur"\d{1,3}(,\d{3})*(\.\d+)?"
30 | column_pattern['large_num'] = ur"(([0-9]{1,3})(,\d{3})+(\.[0-9]{2})?)"
31 | column_pattern['small_float'] = ur"[0-9]+\.[0-9]+"
32 | column_pattern['integer'] = ur"^\s*[0-9]+\s*$"
33 | #column_pattern['delimiter'] = "[_=]{6,}"
34 | #column_pattern['other'] = ur"([a-zA-Z0-9]{2,}\w)"
35 | column_pattern['other'] = ur".+"
36 | 
37 | subtype_indicator = OrderedDict()
38 | subtype_indicator['dollar'] = ur".*\$.*"
39 | subtype_indicator['rate'] = ur"[%]"
40 | #enter full set of date patterns here if we want refinement early on
41 | subtype_indicator['year'] = ur"(20[0-9]{2})|(19[0-9]{2})"
42 | 
43 | 
44 | 
45 | import dateutil.parser as date_parser
46 | from datetime import date
47 | #Implement footnote handling from leftovers
48 | def tag_token(token, ws):
49 | for t, p in column_pattern.iteritems():
50 | result = re.search(p, token)
51 | if result:
52 | leftover = token[:result.start()], token[result.end():]
53 | lr = "".join(leftover)
54 | value = token[result.start():result.end()]
55 | 
56 | if len(lr) >= config['complex_leftover_threshold']:
57 | return "complex", "unknown", token, leftover
58 | elif len(lr) == 0:
59 | leftover = None
60 | 
61 | subtype = "none"
62 | #First match on left-overs
63 | for sub, indicator in subtype_indicator.iteritems():
64 | if re.match(indicator, lr): subtype = sub
65 | #Only if no indicator matched there, try on full token
66 | if subtype == "none":
67 | for sub, indicator in subtype_indicator.iteritems():
68 | if re.match(indicator, token): subtype = sub
69 | #Only if no indicator matched again, try on whitespace
70 | if subtype == "none":
71 | for sub, indicator in subtype_indicator.iteritems():
72 | if re.match(indicator, ws): subtype = sub
73 | 
74 | if subtype == "none" and t == "other":
75 | #No leftovers possible because fuzzy_token not implemented despite documented
76 | today = date.today()
77 | v_ascii = value.encode("ascii", errors="ignore")
78 | try:
79 | dt = date_parser.parse(v_ascii, fuzzy=True, default=today)
80 | if dt != today:
81 | return t, "date", value, leftover
82 | except:
83 | pass
84 | 
85 | return t, subtype, value, leftover
86 | #Try date at last:
87 | 
88 | return "unknown", "none", token, ""
89 | 
90 | def row_feature(line):
91 | matches = re.finditer(tokenize_pattern, line)
92 | start_end = [ (match.start(), match.end()) for match in matches]
93 | #No delimiter found so it's free flowing text, i.e. part of a paragraph
94 | if len(start_end) < 1:
95 | if len(line) == 0:
96 | return ()
97 | else:
98 | return [{'start' : 0, 'value' : line, 'type' : 'freeflow', 'subtype' : 'none'}]
99 | 
100 | tokens = re.split(tokenize_pattern, line)
101 | if tokens[0] == "":
102 | tokens = tokens[1:]
103 | else:
104 | start_end = [(0,0)] + start_end
105 | 
106 | features = []
107 | for se, token in zip(start_end, tokens):
108 | t, subtype, value, leftover = tag_token(token, line[se[0]:se[1]])
109 | feature = {"start" : se[1], "value" : value, "type" : t, "subtype" : subtype}
110 | if leftover: feature["leftover"] = leftover
111 | features.append(feature)
112 | return features
113 | 
114 | 
115 | #Establish whether the number of columns reaches the minimum and whether there is at least one number
116 | def row_qualifies(row):
117 | return row is not None and len(row) >= config['min_columns'] and \
118 | sum( 1 if c['type'] in ['large_num', 'small_float', 'integer'] else 0 for c in row) > 0
119 | 
120 | 
121 | ### Scope tables
122 | 
123 | #Non-qualified rows trigger a consistency check but are tolerated for up to max_grace_rows (whitespace, breakline, junk)
124 | def filter_row_spans_new(row_features, row_qualifies=row_qualifies):
125 | 
126 | min_consecutive = config["min_consecutive_rows"]
127 | grace_rows = config['max_grace_rows']
128 | 
129 | last_qualified = None
130 | consecutive = 0
131 | underqualified = 0
132 | consistency_check = False
133 | i = 0
134 | 
135 | for j, row in enumerate(row_features):
136 | qualifies = row_qualifies(row)
137 | if consistency_check:
138 | #print "BENCHMARKING %s AGAINST:" % row_to_string(row), row_to_string(row_features[last_qualified], 'type')
139 | if not row_type_compatible(row_features[last_qualified], row):
140 | qualifies = False
141 | consistency_check = False
142 | #print qualifies, row_to_string(row)
143 | 
144 | if qualifies:
145 | if last_qualified is None:
146 | last_qualified = i
147 | consecutive = 1
148 | else:
149 | consecutive += 1
150 | else:
151 | underqualified += 1
152 | if underqualified > grace_rows:
153 | if consecutive >= min_consecutive:
154 | #TODO: do post splitting upon type check and benchmark
155 | #print "YIELDED from", last_qualified, "to", i-underqualified+1
156 | yield last_qualified, i-underqualified+1
157 | 
158 | last_qualified = None
159 | consecutive = 0
160 | underqualified = 0
161 | consistency_check = False
162 | else:
163 | if last_qualified:
164 | consistency_check = True
165 | #print i, last_qualified, consecutive, consistency_check, row_to_string(row)
166 | i += 1
167 | 
168 | if consecutive >= min_consecutive:
169 | yield last_qualified, i-underqualified
170 | 
171 | def row_to_string(row, key='value', sep='|'):
172 | return sep.join(c[key] for c in row)
173 | 
174 | def row_type_compatible(row_canonical, row_test):
175 | #Test whether to break because types differ too much
176 | no_fit = 0.0
177 | for c in row_test:
178 | dist = (abs(c['start']-lc['start']) for lc in row_canonical)
179 | val, idx = min((val, idx) for (idx, val) in enumerate(dist))
180 | if c['type'] != row_canonical[idx]['type']:
181 | no_fit += 1.0
182 | number = ('large_num', 'small_float', 'integer')
183 | if c['type'] in number and row_canonical[idx]['type'] in number:
184 | no_fit -= config["number_compatibility"]
185 | 
186 | fraction_no_fit = no_fit / float(len(row_test))
187 | #print "test row", row_to_string(row_test), ") against types (", row_to_string(row_canonical, 'type'), ") has %f unmatching types" % fraction_no_fit, "yields", fraction_no_fit <
config["threshold_caption_extension"] 188 | return fraction_no_fit < config["threshold_caption_extension"] 189 | 190 | def filter_row_spans(row_features, row_qualifies): 191 | 192 | min_consecutive = config["min_consecutive_rows"] 193 | grace_rows = config['max_grace_rows'] 194 | 195 | last_qualified = None 196 | consecutive = 0 197 | underqualified = 0 198 | underqualified_rows = [] #Tuples of row number and the row 199 | 200 | i = 0 201 | 202 | for j, row in enumerate(row_features): 203 | if row_qualifies(row): 204 | underqualified = 0 205 | if last_qualified is None: 206 | last_qualified = i 207 | consecutive = 1 208 | else: 209 | consecutive += 1 210 | else: 211 | underqualified += 1 212 | underqualified_rows.append((j, row) ) 213 | if underqualified > grace_rows: 214 | if consecutive >= min_consecutive: 215 | yield last_qualified, i-underqualified+1 216 | 217 | last_qualified = None 218 | consecutive = 0 219 | underqualified = 0 220 | #print i, underqualified, last_qualified, consecutive#, "" or row 221 | i += 1 222 | 223 | if consecutive >= min_consecutive: 224 | yield last_qualified, i-underqualified 225 | 226 | ### Structure and convert tables 227 | 228 | 229 | def row_to_string(row, key='value', sep='|'): 230 | return sep.join(c[key] for c in row) 231 | 232 | 233 | def readjust_cols(feature_row, slots): 234 | 235 | feature_new = [{'value' : 'NaN'}] * len(slots) 236 | for v in feature_row: 237 | dist = (abs((float(v['start'])) - s) for s in slots) 238 | val , idx = min((val, idx) for (idx, val) in enumerate(dist)) 239 | if val <= config['caption_assign_tolerance']: feature_new[idx] = v 240 | 241 | return feature_new 242 | 243 | 244 | def normalize_rows(rows_in, structure): 245 | slots = [c['start'] for c in structure] 246 | nrcols = len(structure) 247 | 248 | for r in rows_in: 249 | if len(r) != nrcols: 250 | if len(r)/float(nrcols) > config['threshold_caption_extension']: 251 | yield readjust_cols(r, slots) 252 | else: 253 | yield r 254 | 255 | #TODO: make side-effect free 256 | def structure_rows(row_features, meta_features): 257 | #Determine maximum nr. 
of columns 258 | lengths = Counter(len(r) for r in row_features) 259 | nrcols = config['min_columns'] 260 | for l in sorted(lengths.keys(), reverse=True): 261 | nr_of_l_rows = lengths[l] 262 | if nr_of_l_rows/float(len(row_features)) > config['min_canonical_rows']: 263 | nrcols = l 264 | break 265 | 266 | canonical = filter(lambda r: len(r) == nrcols , row_features) 267 | 268 | #for c in canonical: print len(c), row_to_string(c) 269 | 270 | structure = [] 271 | for i in range(nrcols): 272 | col = {} 273 | col['start'] = float (sum (c[i]['start'] for c in canonical )) / len(canonical) 274 | 275 | types = Counter(c[i]['type'] for c in canonical) 276 | col['type'] = types.most_common(1)[0][0] 277 | subtypes = Counter(c[i]['subtype'] for c in canonical if c[i]['subtype'] != "none") 278 | subtype = "none" if len(subtypes) == 0 else subtypes.most_common(1)[0][0] 279 | col['subtype'] = subtype 280 | structure.append(col) 281 | 282 | #Test how far up the types are compatible and by that are data vs caption 283 | for r in row_features: 284 | #if r in canonical: 285 | if len(r) and row_type_compatible(structure, r): 286 | break 287 | else: 288 | meta_features.append(r) 289 | row_features.remove(r) 290 | 291 | meta_features.reverse() 292 | #for m in meta_features: print "META", row_to_string(m) 293 | 294 | captions = [''] * nrcols 295 | single_headers = [] 296 | #latest_caption_len = 1 297 | fill_up_captions_blocked = False 298 | caption_begin = False 299 | slots = [c['start'] for c in structure] 300 | first_captions = [] 301 | for mf in meta_features: 302 | #if we have at least one token as data and the closest tokens have not been exhausted yet for captions, consider them 303 | nr_meta_tokens = len(mf) 304 | if nr_meta_tokens > 0 and not fill_up_captions_blocked: 305 | #Find closest slot the caption could fit 306 | #TODO = allow doubling of captions if it is centered above more than one slot 307 | for c in mf: 308 | dist = (abs((float(c['start'])) - s) for s in slots) 309 | dists = sorted((val, idx) for (idx, val) in enumerate(dist)) 310 | val, idx = dists[0] 311 | #val2, idx2 = val, idx 312 | span_len = len(c['value']) 313 | if len(dists) > 1: 314 | val2, idx2 = dists[1] 315 | span_len = int(abs(val2-val)) 316 | #print span_len, row_to_string(mf) 317 | if val <= config['caption_assign_tolerance'] and (len(c['value']) - span_len) < config['caption_assign_tolerance']: 318 | captions[idx] = c['value'] + ' ' + captions[idx] 319 | if idx == 0: first_captions.append(c['value']) 320 | caption_begin = True 321 | else: 322 | single_headers.append(c['value']) 323 | fill_up_captions_blocked = True 324 | #latest_caption_len = nr_meta_tokens 325 | 326 | #In case of blank line, test if adding more captions should be locked 327 | else: 328 | if caption_begin: 329 | fill_up_captions_blocked = True 330 | #Accept both orphan tokens and freeflow text as headers 331 | #Todo: make separate data field for freeflow 332 | if nr_meta_tokens == 1 : single_headers.append(mf[0]['value']) 333 | 334 | #If all meta features were aggregated into the first column caption, consider the table to have no column labels 335 | if len(single_headers) == 0 and len(captions[0]) and sum(len(c) for c in captions[1:]) == 0: 336 | single_headers = first_captions 337 | captions[0] = u'' 338 | 339 | #Assign captions as the value in structure 340 | for i, c in enumerate(captions): 341 | structure[i]['value'] = c 342 | #Expand all the non canonical rows with NaN values (Todo: if types are very similar) 343 | normalized_data = [r for r in 
normalize_rows(row_features, structure)]
344 | 
345 | return structure, normalized_data, single_headers
346 | 
347 | 
348 | def convert_to_table(rows, b, e, above):
349 | table = {'begin_line' : b, 'end_line' : e, 'meta_begin_line' : b-above}
350 | 
351 | data_rows = rows[b:e]
352 | meta_rows = rows[b-above:b]
353 | 
354 | structure, data, headers = structure_rows(data_rows, meta_rows)
355 | 
356 | captions = [col['value'] for col in structure]
357 | 
358 | table['captions'] = captions
359 | table['data'] = data
360 | table['headers'] = headers
361 | table['types'] = [col['type'] if 'type' in col else "NaN" for col in structure]
362 | table['subtypes'] = [col['subtype'] if 'subtype' in col else "NaN" for col in structure]
363 | return table
364 | 
365 | def indexed_tables_from_rows(row_features):
366 | 
367 | #Uniquely identify tables by their first row
368 | tables = OrderedDict()
369 | last_end = 0
370 | for b,e in filter_row_spans(row_features, row_qualifies):
371 | #Slice out the next table and limit the context rows to have no overlaps
372 | #Todo: manage the lower meta lines
373 | max_lines_above = config['meta_info_lines_above']
374 | if not config["count_ws_header"]:
375 | meta_counter = 0
376 | above = 0
377 | while meta_counter < max_lines_above and (above <= b-last_end):
378 | mf = row_features[b-above]
379 | if len(mf): meta_counter += 1
380 | above += 1
381 | max_lines_above = above
382 | tables[b] = convert_to_table(row_features, b, e, min(max_lines_above, b-last_end))
383 | last_end = tables[b]['end_line']
384 | return tables
385 | 
386 | def return_tables(txt_path):
387 | 
388 | #Uniquely identify tables by their first row
389 | tables = OrderedDict()
390 | 
391 | with codecs.open(txt_path, "r", "utf-8") as f:
392 | lines = [l.replace(u'\n', '').replace(u'\r', '') for l in f]
393 | rows = [row_feature(l) for l in lines]
394 | 
395 | return indexed_tables_from_rows(rows)
396 | 
397 | def table_to_df(table):
398 | df = pd.DataFrame()
399 | for i in range(len(table['captions'])):
400 | values = []
401 | for r in table['data']:
402 | values.append(r[i]['value'])
403 | df[i] = values
404 | df.columns = table['captions']
405 | return df
406 | 
407 | 
19 | "import sys\n", 20 | "import glob\n", 21 | "import codecs\n", 22 | "import json\n", 23 | "\n", 24 | "import string\n", 25 | "\n", 26 | "sys.path.insert(0, os.path.pardir)\n", 27 | "\n", 28 | "from backend import *\n", 29 | "from data_query import *\n", 30 | "\n", 31 | "UPLOAD_FOLDER = os.path.join('..', 'static', 'ug')\n", 32 | "FILTER_FOLDER = os.path.join('..', 'static', 'filters')\n", 33 | "PROJECT = 'muni_bonds_bulk_2'\n", 34 | "FILTER = 'funds'\n", 35 | "\n", 36 | "path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')\n", 37 | "table_files = glob.glob(path)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "Processing with filter {u'headers': {u'threshold': 0.35, u'terms': [u'USES OF FUNDS']}, u'name': u'Estimated use and sources of funds'}\n", 52 | "Procssing with value dictionary {'underwriter_discount': 'Underwriter Discount', 'premium': 'Issue Premium', 'premium_or_discount': 'Premium Discount', 'discount': 'Issue Discount', 'face_value': ['Principal Amount', 'Par Amount', 'Face Amount'], 'cost_of_issuance': 'Costs of Issuance'}\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "def clean_string(s):\n", 58 | " lc = s.encode('ascii', errors='ignore').lower()#.translate(remove_punctuation_map)\n", 59 | " return lc.translate(None, string.punctuation + '0123456789').strip()\n", 60 | " \n", 61 | "from collections import Counter\n", 62 | "\n", 63 | "table_counter = Counter()\n", 64 | "tables_looked_at = 0\n", 65 | "confidences = []\n", 66 | "no_table_files = []\n", 67 | "no_ud_tables = []\n", 68 | "no_fv_tables = []\n", 69 | "funny_tables = {}\n", 70 | "\n", 71 | "salient_values = {}\n", 72 | "\n", 73 | "# Get those line items sufficient for IRR estimation\n", 74 | "# remark: improved query terms from TF analysis and annotation\n", 75 | "irr_estimate_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'], \n", 76 | " 'premium' : 'Issue Premium',\n", 77 | " 'discount': 'Issue Discount',\n", 78 | " 'premium_or_discount' : 'Premium Discount', #will match line items that signify either at high confidence on the token level\n", 79 | " 'underwriter_discount' : 'Underwriter Discount',\n", 80 | " 'cost_of_issuance' : 'Costs of Issuance'}\n", 81 | "\n", 82 | "filter_file = os.path.join(FILTER_FOLDER, FILTER+'.json')\n", 83 | "with codecs.open(filter_file, \"r\", \"utf-8\", errors=\"replace\") as file:\n", 84 | " _filter = json.load(file) \n", 85 | "\n", 86 | "print (\"Processing with filter %s\" % str(_filter))\n", 87 | "print (\"Procssing with value dictionary %s\" % str(irr_estimate_dict))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": false, 95 | "scrolled": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "#Get all tables\n", 100 | "for i,f in enumerate(table_files):\n", 101 | "\n", 102 | " with codecs.open(f, 'r', 'utf-8') as file:\n", 103 | " tables = json.load(file)\n", 104 | " tables_looked_at += len(tables)\n", 105 | " \n", 106 | " filename = f.split(r'/')[-1].replace('.tables.json', '')\n", 107 | " \n", 108 | " filter_results = []\n", 109 | " for t in filter_tables(tables.values(), _filter):\n", 110 | " if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results):\n", 111 | " filter_results.append(t)\n", 112 | " \n", 113 | " table_counter[len(filter_results)] += 1 \n", 114 | " if len(filter_results):\n", 
115 | "\n", 116 | " #Only keep first one\n", 117 | " confidence, table, _, _ = max( sorted( filter_results, key = lambda t: t[1]['begin_line'] ), \n", 118 | " key = lambda t: t[0])\n", 119 | " confidences.append(confidence)\n", 120 | " if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':\n", 121 | " funny_tables[filename] = table['begin_line']\n", 122 | " \n", 123 | " else:\n", 124 | " values = get_key_values(table, irr_estimate_dict, raw_cell=True)\n", 125 | " #invert line item if in brackets\n", 126 | " if values['premium_or_discount']:\n", 127 | " r = values['premium_or_discount'][1]\n", 128 | " if 'leftover' in r and '(' in r['leftover'][0] and ')' in r['leftover'][1]:\n", 129 | " values['premium_or_discount'][0] = values['premium_or_discount'][0]\n", 130 | " \n", 131 | " #strip raw rows\n", 132 | " values = {k : (v[0] if v else None) for k,v in values.iteritems()}\n", 133 | " key = filename+'#'+str(table['begin_line'])\n", 134 | " \n", 135 | " if not values['face_value']: \n", 136 | " no_fv_tables.append(key)\n", 137 | "\n", 138 | " if not values['underwriter_discount']: \n", 139 | " no_ud_tables.append(key)\n", 140 | " \n", 141 | " #maybe problem with ordering guarantee\n", 142 | " salient_values[key] = values.values()\n", 143 | "\n", 144 | " else:\n", 145 | " no_table_files.append(filename)\n", 146 | " \n", 147 | " if ( (i+1) % 100 ) == 0:\n", 148 | " print (\"%i files and %i tables processed... with %i best matches\" % \\\n", 149 | " (i+1, tables_looked_at, len(confidences)))\n", 150 | "\n", 151 | " \n", 152 | "results = {'high_confidence_candidates' : table_counter.most_common(),\n", 153 | " 'tables_looked_at' : tables_looked_at,\n", 154 | " 'tables_canonical' : len(confidences),\n", 155 | " 'confidence_mean' : sum(confidences) / len(confidences),\n", 156 | " 'confidences' : confidences, \n", 157 | " 'no_table_files' : no_table_files,\n", 158 | " 'no_ud_tables' : no_ud_tables,\n", 159 | " 'no_fv_tables' : no_fv_tables,\n", 160 | " 'funny_tables' : funny_tables,\n", 161 | " 'salient_values' : salient_values\n", 162 | " }" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "#Save intermediate results\n", 174 | "with codecs.open(\"IRR_estimate.results.json\", \"w\", \"utf-8\") as file:\n", 175 | " json.dump(results, file)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "#Work from intermediate results\n", 187 | "with codecs.open(\"IRR_estimate.results.json\", \"r\", \"utf-8\") as file:\n", 188 | " results = json.load(file)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "import xlwt\n", 200 | "\n", 201 | "bold = xlwt.Style.easyxf(\"font: bold on\")\n", 202 | "\n", 203 | "def write_table(sheet, keys, values, row, c_offset = 0, column_style = bold):\n", 204 | " for j, k in enumerate(keys):\n", 205 | " sheet.write(row, c_offset+j, k, column_style)\n", 206 | " row += 1\n", 207 | " for v in values:\n", 208 | " for j, vv in enumerate(v):\n", 209 | " sheet.write(row, c_offset+j, vv)\n", 210 | " row +=1\n", 211 | " return row" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 10, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | 
"source": [ 222 | "url_prefix = \"http://tabularazr.eastus.cloudapp.azure.com:7081/show/\"+PROJECT+'/'" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 11, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "def to_xls_url(url, link = None):\n", 234 | " f = 'HYPERLINK(\"'+url+'\"' + ('; \"'+link+'\")' if link else ')')\n", 235 | " return xlwt.Formula(f)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "wkb = xlwt.Workbook(encoding='utf-8')\n", 247 | "s_summary, s_funding_values, s_confidence, s_no_table, s_no_fv_tables, s_no_ud_tables, s_funny_tables = \\\n", 248 | " (wkb.add_sheet(s) for s in ['summary', 'funding_values', 'confidence', 'no_table', \n", 249 | " 'no_face_value_tables', 'no_underwriter_discount_tables', 'funny_tables'])" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 13, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "i = 0\n", 261 | "s_summary.write(i,0, 'Filter used', bold)\n", 262 | "s_summary.write(i,1, str(_filter))\n", 263 | "i+=1\n", 264 | "s_summary.write(i,0, 'Value extraction dictionary used', bold)\n", 265 | "s_summary.write(i,1, str(irr_estimate_dict))\n", 266 | "i+=2\n", 267 | "s_summary.write(i,0, 'Distribution of good table matches per document', bold)\n", 268 | "i+=1\n", 269 | "i = write_table(s_summary, ['Nr. of Table Candidates', 'Nr. of Documents'], \n", 270 | " results[\"high_confidence_candidates\"], i)\n", 271 | "\n", 272 | "i+=1\n", 273 | "s_summary.write(i, 2, 'Total nr. of Table Candidates')\n", 274 | "s_summary.write(i, 3, 'out of..')\n", 275 | "i+=1\n", 276 | "s_summary.write(i, 2, results['tables_canonical'])\n", 277 | "s_summary.write(i, 3, results['tables_looked_at'])\n", 278 | "\n", 279 | "i = write_table(s_confidence, ['Confidence in best Table found'], ([c] for c in results['confidences']), 0)\n", 280 | "i = write_table(s_no_table, ['Files with no suitable table found', 'URL'], \n", 281 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_table_files'] ), 0)\n", 282 | "i = write_table(s_no_ud_tables, ['Tables with no Underwriter Discount found', 'URL'], \n", 283 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_ud_tables'] ), 0)\n", 284 | "i = write_table(s_no_fv_tables, ['Tables with no Face Value found', 'URL'], \n", 285 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_fv_tables'] ), 0)\n", 286 | "\n", 287 | "\n", 288 | "s_funny_tables.write(0,4, \"[as returned by filter but with <> 2 rows, and/or no $ value in the 2nd column]\")\n", 289 | "i = write_table(s_funny_tables, ['Funny Tables in File', 'Table ID', 'URL'], \n", 290 | " ( ( f, t, to_xls_url(url_prefix+f+'#'+str(t)) ) for f, t in results['funny_tables'].iteritems() ), 0)\n", 291 | "\n", 292 | "header_funding_values = ['Filename/Table', 'URL'] + irr_estimate_dict.keys()\n", 293 | "i = write_table(s_funding_values, header_funding_values, \n", 294 | " (( [k, to_xls_url(url_prefix+k)] + v) for k, v in results['salient_values'].iteritems()), 0)\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 14, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "wkb.save('IRR_estimate.results.xls')" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 2", 312 | "language": 
"python", 313 | "name": "python2" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 2 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython2", 325 | "version": "2.7.10" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 0 330 | } 331 | -------------------------------------------------------------------------------- /bulk_processing/bulk_proc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #FILES = regex for folder 4 | #PROJECT = project for POST request 5 | 6 | for f in $FILES 7 | do 8 | echo "Processing $f ..." 9 | curl -X POST -H "Content-Type: application/json" http://localhost:7081/analyze/"$PROJECT"/"$f" > /dev/null 10 | echo "Done" 11 | 12 | done 13 | echo "FINISHED" 14 | -------------------------------------------------------------------------------- /data_query.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import dateutil.parser as date_parser 3 | 4 | from backend import config 5 | from fuzzywuzzy import fuzz 6 | 7 | from itertools import product 8 | 9 | # Cascades: 10 | # 1) case sensitive partial ratio on character level with penalty 11 | # 2) case insensitive partial ratio on character level with penalty 12 | # 3) token sorted case insensitive ratio with penalty 13 | FUZZY_INV_CASCADES = 1.0 / 3.0 14 | def fuzzy_str_match(query, string): 15 | 16 | score = 1.0 17 | inv_cascades = FUZZY_INV_CASCADES 18 | min_fuzzy_ratio = config["min_fuzzy_ratio"] 19 | 20 | query = query.encode('ascii', errors='ignore') 21 | string = string.encode('ascii', errors='ignore') 22 | 23 | #Penalize shorter target strings and early exit on null length strings 24 | len_query = len(query) 25 | len_string = len(string.strip()) 26 | if not len_string: return None 27 | if not len_query: return score 28 | penalty = min(len_string / float(len_query), 1.0) 29 | 30 | fuzzy_partial = (fuzz.partial_ratio(query, string)/100.0) * penalty 31 | #print ("fuzzy_partial of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 32 | if fuzzy_partial > min_fuzzy_ratio: 33 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 34 | return f_score 35 | score -= inv_cascades 36 | 37 | q_l = query.lower() 38 | s_l = string.lower() 39 | 40 | fuzzy_partial = (fuzz.partial_ratio(q_l, s_l)/100.0) * penalty 41 | #print ("fuzzy_partial lower_case of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 42 | 43 | if fuzzy_partial > min_fuzzy_ratio: 44 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 45 | return f_score 46 | score -= inv_cascades 47 | 48 | fuzzy_partial = (fuzz.partial_token_sort_ratio(q_l, s_l)/100.0) * penalty 49 | #print ("fuzzy_partial token_sort_lower_case of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 50 | if fuzzy_partial > min_fuzzy_ratio: 51 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 52 | return f_score 53 | 54 | return None 55 | 56 | #Flatmap from tables to sequence of tuples (confidence, table, row or None, value or None) 57 | def filter_tables(tables, filter_dict, treshold = 0.0, only_max = False): 58 | row = None 59 | value = None 60 | 61 | for t in tables: 62 | 63 | if 'headers' in 
filter_dict: 64 | 65 | max_conf, index, best_term = None, None, None 66 | terms = filter_dict['headers']['terms'] 67 | _threshold = max(treshold, filter_dict['headers']['threshold']) 68 | for term in terms: 69 | if t['headers']: 70 | current_max_conf = (max_conf if only_max else _threshold) or _threshold 71 | scores_indices = ((val, idx) for (idx, val) in enumerate(fuzzy_str_match(term, h) for h in t['headers'] ) ) 72 | 73 | conf, idx = max(scores_indices) 74 | 75 | if conf > max_conf: 76 | max_conf = conf 77 | index = idx 78 | best_term = term 79 | best_header = "" 80 | 81 | #Todo: other filter criteria like column names, rows etc. and combinatorial confidence score 82 | if max_conf: 83 | yield max_conf, t, row, value 84 | 85 | 86 | def get_fuzzy_date(string): 87 | today = date.today() 88 | v_ascii = string.encode("ascii", errors="ignore") 89 | try: 90 | dt = date_parser.parse(v_ascii, fuzzy=True, default=today) 91 | if dt != today: 92 | return dt 93 | except: 94 | return None 95 | 96 | def get_first_date(lines, query_string, threshold = 0.4): 97 | for i, l in enumerate(lines): 98 | if fuzzy_str_match(query_string, l) > threshold: 99 | dt = get_fuzzy_date(l) 100 | if dt: 101 | return dt, i, l 102 | 103 | def find_row(table, query_string, threshold = 0.4): 104 | #Find first 'other' typed row 105 | try: 106 | index = table['types'].index('other') 107 | except ValueError: 108 | print "no column consisting of mainly string data found" 109 | return None 110 | 111 | strings = (s[index]['value'] for s in table['data']) 112 | 113 | #query_string can either be a single one or an iterable 114 | if isinstance(query_string, basestring): 115 | query_string = [query_string] 116 | 117 | scores_indices = ((val, idx) for (idx, val) in ( (s[0], fuzzy_str_match(qs, s[1])) \ 118 | for qs, s in product(query_string, enumerate(strings))) ) 119 | val, idx = max(scores_indices) 120 | if val >= threshold: 121 | return table['data'][idx] 122 | else: 123 | return None 124 | 125 | 126 | def closest_row_numeric_value(table, query_string, threshold = 0.4, raw_cell = False): 127 | row = find_row(table, query_string, threshold) 128 | if row: 129 | for c in row: 130 | if 'type' in c: 131 | if c['type'] in ('integer'): 132 | v = int(c['value']) 133 | return (v, c) if raw_cell else v 134 | elif c['type'] in ('large_num', 'small_float'): 135 | v = float(c['value'].replace(",", "")) 136 | return (v, c) if raw_cell else v 137 | 138 | def get_key_values(table, key_queries, threshold = 0.4, raw_cell = False): 139 | return { k : closest_row_numeric_value(table, kk, threshold, raw_cell) for k, kk in key_queries.iteritems() } 140 | 141 | 142 | def find_column(table, query_string, types=None, subtypes=None, threshold = 0.4): 143 | #Find first column with specific types 144 | columns = [] 145 | for i, t in enumerate(zip(table['types'], table['subtypes'])): 146 | t, st = t[0], t[1] 147 | if t in (types or t) and st in (subtypes or st): 148 | if fuzzy_str_match(query_string, table['captions'][i]) > threshold: return i 149 | 150 | def filter_time_series(table, query_string, subtypes = ['dollar'], threshold = 0.4): 151 | time_index = find_column(table, "", subtypes=['date', 'year'], threshold=threshold) 152 | value_index = find_column(table, query_string, subtypes=subtypes, threshold=threshold) 153 | 154 | for r in table['data']: 155 | dt = get_fuzzy_date(r[time_index]['value']) 156 | if dt: 157 | c = r[value_index] 158 | v = None 159 | if c['type'] in ('integer'): 160 | v = int(c['value']) 161 | elif c['type'] in ('large_num', 
'small_float'): 162 | v = float(c['value'].replace(",", "")) 163 | if v: yield dt, v 164 | -------------------------------------------------------------------------------- /deprecated/GetMuniBondData.cfg: -------------------------------------------------------------------------------- 1 | [FileLocations] 2 | OutputFileName = D:\munidocs\VA\BondData.csv 3 | OutputColumnSeparator = , 4 | InputPath = D:\munidocs\VA\?????????-IRIS-*.pdf -------------------------------------------------------------------------------- /deprecated/GetMuniBondData.py: -------------------------------------------------------------------------------- 1 | 2 | import ConfigParser 3 | import gc 4 | import glob 5 | import io 6 | import os 7 | import cStringIO 8 | import re 9 | import subprocess 10 | import string 11 | import sys 12 | import numpy as np 13 | import pandas as pd 14 | 15 | def GetConfigParm(section): 16 | dict1 = {} 17 | options = Config.options(section) 18 | for option in options: 19 | try: 20 | dict1[option] = Config.get(section, option) 21 | if dict1[option] == -1: 22 | DebugPrint("skip: %s" % option) 23 | except: 24 | print("exception on %s!" % option) 25 | dict1[option] = None 26 | return dict1 27 | 28 | # Main Process 29 | # Read Configuration Parameters 30 | config = ConfigParser.RawConfigParser() 31 | config.read('GetMuniBondData.cfg') 32 | OutputFileName = config.get("FileLocations","OutputFileName") 33 | OutputColumnSeparator = config.get("FileLocations","OutputColumnSeparator") 34 | InputPath = config.get("FileLocations","InputPath") 35 | 36 | # Initialize Data Frame 37 | df = pd.DataFrame(np.zeros(0 , dtype=[('file', 'a99'),('caption', 'a99'),('value', 'a99')])) 38 | 39 | for file in glob.glob(InputPath): 40 | 41 | printline = 0 42 | linesleft = 0 43 | blanklines = 0 44 | 45 | intxtfilename = file + ".txt" 46 | 47 | out, err = subprocess.Popen(["pdftotext", "-layout", file, file + ".txt" ]).communicate() 48 | 49 | try: 50 | intxtfile = io.open(intxtfilename, mode='rb') 51 | except: 52 | print "Unable to extract text from " + file 53 | continue 54 | 55 | lines = intxtfile.readlines() 56 | 57 | topfound = 0 58 | headerline = 0 59 | 60 | for line in lines: 61 | 62 | strippedline = line.upper().strip() 63 | 64 | if topfound == 0 and string.find(line," $") > 0: 65 | headerline = 1 66 | topfound = 1 67 | 68 | if 1 <= headerline <= 3: 69 | caption = "HEADER " + str(headerline) 70 | value = strippedline 71 | df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True) 72 | headerline = headerline + 1 73 | continue 74 | 75 | if strippedline == "SOURCES AND USES OF FUNDS" \ 76 | or strippedline == "SOURCES AND USES OF FUNDS*" \ 77 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS" \ 78 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS*" \ 79 | or strippedline == "SOURCES AND USES OF FUNDS(1)" \ 80 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS(1)" \ 81 | or strippedline == "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS": 82 | printline = 1 83 | linesleft = 25 84 | 85 | if printline == 1: 86 | dollar_amount_regex = re.compile("[\$]{0,1}[\s]{0,6}[0-9,]{0,15}(\.[0-9]{1,2})$") 87 | dollar_amount_match = re.search(dollar_amount_regex,strippedline) 88 | if dollar_amount_match: 89 | caption = strippedline[:dollar_amount_match.start(0)].strip() 90 | value = strippedline[dollar_amount_match.start(0):].strip() 91 | df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True) 92 | if len(line.strip()) < 5 and linesleft < 10: 93 | 
blanklines = blanklines + 1 94 | linesleft = linesleft - 1 95 | 96 | if linesleft == 0: 97 | printline = 0 98 | 99 | del lines 100 | gc.collect() 101 | 102 | df.to_csv(OutputFileName,OutputColumnSeparator,index=False) 103 | -------------------------------------------------------------------------------- /deprecated/GetMuniBondsTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "skipping pdf/ER862677-ER674128-ER1075876.pdf, already exists.\n", 15 | "skipping pdf/EP849915-EP657701-EP1059361.pdf, already exists.\n", 16 | "skipping pdf/ER866175-ER676833-ER1078611.pdf, already exists.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "from __future__ import print_function\n", 22 | "import re\n", 23 | "import os\n", 24 | "import codecs\n", 25 | "import string\n", 26 | "\n", 27 | "def create_path(path):\n", 28 | " try: \n", 29 | " os.makedirs(path)\n", 30 | " except OSError:\n", 31 | " if not os.path.isdir(path):\n", 32 | " raise \n", 33 | "\n", 34 | "#Convert all pdfs\n", 35 | "files = os.listdir('pdf')\n", 36 | "create_path('txt')\n", 37 | "for i,f in enumerate(files):\n", 38 | "\n", 39 | " pdf_path = os.path.join('pdf', f)\n", 40 | " txt_path = os.path.join('txt', f+'.txt')\n", 41 | " \n", 42 | " if not os.path.isfile(txt_path):\n", 43 | " #Layout preservation crucial to maintain clues about tabular data\n", 44 | " cmd = \"pdftotext -layout %s %s\" % (pdf_path, txt_path)\n", 45 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 46 | " os.system(cmd)\n", 47 | " else:\n", 48 | " print ('skipping %s, already exists.' % (pdf_path, ))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 147, 54 | "metadata": { 55 | "collapsed": false, 56 | "scrolled": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 64 | "HEADER 1 $95,885,000\n", 65 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 66 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 67 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 68 | "HEADER 1 $6,645,000\n", 69 | "HEADER 2 CITY OF PALM SPRINGS\n", 70 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 71 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 72 | "HEADER 1 $19,560,000\n", 73 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 74 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 
92-1\n", 75 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 76 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 77 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 78 | "TOTAL SOURCES $22,302,178.29\n", 79 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 80 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 81 | "TOTAL USES $22,302,178.29\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "#Existing Version\n", 87 | "for file in os.listdir('txt'):\n", 88 | " \n", 89 | " print (\"--------\" + file + \"--------\")\n", 90 | " \n", 91 | " printline = 0\n", 92 | " linesleft = 0\n", 93 | " blanklines = 0\n", 94 | " \n", 95 | " topfound = 0\n", 96 | " headerline = 0 \n", 97 | " \n", 98 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 99 | " for i, line in enumerate(f):\n", 100 | "\n", 101 | " strippedline = line.upper().strip()\n", 102 | "\n", 103 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 104 | " headerline = 1\n", 105 | " topfound = 1\n", 106 | "\n", 107 | " if 1 <= headerline <= 3:\n", 108 | " caption = \"HEADER \" + str(headerline)\n", 109 | " value = strippedline\n", 110 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 111 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 112 | " headerline = headerline + 1\n", 113 | " continue\n", 114 | "\n", 115 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 116 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 117 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 118 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 119 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 120 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 121 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\":\n", 122 | " printline = 1\n", 123 | " linesleft = 25\n", 124 | "\n", 125 | " if printline == 1:\n", 126 | " dollar_amount_regex = re.compile(\"[\\$]{0,1}[\\s]{0,6}[0-9,]{0,15}(\\.[0-9]{1,2})$\")\n", 127 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 128 | " if dollar_amount_match:\n", 129 | " caption = strippedline[:dollar_amount_match.start(0)].strip()\n", 130 | " value = strippedline[dollar_amount_match.start(0):].strip()\n", 131 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 132 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 133 | " if len(line.strip()) < 5 and linesleft < 10:\n", 134 | " blanklines = blanklines + 1\n", 135 | " linesleft = linesleft - 1\n", 136 | "\n", 137 | " if linesleft == 0:\n", 138 | " printline = 0" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "#Issues:\n", 150 | "## Doesn't pick up caption in EP1059361 --> add USES OF FUNDS but then no SOURCES OF PAYMENTS\n", 151 | "## Doesn't pick up line items in ER1075876 --> match sequences of .... 
to indicate tables as well, plus be more lenient with cents values\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 154, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 166 | "HEADER 1 $95,885,000\n", 167 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 168 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 169 | "PRINCIPAL AMOUNT $ 95,885,000\n", 170 | "BOND PREMIUM 12,984,339\n", 171 | "OTHER AVAILABLE FUNDS(1) 6,600,643 \n", 172 | "TOTAL SOURCES $115,469,982\n", 173 | "DEPOSIT TO ACQUISITION FUND $ 41,000,000\n", 174 | "RETIREMENT OF WATER REVENUE ANTICIPATION NOTES(2) 14,000,000\n", 175 | "DEPOSIT TO ESCROW FUND FOR REFUNDED 2008 BONDS 52,742,691\n", 176 | "DISCHARGE OF STATE LOAN 7,096,550 \n", 177 | "COSTS OF ISSUANCE(3) 630,741 \n", 178 | "TOTAL USES $115,469,982\n", 179 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 180 | "HEADER 1 $6,645,000\n", 181 | "HEADER 2 CITY OF PALM SPRINGS\n", 182 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 183 | "TRANSFER TO ESCROW BANK $6,086,693.08\n", 184 | "RESERVE FUND (1) 274,331.25\n", 185 | "COSTS OF ISSUANCE FUND (2) 152,404.72\n", 186 | "TOTAL USES $6,513,429.05\n", 187 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 188 | "HEADER 1 $19,560,000\n", 189 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 190 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 92-1\n", 191 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 192 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 193 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 194 | "TOTAL SOURCES $22,302,178.29\n", 195 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 196 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 197 | "TOTAL USES $22,302,178.29\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "#New Version\n", 203 | "for file in os.listdir('txt'):\n", 204 | " \n", 205 | " print (\"--------\" + file + \"--------\")\n", 206 | " \n", 207 | " printline = 0\n", 208 | " linesleft = 0\n", 209 | " blanklines = 0\n", 210 | " \n", 211 | " topfound = 0\n", 212 | " headerline = 0 \n", 213 | " \n", 214 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 215 | " for i, line in enumerate(f):\n", 216 | "\n", 217 | " \n", 218 | " strippedline = line.upper().strip()\n", 219 | "\n", 220 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 221 | " headerline = 1\n", 222 | " topfound = 1\n", 223 | "\n", 224 | " if 1 <= headerline <= 3:\n", 225 | " caption = \"HEADER \" + str(headerline)\n", 226 | " value = strippedline\n", 227 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 228 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 229 | " headerline = headerline + 1\n", 230 | " continue\n", 231 | "\n", 232 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 233 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 234 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 235 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 236 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 237 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 238 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 239 | " or strippedline == \"ESTIMATED USES OF FUNDS\": 
#New\n", 240 | " printline = 1\n", 241 | " linesleft = 25\n", 242 | " #print (\"#### line:\", i, \"to\", i+linesleft)\n", 243 | "\n", 244 | " if printline == 1:\n", 245 | " #Include a minimum of preceding dots or whitespace\n", 246 | " #Group 1 = preceding whitespace\n", 247 | " #Group 2 = Dollar value\n", 248 | " #Group 3 = $Cents value if existing\n", 249 | " dollar_amount_regex = ur\"([\\.]{4,}|[\\s]{4,})[\\s]*\" + \\\n", 250 | " ur\"([\\$]{0,1}[\\s]{0,6}[0-9,]{2,15})(\\.[0-9]{1,2})?$\"\n", 251 | " dollar_amount_regex = re.compile(dollar_amount_regex)\n", 252 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 253 | " \n", 254 | " #Check whether we found something tabular and a dollar value\n", 255 | " if dollar_amount_match and dollar_amount_match.group(2):\n", 256 | " caption = strippedline[:dollar_amount_match.start(1)].strip()\n", 257 | " value = strippedline[dollar_amount_match.start(2):].strip()\n", 258 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 259 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 260 | " if len(line.strip()) < 5 and linesleft < 10:\n", 261 | " blanklines = blanklines + 1\n", 262 | " linesleft = linesleft - 1\n", 263 | "\n", 264 | " if linesleft == 0:\n", 265 | " printline = 0" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 150, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 280 | "27 - issuance of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 281 | "\n", 282 | "229 - ESTIMATED SOURCES AND USES OF FUNDS ................................................................................... 7\n", 283 | "\n", 284 | "370 - of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 285 | "\n", 286 | "653 - ESTIMATED SOURCES AND USES OF FUNDS\n", 287 | "\n", 288 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 289 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 290 | "223 - ESTIMATED SOURCES AND USES OF FUNDS .................................................................................. 13 \n", 291 | "\n", 292 | "429 - Bonds. 
See “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 293 | "\n", 294 | "715 - “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 295 | "\n", 296 | "983 - ESTIMATED SOURCES AND USES OF FUNDS\n", 297 | "\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "#Some exploration\n", 303 | "max_distance_below = 25\n", 304 | "max_distance_above = 5\n", 305 | "context_identifier = u\"SOURCES AND USES OF FUNDS|SOURCES AND USES OF FUNDS*|ESTIMATED SOURCES AND USES OF FUNDS|\" + \\\n", 306 | " \"ESTIMATED SOURCES AND USES OF FUNDS*|SOURCES AND USES OF FUNDS(1)|\" + \\\n", 307 | " \"ESTIMATED SOURCES AND USES OF FUNDS(1)|PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\"\n", 308 | "context_identifier = context_identifier.split(u\"|\")\n", 309 | "\n", 310 | "for file in os.listdir('txt'):\n", 311 | " \n", 312 | " print (\"--------\" + file + \"--------\")\n", 313 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 314 | " for i, line in enumerate(f):\n", 315 | " \n", 316 | " #Print Candidates\n", 317 | " id_found = reduce(lambda x,y: x or y, ( (id in line) for id in context_identifier ))\n", 318 | " if id_found:\n", 319 | " print(i, '-', line)\n", 320 | " " 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 2", 336 | "language": "python", 337 | "name": "python2" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 2 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython2", 349 | "version": "2.7.10" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 0 354 | } 355 | -------------------------------------------------------------------------------- /deprecated/Procfile: -------------------------------------------------------------------------------- 1 | web: python server.py 2 | -------------------------------------------------------------------------------- /deprecated/TableParser.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | #TABLE Parser 7 | #Infers a table with arbitrary number of columns from recurring patterns in text lines 8 | 9 | #Main assumptions for table identification: 10 | #1) each row is either in one line or not a row at all [DONE] 11 | #2) each column features at least one number (=dollar amount) [MISSING] 12 | #2a) each column features at least one date-like string 13 | #3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] 14 | #4) each column is separated by more than 2 consecutive whitespace indicators (e.g. ' ' or '..') 15 | 16 | #Feature List: 17 | #1) Acknowledge Footnotes / make lower meta-data available 18 | #2) make delimiter length smartly dependent on number of columns (iteration) 19 | #3) expand non-canonical values in tables [DONE] .. but only to the extent that type matches 20 | #4) UI: parameterize extraction on the show page on the fly 21 | #5) more type inference (e.g. 
date) 22 | 23 | 24 | # In[128]: 25 | 26 | import re 27 | import os 28 | import codecs 29 | import string 30 | from collections import OrderedDict 31 | 32 | config = { "min_delimiter_length" : 3, "min_columns": 2, "min_consecutive_rows" : 3, "max_grace_rows" : 2, 33 | "caption_reorder_tolerance" : 10.0, "meta_info_lines_above" : 8, "aggregate_captions_missing" : 0.5} 34 | 35 | 36 | # In[129]: 37 | 38 | import json 39 | import sys 40 | 41 | from flask import Flask, request, redirect, url_for, send_from_directory 42 | from werkzeug import secure_filename 43 | 44 | from flask import jsonify, render_template, make_response 45 | import numpy as np 46 | import pandas as pd 47 | 48 | from pyxley import UILayout 49 | from pyxley.filters import SelectButton 50 | from pyxley.charts.mg import LineChart, Figure, ScatterPlot, Histogram 51 | from pyxley.charts.datatables import DataTable 52 | 53 | 54 | # In[3]: 55 | 56 | #Regex tester online: https://regex101.com 57 | #Contrast with Basic table parsing capabilities of http://docs.astropy.org/en/latest/io/ascii/index.html 58 | 59 | tokenize_pattern = "[.]{%i,}|[\ \$]{%i,}|" % ((config['min_delimiter_length'],)*2) 60 | tokenize_pattern = "[.\ \$]{%i,}" % (config['min_delimiter_length'],) 61 | 62 | column_pattern = OrderedDict() 63 | #column_pattern['large_num'] = ur"\d{1,3}(,\d{3})*(\.\d+)?" 64 | column_pattern['large_num'] = ur"(([0-9]{1,3})(,\d{3})+(\.[0-9]{2})?)" 65 | column_pattern['small_float'] = ur"[0-9]+\.[0-9]+" 66 | column_pattern['integer'] = ur"^\s*[0-9]+\s*$" 67 | column_pattern['other'] = ur"([a-zA-Z0-9]{2,}\w)" 68 | column_pattern['other'] = ur".+" 69 | 70 | subtype_indicator = OrderedDict() 71 | subtype_indicator['dollar'] = r".*\$.*" 72 | subtype_indicator['rate'] = r"[%]" 73 | subtype_indicator['year'] = "(20[0-9]{2})|(19[0-9]{2})" 74 | 75 | 76 | # In[4]: 77 | 78 | #import dateutil.parser as date_parser 79 | #(type, subtype, value, leftover) 80 | def tag_token(token, ws): 81 | for t, p in column_pattern.iteritems(): 82 | result = re.search(p, token) 83 | if result: 84 | leftover = token[:result.start()] + token[result.end():] 85 | value = token[result.start():result.end()] 86 | 87 | #First match on left-overs 88 | subtype = "none" 89 | for sub, indicator in subtype_indicator.iteritems(): 90 | if re.match(indicator, leftover): subtype = sub 91 | #Only if no indicator matched there, try on full token 92 | if subtype == "none": 93 | for sub, indicator in subtype_indicator.iteritems(): 94 | if re.match(indicator, token): subtype = sub 95 | #Only if no indicator matched again, try on whitespace 96 | if subtype == "none": 97 | for sub, indicator in subtype_indicator.iteritems(): 98 | if re.match(indicator, ws): subtype = sub 99 | #print token, ":", ws, ":", subtype 100 | 101 | return t, subtype, value, leftover 102 | return "unknown", "none", token, "" 103 | 104 | def row_feature(line): 105 | features = [] 106 | matches = re.finditer(tokenize_pattern, line) 107 | start_end = [ (match.start(), match.end()) for match in matches] 108 | if len(start_end) < 1: 109 | return features 110 | 111 | tokens = re.split(tokenize_pattern, line) 112 | if tokens[0] == "": 113 | tokens = tokens[1:] 114 | else: 115 | start_end = [(0,0)] + start_end 116 | 117 | for se, token in zip(start_end, tokens): 118 | t, subtype, value, _ = tag_token(token, line[se[0]:se[1]]) 119 | feature = {"start" : se[1], "value" : value, "type" : t, "subtype" : subtype} 120 | features.append(feature) 121 | return features 122 | 123 | #date_parser.parse("asdf") 124 | 125 | 126 | # In[5]: 
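#Editor's sketch (not part of the original script): what row_feature() above yields for one
#table-like line, assuming the config, tokenize_pattern and tag_token definitions directly
#above; the sample line and expected tags are illustrative only. 'Total Sources' should come
#out as type 'other', both numbers as 'large_num', and the first number should get the
#'dollar' subtype because the '$' is swallowed by the delimiter run in front of it.
example_line = "Total Sources ................    $ 22,302,178.29      12,345.67"
for cell in row_feature(example_line):
    print cell['start'], cell['type'], cell['subtype'], cell['value']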
127 | 128 | #Establish whether amount of rows is above a certain threshold and whether there is at least one number 129 | def row_qualifies(row): 130 | return len(row) >= config['min_columns'] and sum( 1 if c['type'] in ['large_num', 'small_float', 'integer'] else 0 for c in row) > 0 131 | 132 | def row_equal_types(row1, row2): 133 | max_len = max(len(row1), len(row2) ) 134 | same_types = sum (map(lambda t: 1 if t[0]==t[1] else 0, ((c1['type'], c2['type']) for c1, c2 in zip(row1, row2)))) 135 | return same_types == max_len 136 | 137 | 138 | # In[6]: 139 | 140 | def filter_row_spans(row_features, row_qualifies): 141 | 142 | min_consecutive = config["min_consecutive_rows"] 143 | grace_rows = config['max_grace_rows'] 144 | 145 | last_qualified = None 146 | consecutive = 0 147 | underqualified = 0 148 | i = 0 149 | 150 | for row in row_features: 151 | if row_qualifies(row): 152 | underqualified = 0 153 | if last_qualified is None: 154 | last_qualified = i 155 | consecutive = 1 156 | else: 157 | consecutive += 1 158 | 159 | else: 160 | underqualified += 1 161 | if underqualified > grace_rows: 162 | if consecutive >= min_consecutive: 163 | yield last_qualified, i-underqualified+1 164 | last_qualified = None 165 | consecutive = 0 166 | else: 167 | last_qualified = None 168 | consecutive = 0 169 | underqualified = 0 170 | #print i, underqualified, last_qualified, consecutive#, "" or row 171 | i += 1 172 | 173 | if consecutive >= min_consecutive: 174 | yield last_qualified, i-underqualified 175 | 176 | 177 | # In[126]: 178 | 179 | from collections import Counter 180 | 181 | def readjust_cols(feature_row, slots): 182 | feature_new = [{'value' : 'NaN'}] * len(slots) 183 | for v in feature_row: 184 | dist = [ abs((float(v['start'])) - s) for s in slots ] 185 | val , idx = min((val, idx) for (idx, val) in enumerate(dist)) 186 | if val <= config['caption_reorder_tolerance']: feature_new[idx] = v 187 | return feature_new 188 | 189 | def normalize_rows(rows_in, structure): 190 | 191 | slots = [c['start'] for c in structure] 192 | nrcols = len(structure) 193 | 194 | for r in rows_in: 195 | if len(r) != nrcols: 196 | if len(r)/float(nrcols) > config['aggregate_captions_missing']: 197 | yield readjust_cols(r, slots) 198 | else: 199 | yield r 200 | 201 | #TODO: make side-effect free 202 | def structure_rows(row_features, meta_features): 203 | #Determine maximum nr. 
of columns 204 | lengths = [len(r) for r in row_features] 205 | nrcols = max(lengths) 206 | canonical = filter(lambda r: len(r) == nrcols , row_features) 207 | 208 | #print canonical 209 | 210 | structure = [] 211 | values = [] 212 | for i in range(nrcols): 213 | col = {} 214 | col['start'] = float (sum (c[i]['start'] for c in canonical )) / len(canonical) 215 | 216 | types = Counter(c[i]['type'] for c in canonical) 217 | col['type'] = types.most_common(1)[0][0] 218 | subtypes = Counter(c[i]['subtype'] for c in canonical if c[i]['subtype'] is not "none") 219 | subtype = "none" if len(subtypes) == 0 else subtypes.most_common(1)[0][0] 220 | col['subtype'] = subtype 221 | structure.append(col) 222 | 223 | #Add the first non canonical rows to the meta_features above data 224 | for r in row_features: 225 | if r in canonical: 226 | break 227 | else: 228 | meta_features.append(r) 229 | row_features.remove(r) 230 | 231 | #Try to find caption from first rows above the data, skip one empty row if necessary 232 | #Todo: make two steps process cleaner and more general 233 | if len(meta_features[-1]) == 0: meta_features = meta_features[:-1] 234 | caption = meta_features[-1] if len(meta_features[-1])/float(nrcols) > config['aggregate_captions_missing'] else None 235 | if caption: 236 | slots = [c['start'] for c in structure] 237 | meta_features = meta_features[:-1] 238 | if len(caption) != nrcols: caption = readjust_cols(caption, slots) 239 | if len(meta_features[-1])/float(nrcols) > config['aggregate_captions_missing']: 240 | caption2 = readjust_cols(meta_features[-1], slots) 241 | for c,c2 in zip(caption, caption2): 242 | if c2['value'] != 'NaN': 243 | c['value'] = c2['value'] + ' ' + c['value'] 244 | meta_features = meta_features[:-1] 245 | 246 | #Assign captions as the value in structure 247 | for i, c in enumerate(caption): 248 | structure[i]['value'] = c['value'] 249 | 250 | headers = [] 251 | for h in meta_features: 252 | if len(h) == 1: 253 | headers.append(h[0]['value']) 254 | 255 | #Expand all the non canonical rows with NaN values (Todo: if type matches) 256 | normalized_data = [r for r in normalize_rows(row_features, structure)] 257 | 258 | return structure, normalized_data, headers 259 | 260 | 261 | # In[115]: 262 | 263 | def output_table_html(txt_path): 264 | out = [] 265 | out.append("--------" + txt_path + "--------") 266 | 267 | with codecs.open(txt_path, "r", "utf-8") as f: 268 | 269 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 270 | rows = [row_feature(l) for l in lines] 271 | 272 | for b,e in filter_row_spans(rows, row_qualifies): 273 | out.append("TABLE STARTING FROM LINE %i to %i" % (b,e)) 274 | table = rows[b:e] 275 | structure, data, headers = structure_rows(table, rows[b-config['meta_info_lines_above']:b]) 276 | 277 | for h in headers: out.append(h) 278 | if caption: 279 | out.append("\t".join(caption)) 280 | else: 281 | out.append('NO COLUMN NAMES DETECTED') 282 | 283 | for f in rows[b:e]: 284 | cols = "\t|\t".join([col['value']+" (%s, %s)" % (col['type'], col['subtype']) for col in f]) 285 | out.append("%i %s" % (len(f), cols) ) 286 | return out 287 | 288 | def return_tables(txt_path): 289 | 290 | #Uniquely identify tables by their first row 291 | tables = OrderedDict() 292 | 293 | with codecs.open(txt_path, "r", "utf-8") as f: 294 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 295 | rows = [row_feature(l) for l in lines] 296 | 297 | for b,e in filter_row_spans(rows, row_qualifies): 298 | table = {'begin_line' : b, 'end_line' : e} 
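        #Each qualifying span (b, e) yielded by filter_row_spans() is half-open:
        #rows[b:e] hold the data rows, while the lines just above feed caption and
        #header detection in structure_rows() below.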
 299 | 300 | data_rows = rows[b:e] 301 | meta_rows = rows[b-config['meta_info_lines_above']:b] 302 | 303 | structure, data, headers = structure_rows(data_rows, meta_rows) 304 | 305 | #Construct df 306 | captions = [(col['value'] if 'value' in col.keys() else "---") +" (%s, %s)" % (col['type'], col['subtype']) for col in structure] 307 | 308 | table['captions'] = captions 309 | table['data'] = data 310 | table['header'] = " | ".join(headers) 311 | 312 | tables[b] = table 313 | 314 | return tables 315 | 316 | 317 | # ## Web App ## 318 | 319 | # In[124]: 320 | 321 | TITLE = "docX - Table View" 322 | 323 | scripts = [ 324 | "./bower_components/jquery/dist/jquery.min.js", 325 | "./bower_components/datatables/media/js/jquery.dataTables.js", 326 | "./bower_components/d3/d3.min.js", 327 | "./bower_components/metrics-graphics/dist/metricsgraphics.js", 328 | "./require.min.js", 329 | "./bower_components/react/react.js", 330 | "./bower_components/react-bootstrap/react-bootstrap.min.js", 331 | "./bower_components/pyxley/build/pyxley.js", 332 | ] 333 | 334 | css = [ 335 | "./bower_components/bootstrap/dist/css/bootstrap.min.css", 336 | "./bower_components/metrics-graphics/dist/metricsgraphics.css", 337 | "./bower_components/datatables/media/css/jquery.dataTables.min.css", 338 | "./css/main.css" 339 | ] 340 | 341 | 342 | UPLOAD_FOLDER = './' 343 | ALLOWED_EXTENSIONS = set(['txt', 'pdf']) 344 | 345 | app = Flask(__name__) 346 | app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER 347 | 348 | def get_extension(filename): 349 | return '.' in filename and filename.rsplit('.', 1)[1] 350 | 351 | def allowed_file(filename): 352 | return get_extension(filename) in ALLOWED_EXTENSIONS 353 | 354 | @app.route('/', methods=['GET', 'POST']) 355 | def upload_file(): 356 | if request.method == 'POST': 357 | file = request.files['file'] 358 | min_columns = request.form['min_columns'] 359 | if file and allowed_file(file.filename): 360 | filename = secure_filename(file.filename) 361 | extension = get_extension(file.filename) 362 | file.save(os.path.join(app.config['UPLOAD_FOLDER'], extension, filename)) 363 | return redirect(url_for('uploaded_file', 364 | filename=filename, min_columns=min_columns)) 365 | return ''' 366 | <!doctype html> 367 | <title>docX - Table Extractor</title> 368 | <h1>Upload a pdf or txt file</h1> 369 | <form action="" method=post enctype=multipart/form-data> 370 | <p><input type=file name=file> 371 | <input type=submit value=Upload> 372 | </p> 373 | <p>Select the minimum amount of columns tables should have 374 | <select name=min_columns> 375 | <option value=2>2</option> 376 | <option value=3>3</option> 377 | <option value=4>4</option> 378 | <option value=5>5</option> 379 | </select></form>
 380 | ''' 381 | 382 | all_charts = {} 383 | all_uis = {} 384 | 385 | @app.route('/show/<filename>') 386 | def uploaded_file(filename): 387 | extension = get_extension(filename) 388 | path = os.path.join(app.config['UPLOAD_FOLDER'], extension, filename) 389 | txt_path = os.path.join(app.config['UPLOAD_FOLDER'], 'txt', filename) 390 | if extension == "pdf": 391 | txt_path += '.txt' 392 | if not os.path.isfile(txt_path): 393 | #Layout preservation crucial to preserve clues about tabular data 394 | cmd = "pdftotext -layout %s %s" % (path, txt_path) 395 | os.system(cmd) 396 | 397 | min_columns = request.args.get('min_columns') 398 | tables = return_tables(txt_path) 399 | 400 | #Construct histogram 401 | lines_per_page = 80 402 | nr_data_rows = [] 403 | for b, t in tables.iteritems(): 404 | e = t['end_line'] 405 | #print b, e 406 | for l in range(b, e): 407 | page = l / lines_per_page 408 | if len(nr_data_rows) <= page: 409 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 410 | nr_data_rows[page] += 1 411 | dr = pd.DataFrame() 412 | dr['value'] = nr_data_rows 413 | dr['page'] = range(0, len(dr)) 414 | 415 | js_layout = filename+".js" 416 | 417 | ui_show = UILayout( 418 | "FilterChart", 419 | "../static/bower_components/pyxley/build/pyxley.js", 420 | "component_id", 421 | filter_style="''") 422 | 423 | if filename in all_charts: 424 | print "old/update ui", filename 425 | path_to_fig = '/show/line/'+filename 426 | #del all_charts[filename] 427 | #hFig = Figure(path_to_fig, "line") 428 | #bc = LineChart(dr, hFig, "page", ["page"], "Rows containing Data per Page") 429 | elif True: 430 | print "new ui", filename 431 | 432 | # Make a Button 433 | cols = ["page"] 434 | btn = SelectButton("Data", cols, "Data", "Data Rows per Page") 435 | 436 | # Make a FilterFrame and add the button to the UI 437 | ui_show.add_filter(btn) 438 | 439 | # Now make a FilterFrame for the histogram 440 | path_to_fig = '/show/line/'+filename 441 | hFig = Figure(path_to_fig, "line") 442 | hFig.layout.set_size(width=1000, height=300) 443 | hFig.layout.set_margin(left=80, right=80) 444 | #hFig.graphics.animate_on_load() 445 | 446 | bc = LineChart(dr, hFig, "page", ["value"], "Rows containing Data per Page") 447 | ui_show.add_chart(bc) 448 | all_charts[filename] = bc 449 | 450 | sb = ui_show.render_layout(app, "./static/ug/"+js_layout) 451 | 452 | _scripts = ["ug/"+js_layout] 453 | notices = ['Extraction Results for ' + filename, 'Ordered by lines'] 454 | 455 | dfs = (table_to_df(table).to_html() for table in tables.values()) 456 | headers = [] 457 | for t in tables.values(): 458 | if 'header' in t: 459 | headers.append(t['header']) 460 | else: 461 | headers.append('-') 462 | 463 | line_nrs = [('line %i-%i' % (t['begin_line'], t['end_line'])) for t in tables.values() ] 464 | #headers = ['aslkdfjas', ' alsdkfjasoedf'] 465 | 466 | return render_template('index.html', 467 | title=TITLE + ' - ' + filename, 468 | base_scripts=scripts, 469 | page_scripts=_scripts, 470 | css=css, notices = notices, tables = dfs, headers=headers, line_nrs=line_nrs) 471 | 472 | 473 | # In[ ]: 474 | 475 | app.run(debug=True, host='0.0.0.0') 476 | 477 | 478 | # 479 | # ## Tests ## 480 | 481 | # In[123]: 482 | 483 | def table_to_df(table): 484 | df = pd.DataFrame() 485 | 486 | for i, c in enumerate(table['captions']): 487 | values = [] 488 | for r in table['data']: 489 | values.append(r[i]['value']) 490 | df[c] = values 491 | 492 | return df 493 | 494 | for file in os.listdir('txt'): 495 | 496 | print ("--------" + file + "--------") 497 | tables = 
return_tables('txt/'+file) 498 | 499 | #print tables 500 | 501 | #Construct histogram 502 | lines_per_page = 80 503 | nr_data_rows = [] 504 | for b, t in tables.iteritems(): 505 | e = t['end_line'] 506 | #print b, e 507 | for l in range(b, e): 508 | page = l / lines_per_page 509 | if len(nr_data_rows) <= page: 510 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 511 | nr_data_rows[page] += 1 512 | dr = pd.DataFrame() 513 | dr['value'] = nr_data_rows 514 | dr['page'] = range(0, len(dr)) 515 | #print dr.head() 516 | 517 | line_nrs = [('line %i-%i' % (t['begin_line'], t['end_line'])) for t in tables.values() ] 518 | print line_nrs 519 | 520 | for k, table in tables.iteritems(): 521 | df = table_to_df(table) 522 | print k, ' !!! ', table['header'], ' -----------------' 523 | print df.head() 524 | 525 | 526 | #print dr 527 | 528 | # Make a Button 529 | cols = [c for c in df.columns if c != "Date"] 530 | btn = SelectButton("Data", cols, "Data", "Steps") 531 | 532 | # Make a FilterFrame and add the button to the UI 533 | ui.add_filter(btn) 534 | 535 | # Now make a FilterFrame for the histogram 536 | hFig = Figure("/mghist/", "myhist") 537 | hFig.layout.set_size(width=450, height=200) 538 | hFig.layout.set_margin(left=40, right=40) 539 | hFig.graphics.animate_on_load() 540 | # Make a histogram with 20 bins 541 | hc = Histogram(sf, hFig, "value", 20, init_params={"Data": "Steps"}) 542 | ui.add_chart(hc) 543 | 544 | # Let's play with our input 545 | df["Date"] = pd.to_datetime(df["Date"]) 546 | df["week"] = df["Date"].apply(lambda x: x.isocalendar()[1]) 547 | gf = df.groupby("week").agg({ 548 | "Date": [np.min, np.max], 549 | "Steps": np.sum, 550 | "Calories Burned": np.sum, 551 | "Distance": np.sum 552 | }).reset_index() 553 | f = lambda x: '_'.join(x) if (len(x[1]) > 0) and x[1] != 'sum' else x[0] 554 | gf.columns = [f(c) for c in gf.columns] 555 | gf = gf.sort_index(by="week", ascending=False) 556 | gf["Date_amin"] = gf["Date_amin"].apply(lambda x: x.strftime("%Y-%m-%d")) 557 | gf["Date_amax"] = gf["Date_amax"].apply(lambda x: x.strftime("%Y-%m-%d")) 558 | 559 | cols = OrderedDict([ 560 | ("week", {"label": "Week"}), 561 | ("Date_amin", {"label": "Start Date"}), 562 | ("Date_amax", {"label": "End Date"}), 563 | ("Calories Burned", {"label": "Calories Burned"}), 564 | ("Steps", {"label": "Steps"}), 565 | ("Distance", {"label": "Distance (mi)", "format": "%5.2f"}) 566 | ]) 567 | 568 | tb = DataTable("mytable", "/mytable/", gf, columns=cols, paging=True, pageLength=5) 569 | ui.add_chart(tb) 570 | 571 | sb = ui.render_layout(app, "./static/layout.js") 572 | # In[ ]: 573 | 574 | test_string =""" 575 | The following table sets forth statistical information relating to the Water System during the five 576 | Fiscal Years shown. 577 | TABLE 1 578 | WATER SYSTEM STATISTICS 579 | Fiscal Year Ended June 30 580 | 2014 2013 2012 2011 2010 581 | Anaheim Population Served .................................. 348,305 346,161 343,793 341,034 336,265 582 | Population Served Outside City (Est.) ................... 8,457 9,000 9,000 9,000 9,000 583 | Total Population Served ........................... 356,762 355,161 352,793 350,034 345,265 584 | 585 | Total Water Sales (Million Gallons) ................... 20,740 20,465 19,672 19,526 20,488 586 | 587 | Capacity (Million Gallons Per Day) 588 | From MWD Connections ................................... 110 110 110 110 110 589 | From Water System Wells (Average) ............... 79 86 88 81 75 590 | Total Supply Capacity ............................. 
189 196 198 191 185 591 | 592 | Treatment Plant Capacity .................................. 15 15 15 15 15 593 | 594 | Peak Day Distribution (Million Gallons) ............... 82.2 78.7 79.2 87.2 87.2 595 | Average Daily Distribution (Million Gallons) ....... 60.3 58.9 57.3 59.4 56.1 596 | Average Daily Sales Per Capita (Gallons) ............. 159.3 157.9 152.8 152.8 162.6 597 | __________________ 598 | Source: Anaheim 599 | 600 | Existing Facilities 601 | 602 | """.decode('ascii', 'ignore').split("\n") 603 | 604 | 605 | # In[ ]: 606 | 607 | rows = [row_feature(l) for l in test_string] 608 | 609 | tables = [rows[b:e] for b,e in filter_row_spans(rows, row_qualifies)] 610 | table = tables[0] 611 | s = structure_rows(table, rows[b-4:b]) 612 | print s[0] 613 | 614 | 615 | # In[ ]: 616 | 617 | test_string =""" 618 | CALIFORNIA MUNICIPAL FINANCE AUTHORITY 619 | Revenue Bonds, Series 2015-A 620 | (City of Anaheim Water System Project) 621 | 622 | MATURITY SCHEDULE 623 | 624 | $58,205,000 Serial Bonds 625 | 626 | Maturity Date Principal Interest 627 | (October 1) Amount Rate Yield CUSIP† 628 | 2015 $ 775,000 2.000% 0.100% 13048TTV5 629 | 2016 1,575,000 2.000 0.300 13048TTW3 630 | 2017 1,620,000 3.000 0.660 13048TTX1 631 | 2018 1,675,000 4.000 0.930 13048TTY9 632 | 2019 2,045,000 5.000 1.150 13048TTZ6 633 | 2020 2,155,000 5.000 1.320 13048TUA9 634 | 2021 2,250,000 4.000 1.520 13048TUB7 635 | 2022 2,610,000 5.000 1.670 13048TUC5 636 | 2023 2,730,000 4.000 1.810 13048TUD3 637 | 2024 2,875,000 5.000 1.920 13048TUE1 638 | 2025 3,025,000 5.000 2.030(c) 13048TUF8 639 | 2026 3,190,000 5.000 2.200(c) 13048TUG6 640 | 2027 3,355,000 5.000 2.320(c) 13048TUH4 641 | 2028 3,520,000 5.000 2.450(c) 13048TUJ0 642 | 2029 3,700,000 5.000 2.520(c) 13048TUK7 643 | 2030 3,880,000 5.000 2.600(c) 13048TUL5 644 | 2031 4,055,000 4.000 3.140(c) 13048TUM3 645 | 2032 4,220,000 4.000 3.190(c) 13048TUN1 646 | 2033 4,390,000 4.000 3.230(c) 13048TUP6 647 | 2034 4,560,000 4.000 3.270(c) 13048TUQ4 648 | 649 | $24,535,000 4.000% Term Bonds due October 1, 2040 – Yield: 3.400%(c); CUSIP†: 13048TUR2 650 | $13,145,000 5.250% Term Bonds due October 1, 2045 – Yield: 2.970%(c); CUSIP†: 13048TUS0 651 | 652 | """.decode('ascii', 'ignore').split("\n") 653 | 654 | 655 | # In[ ]: 656 | 657 | for file in os.listdir('txt'): 658 | 659 | print ("--------" + file + "--------") 660 | 661 | with codecs.open('txt/'+file, "r", "utf-8") as f: 662 | 663 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 664 | rows = [row_feature(l) for l in lines] 665 | 666 | for b,e in filter_row_spans(rows, row_qualifies): 667 | print "TABLE STARTING AT LINE", b 668 | table = rows[b:e] 669 | structure, data, headers = structure_rows(table, rows[b-config['meta_info_lines_above']:b]) 670 | print headers 671 | captions = [(col['value'] if 'value' in col.keys() else "---") +" (%s, %s)" % (col['type'], col['subtype']) for col in structure] 672 | print captions 673 | for r in data: 674 | cols = [col['value']+" (%s, %s)" % (col['type'], col['subtype']) for col in r] 675 | print len(cols), cols 676 | 677 | 678 | 679 | # In[ ]: 680 | 681 | rstr =""" 682 | Population Served Outside City (Est.) ................... 8,457 9,000 9,000 9,000 9,000 683 | Total Population Served ........................... 
356,762 355,161 352,793 350,034 345,265 684 | """.decode('ascii', 'ignore').split("\n") 685 | for r in rstr: 686 | print "split", re.split(tokenize_pattern, r) 687 | print "token", [v['value'] for v in row_feature(r)], row_feature(r) 688 | 689 | 690 | # In[ ]: 691 | 692 | #subtype_indicator['test'] = r'.*\$.*' 693 | for sub, indicator in subtype_indicator.iteritems(): 694 | print sub, indicator, re.match(indicator, " .......................................................... $ ") 695 | 696 | 697 | # In[ ]: 698 | 699 | 700 | 701 | -------------------------------------------------------------------------------- /deprecated/deploy_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SERVLET = "server" 4 | 5 | ipython nbconvert --to python $1 6 | OFILEP=${1%%ipynb} 7 | 8 | rm "$SERVLET"".py" 9 | mv "$OFILEP"".py" "$SERVLET"".py" 10 | 11 | nohup python "$SERVLET"".py" &> nohup.out.log& 12 | -------------------------------------------------------------------------------- /deprecated/manifest.yml: -------------------------------------------------------------------------------- 1 | applications: 2 | - path: . 3 | memory: 128M 4 | instances: 1 5 | domain: mybluemix.net 6 | name: TabulaRazr 7 | host: tabularazr 8 | disk_quota: 1024M 9 | 10 | -------------------------------------------------------------------------------- /deprecated/pdf2text_bulk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!pwd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 7, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "0/4 pdftotext -enc UTF-8 -layout pdf/EA716610-EA562590-EA958701.pdf txt/EA716610-EA562590-EA958701.pdf.txt\n", 26 | "1/4 pdftotext -enc UTF-8 -layout pdf/EP753324-ER508056-ER910760.pdf txt/EP753324-ER508056-ER910760.pdf.txt\n", 27 | "2/4 pdftotext -enc UTF-8 -layout pdf/ER544111-ER421289-ER823264.pdf txt/ER544111-ER421289-ER823264.pdf.txt\n", 28 | "3/4 pdftotext -enc UTF-8 -layout pdf/ER588705-ER457598-ER860368.pdf txt/ER588705-ER457598-ER860368.pdf.txt\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "from __future__ import print_function\n", 34 | "import re\n", 35 | "import os\n", 36 | "import codecs\n", 37 | "import string\n", 38 | "\n", 39 | "PDF_SUBFOLDER = 'pdf'\n", 40 | "TXT_SUBFOLDER = 'txt'\n", 41 | "\n", 42 | "def create_path(path):\n", 43 | " try: \n", 44 | " os.makedirs(path)\n", 45 | " except OSError:\n", 46 | " if not os.path.isdir(path):\n", 47 | " raise \n", 48 | "\n", 49 | "#Convert all pdfs\n", 50 | "files = os.listdir(PDF_SUBFOLDER)\n", 51 | "create_path(os.path.join(TXT_SUBFOLDER))\n", 52 | "\n", 53 | "for i,f in enumerate(files):\n", 54 | "\n", 55 | " pdf_path = os.path.join(PDF_SUBFOLDER, f)\n", 56 | " txt_path = os.path.join(TXT_SUBFOLDER, f+'.txt')\n", 57 | " \n", 58 | " if not os.path.isfile(txt_path):\n", 59 | " #Layout preservation crucial to maintain clues about tabular data\n", 60 | " cmd = \"pdftotext -enc UTF-8 -layout %s %s\" % (pdf_path, txt_path)\n", 61 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 62 | " os.system(cmd)\n", 63 | " else:\n", 64 | " print ('skipping %s, already exists.' 
% (pdf_path, ))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 2", 80 | "language": "python", 81 | "name": "python2" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 2 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython2", 93 | "version": "2.7.10" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 0 98 | } 99 | -------------------------------------------------------------------------------- /design/1_Home.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/1_Home.jpg -------------------------------------------------------------------------------- /design/2_Show.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/2_Show.jpg -------------------------------------------------------------------------------- /design/3_Browse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/3_Browse.jpg -------------------------------------------------------------------------------- /design/browse_similar_data_tables_feature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/browse_similar_data_tables_feature.png -------------------------------------------------------------------------------- /design/screenshot_show_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/screenshot_show_example.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask 2 | Jinja2 3 | pandas 4 | matplotlib 5 | fuzzywuzzy 6 | python-Levenshtein 7 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | from __future__ import print_function 5 | from backend import * 6 | from data_query import filter_tables 7 | import os 8 | import sys 9 | from collections import OrderedDict 10 | 11 | import json 12 | from flask import Flask, request, redirect, url_for, send_from_directory 13 | from werkzeug import secure_filename 14 | from flask import jsonify, render_template, make_response 15 | import urllib 16 | from urlparse import urlparse 17 | 18 | import matplotlib 19 | #Use the Agg backend so files can be processed without a display (prevents a runtime error) 20 | matplotlib.use('Agg') 21 | 22 | import matplotlib.pyplot as plt 23 | 24 | ####### 25 | 26 | TITLE = "TabulaRazr (XIRR for muni_bonds)" 27 | 28 | scripts = [ 29 | "bower_components/jquery/dist/jquery.min.js", 30 | "bower_components/materialize/dist/js/materialize.js" 31 
| ] 32 | css = [ 33 | "./css/main.css", 34 | "./css/style.css", 35 | "bower_components/materialize/dist/css/materialize.css" 36 | ] 37 | 38 | UPLOAD_FOLDER = './static/ug' 39 | ALLOWED_EXTENSIONS = set(['txt', 'pdf']) 40 | 41 | TITLE = "TabulaRazr" 42 | app = Flask(__name__) 43 | app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER 44 | 45 | 46 | 47 | def get_extension(filename): 48 | return '.' in filename and filename.rsplit('.', 1)[1] 49 | 50 | def allowed_file(filename): 51 | return get_extension(filename) in ALLOWED_EXTENSIONS 52 | 53 | def create_path(path): 54 | try: 55 | os.makedirs(path) 56 | except OSError: 57 | if not os.path.isdir(path): 58 | raise 59 | 60 | @app.route('/', methods=['GET', 'POST']) 61 | def upload_file(): 62 | 63 | if request.method == 'POST': 64 | 65 | file = request.files['file'] 66 | project = request.form['project'] 67 | url = request.form['url'] 68 | path = os.path.join(app.config['UPLOAD_FOLDER'], project) 69 | 70 | filename = None 71 | 72 | if url: 73 | url_fragments = urlparse(url) 74 | filename_temp = url_fragments.path.split(r'/')[-1] 75 | if url_fragments.scheme in ('http', 'ftp') and path and allowed_file(filename_temp): 76 | filename = secure_filename(filename_temp) 77 | create_path(path) 78 | path = os.path.join(path, filename) 79 | urllib.urlretrieve(url, path) 80 | 81 | elif file and allowed_file(file.filename): 82 | filename = secure_filename(file.filename) 83 | create_path(path) 84 | 85 | path = os.path.join(path, filename) 86 | file.save(path) 87 | 88 | if filename: 89 | return redirect(url_for('analyze', project=project, filename=filename)) 90 | 91 | return render_template('index.html', 92 | title=TITLE, 93 | css=css) 94 | 95 | 96 | def analyze_file(filename, project): 97 | 98 | if not project or project in ("/", "-"): 99 | project = "" 100 | 101 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 102 | extension = get_extension(filename) 103 | 104 | txt_path = path 105 | if extension == "pdf": 106 | txt_path += '.txt' 107 | filename += '.txt' 108 | if not os.path.isfile(txt_path): 109 | #Layout preservation crucial to preserve clues about tabular data 110 | cmd = "pdftotext -enc UTF-8 -layout %s %s " % (path, txt_path) 111 | os.system(cmd) 112 | 113 | if not os.path.isfile(txt_path): 114 | return None, filename, jsonify({'error' : txt_path+' not found' }) 115 | 116 | #Export tables 117 | tables = return_tables(txt_path) 118 | 119 | with codecs.open(txt_path + '.tables.json', "w", "utf-8") as file: 120 | json.dump(tables, file) 121 | 122 | #Export chart 123 | lines_per_page = 80 124 | nr_data_rows = [] 125 | #for t in tables.values(): 126 | # print t 127 | for key, t in tables.iteritems(): 128 | e = t['end_line'] 129 | b = t['begin_line'] 130 | for l in range(b, e): 131 | page = l / lines_per_page 132 | if len(nr_data_rows) <= page: 133 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 134 | nr_data_rows[page] += 1 135 | dr = pd.DataFrame() 136 | dr['value'] = nr_data_rows 137 | dr['page'] = range(0, len(dr)) 138 | 139 | #plot the row density 140 | chart = filename+".png" 141 | fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(7,2.5) ) # create figure & 1 axis 142 | ax.set_xlabel('page nr.') 143 | ax.set_ylabel('number of data rows') 144 | ax.set_title('Distribution of Rows with Data') 145 | ax.plot(dr['page'], dr['value'], ) 146 | fig.savefig(txt_path + '.png') # save the figure to file 147 | plt.close(fig) # close the figure 148 | 149 | return tables, filename, None 150 | 151 | #Todo: accept URLs 152 | @app.route('/analyze/<project>/<filename>', 
methods=['GET', 'POST']) 153 | def analyze(filename, project): 154 | 155 | tables, filename_new, error = analyze_file(filename, project) 156 | if error: 157 | return error 158 | 159 | if request.method == 'POST': 160 | return jsonify(tables) 161 | 162 | return redirect(url_for('show_one_file', filename=filename_new, project=project)) 163 | 164 | 165 | #Todo: factor out table rendering, overview etc., i.e. make functions more composable 166 | @app.route('/show/<project>/<filename>') 167 | def show_one_file(filename, project): 168 | 169 | if not project or project in ("/", "-"): 170 | project = "" 171 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 172 | 173 | tables_path = path + '.tables.json' 174 | chart_path_html = os.path.join('ug', project, filename + '.png') 175 | if not os.path.isfile(tables_path): 176 | analyze(filename, project) 177 | 178 | with codecs.open(tables_path, "r", "utf-8") as file: 179 | tables = json.load(file) 180 | tables = OrderedDict(sorted(tables.iteritems(), key=lambda kv: int(kv[0]))) 181 | #Todo: actually do the filtering 182 | filter_arg = request.args.get('filter_arg') 183 | 184 | #Create HTML 185 | notices = ['Extraction Results for ' + filename, 'Ordered by lines', 'Applied filter: %s' % filter_arg] 186 | dfs = (table_to_df(table).to_html() for table in tables.values()) 187 | 188 | headers = [] 189 | for t in tables.values(): 190 | if 'headers' in t: 191 | headers.append(" | ".join(h for h in t['headers'])) 192 | else: 193 | headers.append('NO HEADER') 194 | meta_data = [{'begin_line' : t['begin_line'], 'end_line' : t['end_line'], 195 | 'margin_top' : (t['begin_line']-t['meta_begin_line']) if 'meta_begin_line' in t else config["meta_info_lines_above"]} for t in tables.values()] 196 | 197 | filename_pdf = None 198 | 199 | path_pdf = path[:-4] 200 | if get_extension(path_pdf) == 'pdf' and os.path.isfile(path_pdf): 201 | filename_pdf = filename[:-4] 202 | 203 | return render_template('viewer.html', 204 | title=TITLE + ' - ' + filename, 205 | base_scripts=scripts, filename=filename, filename_pdf=filename_pdf, project=project, 206 | css=css, notices = notices, tables = dfs, headers=headers, meta_data=meta_data, chart=chart_path_html) 207 | 208 | @app.route('/inspector/<project>/<filename>') 209 | def inspector(filename, project): 210 | if not project or project in ("/", "-"): 211 | project = "" 212 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 213 | 214 | begin_line = int(request.args.get('data_begin')) 215 | end_line = int(request.args.get('data_end')) 216 | margin_top = int(request.args.get('margin_top', config["meta_info_lines_above"])) 217 | margin_bottom = margin_top 218 | 219 | #Todo: solve overlap as in more advanced branches 220 | notices = ['showing data lines from %i to %i with %i meta-lines above and below' % (begin_line, end_line, margin_top)] 221 | with codecs.open(path, "r", "utf-8", errors="replace") as file: 222 | lines = [l.encode("utf-8", errors="replace") for l in file][begin_line - margin_top:end_line + margin_bottom] 223 | top_lines = lines[:margin_top] 224 | table_lines = lines[margin_top:margin_top+end_line-begin_line] 225 | bottom_lines = lines[margin_top+end_line-begin_line:] 226 | 227 | offset = begin_line-margin_top 228 | table_id = begin_line 229 | 230 | return render_template('inspector.html', 231 | title=TITLE, 232 | base_scripts=scripts, css=css, notices = notices, filename=filename, top_lines=top_lines, project=project, 233 | table_lines=table_lines, bottom_lines=bottom_lines, offset=offset, table_id=begin_line) 234 | 235 | 
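# ------------------------------------------------------------------------
# Not part of the app: a minimal client sketch for the analyze route above.
# Assumptions (not in this repository): the server is running locally on
# its default port 7081, the requests package is installed, and a file
# named sample.pdf was previously uploaded into a project called muni_bonds.
#
#   import requests
#   # a POST to /analyze returns the extracted tables as JSON,
#   # keyed by each table's begin_line
#   tables = requests.post(
#       'http://localhost:7081/analyze/muni_bonds/sample.pdf').json()
#   for key, t in sorted(tables.items(), key=lambda kv: int(kv[0])):
#       print(key, t['begin_line'], t['end_line'])
# ------------------------------------------------------------------------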
@app.route('/project_analysis', methods=['POST']) 236 | def project_analysis(): 237 | project = request.form['project'] 238 | if not project or project in ("/", "-"): 239 | project = "" 240 | filter_arg = request.form['filter'] 241 | return redirect(url_for('filter_tables_web', project=project, filter=filter_arg)) 242 | 243 | @app.route('/filter_tables/<project>', methods=['GET', 'POST']) 244 | def filter_tables_web(project): 245 | if not project or project in ("/", "-"): 246 | project = "" 247 | path = os.path.join(app.config['UPLOAD_FOLDER'], project) 248 | 249 | filter_arg = request.args.get('filter') 250 | filter_file = os.path.join('static', 'filters', filter_arg +'.json') 251 | with codecs.open(filter_file, "r", "utf-8", errors="replace") as file: 252 | _filter = json.load(file) 253 | 254 | #Go through all .txt files in the project, grab tables and return filtered result 255 | files = os.listdir(path) 256 | results = {} 257 | files_analyzed = set() 258 | nr_tables = 0 259 | for i,f in enumerate(files): 260 | 261 | extension = get_extension(f) 262 | tables_path = os.path.join(path, f + '.tables.json') 263 | 264 | if extension == "txt": 265 | 266 | tables = None 267 | if not os.path.isfile(tables_path): 268 | #Analyze on the spot: 269 | tables, f_new, error = analyze_file(f, project) 270 | print ("on the spot", f_new, project, tables_path, error, len(tables) if tables else 0) 271 | if error: 272 | return error 273 | else: 274 | with codecs.open(tables_path, "r", "utf-8") as file: 275 | tables = json.load(file) 276 | files_analyzed.add(f) 277 | 278 | #Only keep highest results 279 | for t in filter_tables(tables.values(), _filter): 280 | if f not in results: 281 | results[f] = [t] 282 | else: 283 | max_c = max(r[0] for r in results[f]) 284 | if t[0] >= max_c: 285 | results[f].append(t) 286 | nr_tables += len(tables) 287 | #Keep all results 288 | #results[f] = [t for t in filter_tables(tables.values(), _filter)] 289 | 290 | #return jsonify(results) 291 | #Todo: create .csv for download 292 | for filename, extracted_results in results.iteritems(): 293 | for result in extracted_results: 294 | t_html = table_to_df(result[1]).to_html() 295 | result[1]['html'] = t_html 296 | 297 | total_best_tables = sum(len(results[r]) for r in results.keys()) 298 | notices = ["Project %s filtered by %s" % (project, filter_arg), 299 | "Total of %i tables exist in %i files" % (nr_tables, len(files_analyzed)), 300 | "%i best tables across %i files" % (total_best_tables, len(results)) ] 301 | 302 | return render_template('filtered_project.html', 303 | title=TITLE + ' - ' + project + ' filtered by ' + filter_arg, 304 | base_scripts=scripts, project=project, 305 | css=css, notices = notices, results=results) 306 | 307 | from xirr_calc import xirr 308 | import traceback 309 | @app.route('/calculate_xirr/<project>/<filename>') 310 | def calculate_xirr(filename, project): 311 | 312 | if not project or project in ("/", "-"): 313 | project = "" 314 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 315 | tables_path = path + '.tables.json' 316 | 317 | if not os.path.isfile(tables_path): 318 | analyze(filename, project) 319 | 320 | with codecs.open(tables_path, "r", "utf-8") as file: 321 | tables = json.load(file) 322 | 323 | #Todo: factor out into xirr_calc 324 | results = {"funds" : [], "maturity_schedule" : [] } 325 | try: 326 | #Todo: factor out into "take_one" function 327 | for k, filter_results in results.iteritems(): 328 | 329 | filter_file = os.path.join('static', 'filters', k+'.json') 330 | with codecs.open(filter_file, "r", "utf-8", errors="replace") as 
file: 331 | _filter = json.load(file) 332 | 333 | #Only keep highest results 334 | for t in filter_tables(tables.values(), _filter): 335 | if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results): 336 | filter_results.append(t) 337 | t_html = table_to_df(t[1]).to_html() 338 | filter_results[-1][1]['html'] = t_html 339 | 340 | log = [] 341 | # Get salient tables 342 | log.append("Found %i candidates for funds and %i for maturity schedule" % \ 343 | (len(results['funds']), len(results['maturity_schedule'])) ) 344 | funds_table = max( sorted( results['funds'], key = lambda t: t[1]['begin_line'] ), key = lambda t: t[0])[1] 345 | schedule_table = max( sorted( results['maturity_schedule'], key = lambda t: t[1]['begin_line'] ), key = lambda t: t[0])[1] 346 | log.append("Using table %i for funds and table %i for maturity schedule" 347 | % (funds_table['begin_line'], schedule_table['begin_line'])) 348 | 349 | with codecs.open(path, "r", "utf-8") as file: 350 | rate, log_list = xirr(file, funds_table, schedule_table) 351 | 352 | log += log_list 353 | if rate: 354 | log.append("Final Rate: %0.2f%%" % rate) 355 | 356 | except Exception as e: 357 | log.append("... failed with %s" % traceback.format_exception(*sys.exc_info())) 358 | return render_template('view_filtered.html', 359 | title=TITLE + ' - ' + filename + ' XIRR calculator with filters: ' + ", ".join(results.keys()), 360 | base_scripts=scripts, filename=filename, project=project, 361 | css=css, notices = log, results=results) 362 | 363 | 364 | def run_from_ipython(): 365 | try: 366 | __IPYTHON__ 367 | return True 368 | except NameError: 369 | return False 370 | 371 | if __name__ == "__main__": 372 | if run_from_ipython(): 373 | app.run(host='0.0.0.0', port = 7080) 374 | else: 375 | PORT = int(os.getenv('PORT', 7081)) 376 | app.run(debug=True, host='0.0.0.0', port = PORT) 377 | 378 | 379 | 380 | 381 | 382 | -------------------------------------------------------------------------------- /static/TabulaRazr_Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/TabulaRazr_Logo.png -------------------------------------------------------------------------------- /static/center_for_municipal_finance_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/center_for_municipal_finance_logo.png -------------------------------------------------------------------------------- /static/css/main.css: -------------------------------------------------------------------------------- 1 | 2 | .pyx-slider { 3 | width: 200px !important; 4 | } 5 | 6 | .custom-dt { 7 | width: 600px; 8 | } 9 | 10 | html { 11 | height: 100%; 12 | } 13 | 14 | body { 15 | height: 100%; 16 | font-style:normal; 17 | font-family:"BrandonText-Regular", sans-serif; 18 | } 19 | 20 | footer { 21 | position: absolute; 22 | bottom: 0; 23 | width: 100%; 24 | } 25 | 26 | #logos { 27 | line-height:100px; 28 | } 29 | 30 | #logos img { 31 | display: inline-block; 32 | vertical-align: middle; 33 | } 34 | 35 | .info-block-icon { 36 | display: inline-block; 37 | vertical-align: middle; 38 | font-size: 3.5em; 39 | margin: 0 3% 0 0; 40 | } 41 | 42 | .info-block-text { 43 | display: inline-block; 44 | vertical-align: middle; 45 | font-size: 1.5em; 46 | } 47 | 48 | @media only screen and (max-width: 1250px) { 49 | .info-block-icon { 50 | font-size: 3em; 51 | } 52 | .info-block-text { 53 | font-size: 1.25em; 54 | } 55 | } 56 | 57 | #mychart { 58 | display: inline-block; 59 | } 60 | 61 | #myhist { 62 | display: inline-block; 63 | } 64 | 65 | select { 66 | display: initial !important; 67 | } 68 | 69 | .icon-block .material-icons { 70 | font-size: inherit !important; 71 | } 72 | 73 | .full-width { 74 | width: 100%; 75 | } 76 | 77 | .card-btn { 78 | box-shadow: 0 2px 5px 0 rgba(0, 0, 0, 0.16), 0 2px 10px 0 rgba(0, 0, 0, 0.12); 79 | } 80 | 81 | .card-btn:hover { 82 | transition: all .25s; 83 | box-shadow: 0 5px 11px 0 rgba(0, 0, 0, 0.18), 0 4px 15px 0 rgba(0, 0, 0, 0.15); 84 | } 85 | 86 | .card-btn.blue:hover { 87 | background-color: #42a5f5 !important; 88 | } 89 | 90 | h5.truncate { 91 | cursor: pointer; 92 | } 93 | 94 | blockquote { 95 | border-color: #2196F3 !important; 96 | } 97 | 98 | .table-showhide { 99 | float: right; 100 | } 101 | 102 | .dataTables_scroll { 103 | clear: both; 104 | overflow: auto; 105 | } 106 | 107 | .mg-line1-color { 108 | stroke: #7bc9c1; 109 | } 110 | 111 | .mg-area1-color { 112 | fill: 
#7bc9c1; 113 | } 114 | 115 | .mg-histogram .mg-bar rect.active { 116 | fill: #7bc9c1; 117 | } 118 | 119 | .mg-histogram .mg-bar rect { 120 | fill: #7bc9c1; 121 | shape-rendering: auto; 122 | } 123 | 124 | 125 | .mg-x-axis text, 126 | .mg-y-axis text, 127 | .mg-histogram .axis text { 128 | fill: black; 129 | font-size: 1.2rem; 130 | opacity: 0.5; 131 | } 132 | 133 | .mg-x-axis .label, 134 | .mg-y-axis .label, 135 | .mg-axis .label { 136 | font-size: 1.2rem; 137 | text-transform: uppercase; 138 | font-weight: 400; 139 | } 140 | 141 | /* Temporary fix - hide footer when screen is not sufficiently tall */ 142 | @media only screen and (max-height: 730px) { 143 | .page-footer { 144 | display: none; 145 | } 146 | } 147 | 148 | .footer-image { 149 | width: 60px; 150 | box-shadow: 0 2px 5px 0 rgba(0, 0, 0, 0.16), 0 2px 10px 0 rgba(0, 0, 0, 0.12); 151 | } 152 | 153 | .footer-image:hover { 154 | transition: all .25s; 155 | box-shadow: 0 5px 11px 0 rgba(0, 0, 0, 0.18), 0 4px 15px 0 rgba(0, 0, 0, 0.15); 156 | } 157 | 158 | #find-us-list li { 159 | display: inline-block; 160 | margin-right: 0.5em; 161 | vertical-align: top; 162 | } -------------------------------------------------------------------------------- /static/css/source/index.html: -------------------------------------------------------------------------------- [Demo page for the table styles defined in static/css/style.css. Its HTML markup did not survive extraction; only the cell text of its fourteen sample tables remains. The page, titled "Top 10 Express Table Designs - Smashing Magazine Source", cycles through four small datasets: an employee roster (Employee / Salary / Bonus / Supervisor), a movie grid (Comedy / Adventure / Action / Children), quarterly company figures (Company / Q1 / Q2 / Q3 / Q4), and a country list (Nation / Capital / Language / Unique).] -------------------------------------------------------------------------------- /static/css/source/table-images/back.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/back.png -------------------------------------------------------------------------------- /static/css/source/table-images/blurry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/blurry.jpg -------------------------------------------------------------------------------- /static/css/source/table-images/botleft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/botleft.png -------------------------------------------------------------------------------- /static/css/source/table-images/botright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/botright.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradback.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradhead.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradhover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradhover.png -------------------------------------------------------------------------------- /static/css/source/table-images/header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/header.jpg -------------------------------------------------------------------------------- /static/css/source/table-images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/left.png -------------------------------------------------------------------------------- /static/css/source/table-images/pattern-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern-head.png
-------------------------------------------------------------------------------- /static/css/source/table-images/pattern.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern.gif -------------------------------------------------------------------------------- /static/css/source/table-images/pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern.png -------------------------------------------------------------------------------- /static/css/source/table-images/patternb-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/patternb-head.png -------------------------------------------------------------------------------- /static/css/source/table-images/patternb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/patternb.png -------------------------------------------------------------------------------- /static/css/source/table-images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/right.png -------------------------------------------------------------------------------- /static/css/style.css: -------------------------------------------------------------------------------- 1 | /* ------------------ 2 | styling for the tables 3 | ------------------ */ 4 | 5 | 6 | body 7 | { 8 | line-height: 1.6em; 9 | } 10 | 11 | 12 | .dataframe 13 | { 14 | font-family: "Myriad Pro", "Open Sans", Sans-Serif; 15 | font-size: 14px; 16 | background: #fff; 17 | margin: 10px; 18 | width: 100%; 19 | border-collapse: collapse; 20 | text-align: left; 21 | } 22 | .dataframe thead th 23 | { 24 | font-size: 14px; 25 | font-weight: normal; 26 | color: #039; 27 | padding: 10px 8px; 28 | border-bottom: 2px solid #6678b1; 29 | } 30 | 31 | .dataframe tbody tr{ 32 | border-bottom: 1px solid #B4C1E8; 33 | } 34 | .dataframe td 35 | { 36 | transition: all 0.3s; 37 | color: #669; 38 | padding: 9px 8px 9px 8px; 39 | } 40 | 41 | .dataframe tbody tr:nth-child(even) { 42 | background: #F5F5F5; 43 | } 44 | 45 | .dataframe tbody tr:hover{ 46 | background-color: #F1F1F9; 47 | } 48 | 49 | .dataframe tbody tr:hover td 50 | { 51 | color: #009; 52 | } 53 | 54 | .meta-line, .table-line 55 | { 56 | padding: 0px; 57 | margin: 0px; 58 | } 59 | 60 | .meta-line 61 | { 62 | color: grey; 63 | } 64 | 65 | 66 | #hor-minimalist-b 67 | { 68 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 69 | font-size: 12px; 70 | background: #fff; 71 | margin: 45px; 72 | width: 480px; 73 | border-collapse: collapse; 74 | text-align: left; 75 | } 76 | #hor-minimalist-b th 77 | { 78 | font-size: 14px; 79 | font-weight: normal; 80 | color: #039; 81 | padding: 10px 8px; 82 | border-bottom: 2px solid #6678b1; 83 | } 84 | #hor-minimalist-b td 85 | { 86 | border-bottom: 1px solid #ccc; 87 | color: #669; 88 | padding: 6px 8px; 89 | 
} 90 | #hor-minimalist-b tbody tr:hover td 91 | { 92 | color: #009; 93 | } 94 | 95 | 96 | #ver-minimalist 97 | { 98 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 99 | font-size: 12px; 100 | margin: 45px; 101 | width: 480px; 102 | text-align: left; 103 | border-collapse: collapse; 104 | } 105 | #ver-minimalist th 106 | { 107 | padding: 8px 2px; 108 | font-weight: normal; 109 | font-size: 14px; 110 | border-bottom: 2px solid #6678b1; 111 | border-right: 30px solid #fff; 112 | border-left: 30px solid #fff; 113 | color: #039; 114 | } 115 | #ver-minimalist td 116 | { 117 | padding: 12px 2px 0px 2px; 118 | border-right: 30px solid #fff; 119 | border-left: 30px solid #fff; 120 | color: #669; 121 | } 122 | 123 | 124 | #box-table-a 125 | { 126 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 127 | font-size: 12px; 128 | margin: 45px; 129 | width: 480px; 130 | text-align: left; 131 | border-collapse: collapse; 132 | } 133 | #box-table-a th 134 | { 135 | font-size: 13px; 136 | font-weight: normal; 137 | padding: 8px; 138 | background: #b9c9fe; 139 | border-top: 4px solid #aabcfe; 140 | border-bottom: 1px solid #fff; 141 | color: #039; 142 | } 143 | #box-table-a td 144 | { 145 | padding: 8px; 146 | background: #e8edff; 147 | border-bottom: 1px solid #fff; 148 | color: #669; 149 | border-top: 1px solid transparent; 150 | } 151 | #box-table-a tr:hover td 152 | { 153 | background: #d0dafd; 154 | color: #339; 155 | } 156 | 157 | 158 | #box-table-b 159 | { 160 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 161 | font-size: 12px; 162 | margin: 45px; 163 | width: 480px; 164 | text-align: center; 165 | border-collapse: collapse; 166 | border-top: 7px solid #9baff1; 167 | border-bottom: 7px solid #9baff1; 168 | } 169 | #box-table-b th 170 | { 171 | font-size: 13px; 172 | font-weight: normal; 173 | padding: 8px; 174 | background: #e8edff; 175 | border-right: 1px solid #9baff1; 176 | border-left: 1px solid #9baff1; 177 | color: #039; 178 | } 179 | #box-table-b td 180 | { 181 | padding: 8px; 182 | background: #e8edff; 183 | border-right: 1px solid #aabcfe; 184 | border-left: 1px solid #aabcfe; 185 | color: #669; 186 | } 187 | 188 | 189 | #hor-zebra 190 | { 191 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 192 | font-size: 12px; 193 | margin: 45px; 194 | width: 480px; 195 | text-align: left; 196 | border-collapse: collapse; 197 | } 198 | #hor-zebra th 199 | { 200 | font-size: 14px; 201 | font-weight: normal; 202 | padding: 10px 8px; 203 | color: #039; 204 | } 205 | #hor-zebra td 206 | { 207 | padding: 8px; 208 | color: #669; 209 | } 210 | #hor-zebra .odd 211 | { 212 | background: #e8edff; 213 | } 214 | 215 | 216 | #ver-zebra 217 | { 218 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 219 | font-size: 12px; 220 | margin: 45px; 221 | width: 480px; 222 | text-align: left; 223 | border-collapse: collapse; 224 | } 225 | #ver-zebra th 226 | { 227 | font-size: 14px; 228 | font-weight: normal; 229 | padding: 12px 15px; 230 | border-right: 1px solid #fff; 231 | border-left: 1px solid #fff; 232 | color: #039; 233 | } 234 | #ver-zebra td 235 | { 236 | padding: 8px 15px; 237 | border-right: 1px solid #fff; 238 | border-left: 1px solid #fff; 239 | color: #669; 240 | } 241 | .vzebra-odd 242 | { 243 | background: #eff2ff; 244 | } 245 | .vzebra-even 246 | { 247 | background: #e8edff; 248 | } 249 | #ver-zebra #vzebra-adventure, #ver-zebra #vzebra-children 250 | { 251 | background: #d0dafd; 252 | border-bottom: 1px solid #c8d4fd; 253 
| } 254 | #ver-zebra #vzebra-comedy, #ver-zebra #vzebra-action 255 | { 256 | background: #dce4ff; 257 | border-bottom: 1px solid #d6dfff; 258 | } 259 | 260 | 261 | #one-column-emphasis 262 | { 263 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 264 | font-size: 12px; 265 | margin: 45px; 266 | width: 480px; 267 | text-align: left; 268 | border-collapse: collapse; 269 | } 270 | #one-column-emphasis th 271 | { 272 | font-size: 14px; 273 | font-weight: normal; 274 | padding: 12px 15px; 275 | color: #039; 276 | } 277 | #one-column-emphasis td 278 | { 279 | padding: 10px 15px; 280 | color: #669; 281 | border-top: 1px solid #e8edff; 282 | } 283 | .oce-first 284 | { 285 | background: #d0dafd; 286 | border-right: 10px solid transparent; 287 | border-left: 10px solid transparent; 288 | } 289 | #one-column-emphasis tr:hover td 290 | { 291 | color: #339; 292 | background: #eff2ff; 293 | } 294 | 295 | 296 | #newspaper-a 297 | { 298 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 299 | font-size: 12px; 300 | margin: 45px; 301 | width: 480px; 302 | text-align: left; 303 | border-collapse: collapse; 304 | border: 1px solid #69c; 305 | } 306 | #newspaper-a th 307 | { 308 | padding: 12px 17px 12px 17px; 309 | font-weight: normal; 310 | font-size: 14px; 311 | color: #039; 312 | border-bottom: 1px dashed #69c; 313 | } 314 | #newspaper-a td 315 | { 316 | padding: 7px 17px 7px 17px; 317 | color: #669; 318 | } 319 | #newspaper-a tbody tr:hover td 320 | { 321 | color: #339; 322 | background: #d0dafd; 323 | } 324 | 325 | 326 | #newspaper-b 327 | { 328 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 329 | font-size: 12px; 330 | margin: 45px; 331 | width: 480px; 332 | text-align: left; 333 | border-collapse: collapse; 334 | border: 1px solid #69c; 335 | } 336 | #newspaper-b th 337 | { 338 | padding: 15px 10px 10px 10px; 339 | font-weight: normal; 340 | font-size: 14px; 341 | color: #039; 342 | } 343 | #newspaper-b tbody 344 | { 345 | background: #e8edff; 346 | } 347 | #newspaper-b td 348 | { 349 | padding: 10px; 350 | color: #669; 351 | border-top: 1px dashed #fff; 352 | } 353 | #newspaper-b tbody tr:hover td 354 | { 355 | color: #339; 356 | background: #d0dafd; 357 | } 358 | 359 | 360 | #newspaper-c 361 | { 362 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 363 | font-size: 12px; 364 | margin: 45px; 365 | width: 480px; 366 | text-align: left; 367 | border-collapse: collapse; 368 | border: 1px solid #6cf; 369 | } 370 | #newspaper-c th 371 | { 372 | padding: 20px; 373 | font-weight: normal; 374 | font-size: 13px; 375 | color: #039; 376 | text-transform: uppercase; 377 | border-right: 1px solid #0865c2; 378 | border-top: 1px solid #0865c2; 379 | border-left: 1px solid #0865c2; 380 | border-bottom: 1px solid #fff; 381 | } 382 | #newspaper-c td 383 | { 384 | padding: 10px 20px; 385 | color: #669; 386 | border-right: 1px dashed #6cf; 387 | } 388 | 389 | 390 | #rounded-corner 391 | { 392 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 393 | font-size: 12px; 394 | margin: 45px; 395 | width: 480px; 396 | text-align: left; 397 | border-collapse: collapse; 398 | } 399 | #rounded-corner thead th.rounded-company 400 | { 401 | background: #b9c9fe url('table-images/left.png') left -1px no-repeat; 402 | } 403 | #rounded-corner thead th.rounded-q4 404 | { 405 | background: #b9c9fe url('table-images/right.png') right -1px no-repeat; 406 | } 407 | #rounded-corner th 408 | { 409 | padding: 8px; 410 | font-weight: normal; 411 | font-size: 
13px; 412 | color: #039; 413 | background: #b9c9fe; 414 | } 415 | #rounded-corner td 416 | { 417 | padding: 8px; 418 | background: #e8edff; 419 | border-top: 1px solid #fff; 420 | color: #669; 421 | } 422 | #rounded-corner tfoot td.rounded-foot-left 423 | { 424 | background: #e8edff url('table-images/botleft.png') left bottom no-repeat; 425 | } 426 | #rounded-corner tfoot td.rounded-foot-right 427 | { 428 | background: #e8edff url('table-images/botright.png') right bottom no-repeat; 429 | } 430 | #rounded-corner tbody tr:hover td 431 | { 432 | background: #d0dafd; 433 | } 434 | 435 | 436 | #background-image 437 | { 438 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 439 | font-size: 12px; 440 | margin: 45px; 441 | width: 480px; 442 | text-align: left; 443 | border-collapse: collapse; 444 | background: url('table-images/blurry.jpg') 330px 59px no-repeat; 445 | } 446 | #background-image th 447 | { 448 | padding: 12px; 449 | font-weight: normal; 450 | font-size: 14px; 451 | color: #339; 452 | } 453 | #background-image td 454 | { 455 | padding: 9px 12px; 456 | color: #669; 457 | border-top: 1px solid #fff; 458 | } 459 | #background-image tfoot td 460 | { 461 | font-size: 11px; 462 | } 463 | #background-image tbody td 464 | { 465 | background: url('table-images/back.png'); 466 | } 467 | * html #background-image tbody td 468 | { 469 | /* 470 | ---------------------------- 471 | PUT THIS ON IE6 ONLY STYLE 472 | AS THE RULE INVALIDATES 473 | YOUR STYLESHEET 474 | ---------------------------- 475 | */ 476 | filter:progid:DXImageTransform.Microsoft.AlphaImageLoader(src='table-images/back.png',sizingMethod='crop'); 477 | background: none; 478 | } 479 | #background-image tbody tr:hover td 480 | { 481 | color: #339; 482 | background: none; 483 | } 484 | 485 | 486 | #gradient-style 487 | { 488 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 489 | font-size: 12px; 490 | margin: 45px; 491 | width: 480px; 492 | text-align: left; 493 | border-collapse: collapse; 494 | } 495 | #gradient-style th 496 | { 497 | font-size: 13px; 498 | font-weight: normal; 499 | padding: 8px; 500 | background: #b9c9fe url('table-images/gradhead.png') repeat-x; 501 | border-top: 2px solid #d3ddff; 502 | border-bottom: 1px solid #fff; 503 | color: #039; 504 | } 505 | #gradient-style td 506 | { 507 | padding: 8px; 508 | border-bottom: 1px solid #fff; 509 | color: #669; 510 | border-top: 1px solid #fff; 511 | background: #e8edff url('table-images/gradback.png') repeat-x; 512 | } 513 | #gradient-style tfoot tr td 514 | { 515 | background: #e8edff; 516 | font-size: 12px; 517 | color: #99c; 518 | } 519 | #gradient-style tbody tr:hover td 520 | { 521 | background: #d0dafd url('table-images/gradhover.png') repeat-x; 522 | color: #339; 523 | } 524 | 525 | 526 | #pattern-style-a 527 | { 528 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 529 | font-size: 12px; 530 | margin: 45px; 531 | width: 480px; 532 | text-align: left; 533 | border-collapse: collapse; 534 | background: url('table-images/pattern.png'); 535 | } 536 | #pattern-style-a thead tr 537 | { 538 | background: url('table-images/pattern-head.png'); 539 | } 540 | #pattern-style-a th 541 | { 542 | font-size: 13px; 543 | font-weight: normal; 544 | padding: 8px; 545 | border-bottom: 1px solid #fff; 546 | color: #039; 547 | } 548 | #pattern-style-a td 549 | { 550 | padding: 8px; 551 | border-bottom: 1px solid #fff; 552 | color: #669; 553 | border-top: 1px solid transparent; 554 | } 555 | #pattern-style-a tbody tr:hover td 556 | { 
557 | color: #339; 558 | background: #fff; 559 | } 560 | 561 | 562 | #pattern-style-b 563 | { 564 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 565 | font-size: 12px; 566 | margin: 45px; 567 | width: 480px; 568 | text-align: left; 569 | border-collapse: collapse; 570 | background: url('table-images/patternb.png'); 571 | } 572 | #pattern-style-b thead tr 573 | { 574 | background: url('table-images/patternb-head.png'); 575 | } 576 | #pattern-style-b th 577 | { 578 | font-size: 13px; 579 | font-weight: normal; 580 | padding: 8px; 581 | border-bottom: 1px solid #fff; 582 | color: #039; 583 | } 584 | #pattern-style-b td 585 | { 586 | padding: 8px; 587 | border-bottom: 1px solid #fff; 588 | color: #669; 589 | border-top: 1px solid transparent; 590 | } 591 | #pattern-style-b tbody tr:hover td 592 | { 593 | color: #339; 594 | background: #cdcdee; 595 | } 596 | -------------------------------------------------------------------------------- /static/filters/funds.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "Estimated uses and sources of funds", 3 | "headers" : { 4 | "terms" : ["USES OF FUNDS"], 5 | "threshold" : 0.35 6 | } 7 | } -------------------------------------------------------------------------------- /static/filters/maturity_schedule.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "Maturity Schedule / Debt Service", 3 | "headers" : { 4 | "terms" : ["DEBT SERVICE", "Bonds Maturing"], 5 | "threshold" : 0.35 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /static/scrutiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/scrutiny.png -------------------------------------------------------------------------------- /static/xirr_calculator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/xirr_calculator.png -------------------------------------------------------------------------------- /templates/filtered_project.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 | 17 | {% for s in base_scripts %} 18 | 19 | {% endfor %} 20 | 21 | {% for s in page_scripts %} 22 | 23 | {% endfor %} 24 | 25 | 26 | 27 |
    28 | {% for n in notices %} 29 |
  • {{n}}
  • 30 | {% endfor %} 31 |
32 | 33 |

34 |

35 |
    36 | {% for filename, results in results.iteritems() %} 37 |
  • 38 |

    {{filename}} has {{results|length}} 39 | highest ranking table(s)

    40 | {% for result in results %} 41 | {% set table = result[1] %} 42 |
    Confidence = {{result[0]}} in Table {{table["begin_line"]}}
    43 | {% for header in table["headers"] %} 44 | {{header}}
    45 | {% endfor %} 46 | {{table["html"]|safe}} 47 | {% endfor %} 48 |
  • 49 | 50 | {% endfor %} 51 | 52 |
53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | {% for j in scripts %} 16 | 17 | {% endfor %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 |
27 | Fork me on GitHub 28 |
29 | 30 | 31 |
32 |
33 |
34 |

Extract the real costs from Municipal Bonds

35 |
36 | Browse .pdf or .txt file
37 |
38 |
39 | File 40 | 41 |
42 |
43 | 44 |
45 |
46 | .. or paste URL
47 | Choose project
48 | 56 |

57 | 60 |
61 | 62 |
63 |
64 |

cloud_upload

65 |

Upload PDF documents

66 |
67 |
68 |

unarchive

69 |

Extract tabular data

70 |
71 |
72 |

visibility

73 |

Analyze and compare tables

74 |
75 |
76 | 100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
About TabulaRazr
108 |

Extract and browse tabular data from legacy financial documents with ease. 109 |
DeveloperWeek (Accelerate.im) Project Page 110 |

111 |
112 |
113 |
Find Us On
114 |
    115 |
  • 116 |
  • email
  • 117 |
118 |
119 |
120 |
121 | 126 |
127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /templates/inspector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 |
20 | 21 |
22 | 23 | add 24 | 25 | 28 |
29 | 30 |
31 | {% for s in base_scripts %} 32 | 33 | {% endfor %} 34 | 35 | {% for s in page_scripts %} 36 | 37 | {% endfor %} 38 | 39 |

Viewing Table @{{table_id}} from {{filename}}

40 | 41 | {% for s in page_scripts %} 42 | 43 | {% endfor %} 44 | 45 | {% for n in notices %} 46 |
{{n}} 47 | {% endfor %} 48 | 49 | 50 |

Context Rows Above

51 | {% for l in top_lines %} 52 |
{{loop.index0+offset}} {{l.decode('utf-8')}}
53 | {% endfor %} 54 |

Table Rows and Lines

55 | {% for l in table_lines %} 56 | 57 | {% endfor %} 58 |

Context Rows Below

59 | {% for l in bottom_lines %} 60 | 61 | {% endfor %} 62 |
63 | 64 | 65 | -------------------------------------------------------------------------------- /templates/view_filtered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | 12 | {% endfor %} 13 | 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 |
22 |
23 | 24 | add 25 | 26 | 29 |
30 |
31 | {% for s in base_scripts %} 32 | 33 | {% endfor %} 34 | 35 | {% for s in page_scripts %} 36 | 37 | {% endfor %} 38 | 39 |

Calculate Total Interest Cost for {{filename}}

40 |
41 |
42 |
    43 | {% for n in notices %} 44 |
  • {{n|safe}}
  • 45 | {% endfor %} 46 |
47 |
48 |
49 | View all tables 50 |
51 |
52 | 53 | {% for filter, results in results.iteritems() %} 54 | 55 |

Filter {{filter}} returned these tables with high confidence:

56 | 57 | {% for result in results %} 58 |
  • 59 |
    60 | 61 | 62 | {% set table = result[1] %} 63 | Confidence = {{result[0]}} in Table {{table["begin_line"]}}
    64 | {% for header in table["headers"] %} 65 | {{header}}
    66 | {% endfor %} 67 | 68 | {{table["html"]|safe}} 69 |
    70 |
  • 71 | {% endfor %} 72 | 73 | {% endfor %} 74 | 75 | 76 |
    77 | 78 | 79 | -------------------------------------------------------------------------------- /templates/viewer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
    21 | 22 | 23 |
    24 |
    25 | 26 | add 27 | 28 | 31 |
    32 |
    33 | {% for s in base_scripts %} 34 | 35 | {% endfor %} 36 | 37 | {% for s in page_scripts %} 38 | 39 | {% endfor %} 40 | 41 |
    42 | {% for n in notices %} 43 | {% if loop.index==1 %}

    {{n}}

    {% endif %} 44 | {% if loop.index!=1 %}
    {{n}}
    {% endif %} 45 | {% endfor %} 46 |
    47 | 48 |
    49 |
    50 | 51 |
    52 | 53 |
    54 | {% if filename_pdf %} 55 | View raw pdf file
    56 | {% endif %} 57 | View raw txt file 58 | {% if 'muni_bonds' in project %} 59 | 60 |
    61 | 62 | Calculate XIRR [BETA] 63 | 64 | 65 |
    66 | {% endif %} 67 |
    68 |
    69 | 70 | 71 | {% for table in tables %} 72 |
    73 | 74 | 75 |
    {{headers[loop.index-1]}}
    76 | 77 | lines {{meta_data[loop.index-1].begin_line}} to {{meta_data[loop.index-1].end_line}} 78 | Show/Hide 79 | {{ table|safe }} 80 |
    81 | {% endfor %} 82 |
83 | 84 | 96 | 97 | -------------------------------------------------------------------------------- /xirr_calc.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | #Calculations adapted from Marc Joffe, 2016 5 | 6 | import os 7 | import sys 8 | import json 9 | from backend import * 10 | from data_query import * 11 | 12 | import traceback 13 | import time 14 | 15 | from itertools import chain 16 | 17 | def calc_net_proceeds(table, first_cf_dict, log=None): 18 | v = get_key_values(table, first_cf_dict) 19 | #Avoid picking up discount twice 20 | if v['discount'] == v['underwriter_discount']: 21 | v['discount'] = 0.0 22 | 23 | #Avoid picking up underwriter discount if included in cost of issuance 24 | if v['underwriter_discount'] == v['cost_of_issuance']: 25 | v['underwriter_discount'] = 0.0 26 | 27 | if log: 28 | log.append("working with these values for calculating net proceeds: %s" % str(v)) 29 | if not (v['premium'] or v['discount'] or v['underwriter_discount']): 30 | log.append("Warning: neither a premium nor discounts found") 31 | 32 | net_proceeds_calc = + v['face_value'] \ 33 | + (v['premium'] or 0.) \ 34 | - (v['discount'] or 0.) \ 35 | - (v['underwriter_discount'] or 0.) \ 36 | - v['cost_of_issuance'] 37 | 38 | # Added by Marc 20160306 - Calculate and display cost of issuance and underwriter discount data 39 | total_cost_of_issuance = v['underwriter_discount'] + v['cost_of_issuance'] 40 | total_cost_of_issuance_pct_of_face = total_cost_of_issuance / v['face_value'] 41 | underwriter_discount_pct_of_face = v['underwriter_discount'] / v['face_value'] 42 | log.append("Underwriter Discount as Percent of Face Value: %s" % '{:5.4f}'.format(underwriter_discount_pct_of_face)) 43 | log.append("Total Cost of Issuance as Percent of Face Value: %s" % '{:5.4f}'.format(total_cost_of_issuance_pct_of_face)) 44 | log.append("Total Cost of Issuance (Including Underwriter Discount): %s" % '{:15,.2f}'.format(total_cost_of_issuance)) 45 | 46 | return net_proceeds_calc # (a toy worked example of this arithmetic follows the end of this file) 47 | 48 | #Todo: refactor into class 49 | debug_each_guess = True # Set to False to silence the per-guess output 50 | 51 | 52 | def newton(func, x0, fprime=None, args=(), tol=1.48e-8, maxiter=50): 53 | """Given a function of a single variable and a starting point, 54 | find a nearby zero using Newton-Raphson. 55 | 56 | fprime is the derivative of the function. If not given, the 57 | Secant method is used. 58 | 59 | # Source: http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.newton.html 60 | # File: scipy.optimize.minpack.py 61 | # License: BSD: http://www.scipy.org/License_Compatibility 62 | """ 63 | 64 | if fprime is not None: 65 | p0 = x0 66 | for iter in range(maxiter): 67 | myargs = (p0,)+args 68 | fval = func(*myargs) 69 | fpval = fprime(*myargs) 70 | if fpval == 0: 71 | print "Warning: zero-derivative encountered."
72 | return p0 73 | p = p0 - func(*myargs)/fprime(*myargs) 74 | if abs(p-p0) < tol: 75 | return p 76 | p0 = p 77 | else: # Secant method 78 | p0 = x0 79 | p1 = x0*(1+1e-4) 80 | q0 = func(*((p0,)+args)) 81 | q1 = func(*((p1,)+args)) 82 | for iter in range(maxiter): 83 | if q1 == q0: 84 | if p1 != p0: 85 | print "Tolerance of %s reached" % (p1-p0) 86 | return (p1+p0)/2.0 87 | else: 88 | p = p1 - q1*(p1-p0)/(q1-q0) 89 | if abs(p-p1) < tol: 90 | return p 91 | p0 = p1 92 | q0 = q1 93 | p1 = p 94 | q1 = func(*((p1,)+args)) 95 | raise RuntimeError, "Failed to converge after %d iterations, value is %s" % (maxiter,p) 96 | 97 | class xirr_calc(object): 98 | 99 | def __init__(self): 100 | self.guess_num = 0 101 | self.debug_each_guess = False 102 | self.guesses = [] 103 | 104 | def eir_func(self, rate, pmts, dates): 105 | """Loop through the dates and calculate a discounted cashflow total 106 | 107 | This is a simple process, but the debug messages clutter it up to 108 | make it seem more complex than it is. With the debug messages removed, 109 | it is very similar to eir_derivative_func, but with the EIR formula, 110 | rather than f'rate. (A standalone sketch of this discounting appears after the end of this file.) 111 | 112 | Credit: http://mail.scipy.org/pipermail/numpy-discussion/2009-May/042736.html 113 | """ 114 | 115 | # Globals used for debug printing 116 | 117 | print_debug_messages = self.debug_each_guess 118 | if rate not in self.guesses: 119 | self.guesses.append(rate) 120 | if print_debug_messages: 121 | print "-----------------------------------------------------------------------------------------------" 122 | print "Guess #%s: %s" % (self.guess_num, rate) 123 | print "" 124 | print " # DATE # DAYS CASHFLOW DISCOUNTED Formula: cf * (rate + 1)^(-days/365)" 125 | print " --------------------------------------------------------------------------------------------" 126 | self.guess_num += 1 127 | 128 | dcf=[] 129 | for i, cf in enumerate(pmts): 130 | d = dates[i] - dates[0] 131 | discounted_period = cf * (rate + 1)**(-d.days / 365.) 132 | dcf.append( discounted_period ) 133 | 134 | if print_debug_messages: 135 | cf = "%.2f" % cf 136 | cf = cf.rjust(9, " ") 137 | discounted_period = '%.8f' % discounted_period 138 | formula = '%s * ((%0.10f + 1)^(-%d /365)) ' % (cf, rate, d.days) 139 | discounted_period = discounted_period.rjust(15, " ") 140 | print " %2i %s %3.0d days %s %s =%s" % (i, dates[i], d.days, cf, discounted_period, formula ) 141 | 142 | discounted_cashflow = sum(dcf) 143 | 144 | if print_debug_messages: 145 | discounted_cashflow = "%.8f" % discounted_cashflow 146 | total = "total:".rjust(35, " ") 147 | print "%s %s" % (total, discounted_cashflow.rjust(15, " ")) 148 | print "" 149 | 150 | return discounted_cashflow 151 | 152 | def eir_derivative_func(rate, pmts, dates): 153 | """Find the derivative of the EIR function, used for calculating 154 | Newton's method: 155 | 156 | http://en.wikipedia.org/wiki/Newton's_method 157 | 158 | EIR = cf*(1+rate)^d 159 | f'rate = cf*d*(rate+1)^(d-1) 160 | 161 | Credit: http://mail.scipy.org/pipermail/numpy-discussion/2009-May/042736.html 162 | """ 163 | 164 | dcf=[] 165 | for i, cf in enumerate(pmts): 166 | d = dates[i] - dates[0] 167 | n = (-d.days / 365.)
168 | dcf.append( cf * n * (rate + 1)**(n - 1) ) 169 | return sum(dcf) 170 | 171 | def xirr(file_lines, funds_table, schedule_table): 172 | 173 | try: 174 | log = [] 175 | 176 | # Get due date 177 | due_date_query = 'deliver' 178 | log.append("Try fetching due date with first occurrence of fuzzy term: %s" % due_date_query) 179 | due_date, date_linenr, line_str = get_first_date(file_lines, due_date_query) 180 | 181 | log.append("... succeeded with date %s in line %i" % (str(due_date), date_linenr)) 182 | 183 | # Get first cash flow 184 | first_cf_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'], 185 | 'premium' : 'Issue Premium', 186 | 'discount': ['Issue Discount', 'Net Discount'], 187 | 'underwriter_discount' : 'Underwriter Discount', 'cost_of_issuance' : 'Costs of Issuance'} 188 | 189 | log.append("Try calculating first cashflow by fetching with these fuzzy terms: %s" % str(first_cf_dict.values())) 190 | net_proceeds = calc_net_proceeds(funds_table, first_cf_dict, log) 191 | log.append("... succeeded with first cashflow as net proceeds of %s" % '{:,.2f}'.format(net_proceeds)) 192 | 193 | # Get the rest of the time series 194 | payments_column = "Debt Service" 195 | log.append("Getting remaining time series by looking for first date column and a column of subtype 'dollar' named similar to '%s'" % payments_column) 196 | cf_time = chain( ((due_date, net_proceeds),) , 197 | ((d, -v) for d,v in filter_time_series(schedule_table, payments_column))) 198 | dates = {} 199 | payments = [] 200 | # Convert our sequence of dates and cashflows into random access iterables 201 | for i, cf_dt in enumerate(cf_time): 202 | date, cf = cf_dt[0], cf_dt[1] 203 | dates[i]=date 204 | payments.append(cf) 205 | log.append("... succeeded and yielded %i date / cashflow tuples" % len(payments)) 206 | 207 | except Exception as e: 208 | log.append("... failed with %s" % traceback.format_exception(*sys.exc_info())) 209 | return None, log 210 | 211 | # Begin Main Calculation 212 | guess = .05 213 | calculator = xirr_calc() 214 | 215 | maxiter=100 216 | timer_start = time.clock() 217 | if len(dates) > 1: 218 | f = lambda x: calculator.eir_func(x, payments, dates) 219 | derivative = lambda x: eir_derivative_func(x, payments, dates) 220 | try: 221 | rate = newton(f, guess, fprime=derivative, args=(), 222 | tol=0.00000000001, maxiter=maxiter) 223 | except RuntimeError: 224 | log.append("failed to converge after a maximum of %i iterations" % maxiter) 225 | return None, log 226 | 227 | timer_end = time.clock() 228 | # End Main Calculation 229 | 230 | elapsed_time = timer_end - timer_start 231 | final_rate = rate * 100 232 | 233 | if not calculator.debug_each_guess: 234 | log.append("") 235 | log.append('Cashflow and Dates') 236 | #log.append("-------------------------") 237 | for i, dte in enumerate(dates.values()): 238 | log.append("
%i | %s ... $ %s" % (i, str(dte), '{:,.2f}'.format(payments[i]))) 239 | 240 | log.append('Guesses Summary') 241 | 242 | for i, g in enumerate(calculator.guesses): 243 | log.append("%i guessed %0.10f" % (i + 1, g)) 244 | 245 | log.append("Calculation time: %s seconds" % elapsed_time) 246 | return final_rate, log 247 | 

--------------------------------------------------------------------------------