├── .bowerrc ├── .gitignore ├── .ipynb_checkpoints ├── GetMuniBondsTest-checkpoint.ipynb ├── TableParser-checkpoint.ipynb ├── TableParser2-checkpoint.ipynb ├── TableParser3-checkpoint.ipynb └── TableParser4-checkpoint.ipynb ├── LICENSE ├── README.md ├── backend.py ├── bower.json ├── bulk_processing ├── IRR_estimate.ipynb ├── bulk_proc.sh └── tf_calc.ipynb ├── data_query.py ├── deprecated ├── GetMuniBondData.cfg ├── GetMuniBondData.py ├── GetMuniBondsTest.ipynb ├── Procfile ├── TableParser.ipynb ├── TableParser.py ├── TableParser2.ipynb ├── TableParser2.py ├── TableParser3.ipynb ├── TableParser3.py ├── deploy_notebook.sh ├── manifest.yml ├── pdf2text_bulk.ipynb └── tableparser.py ├── design ├── 1_Home.jpg ├── 2_Show.jpg ├── 3_Browse.jpg ├── browse_similar_data_tables_feature.png └── screenshot_show_example.png ├── prototyping.ipynb ├── requirements.txt ├── server.py ├── static ├── TabulaRazr_Logo.png ├── center_for_municipal_finance_logo.png ├── css │ ├── main.css │ ├── source │ │ ├── index.html │ │ └── table-images │ │ │ ├── back.png │ │ │ ├── blurry.jpg │ │ │ ├── botleft.png │ │ │ ├── botright.png │ │ │ ├── gradback.png │ │ │ ├── gradhead.png │ │ │ ├── gradhover.png │ │ │ ├── header.jpg │ │ │ ├── left.png │ │ │ ├── pattern-head.png │ │ │ ├── pattern.gif │ │ │ ├── pattern.png │ │ │ ├── patternb-head.png │ │ │ ├── patternb.png │ │ │ └── right.png │ └── style.css ├── filters │ ├── funds.json │ └── maturity_schedule.json ├── scrutiny.png └── xirr_calculator.png ├── templates ├── filtered_project.html ├── index.html ├── inspector.html ├── view_filtered.html └── viewer.html └── xirr_calc.py /.bowerrc: -------------------------------------------------------------------------------- 1 | { 2 | "directory": "static/bower_components" 3 | } 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cf 2 | nohup.out 3 | guni.log 4 | static/ug/* 5 | .ipynb_checkpoints/ 6 | bulk_processing/.ipynb_checkpoints/ 7 | prototyping.py 8 | *.pyc 9 | static/bower_components/ 10 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/GetMuniBondsTest-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "skipping pdf/ER862677-ER674128-ER1075876.pdf, already exists.\n", 15 | "skipping pdf/EP849915-EP657701-EP1059361.pdf, already exists.\n", 16 | "skipping pdf/ER866175-ER676833-ER1078611.pdf, already exists.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "from __future__ import print_function\n", 22 | "import re\n", 23 | "import os\n", 24 | "import codecs\n", 25 | "import string\n", 26 | "\n", 27 | "#Convert all pdfs\n", 28 | "files = os.listdir('pdf')\n", 29 | "for i,f in enumerate(files):\n", 30 | "\n", 31 | " pdf_path = os.path.join('pdf', f)\n", 32 | " txt_path = os.path.join('txt', f+'.txt')\n", 33 | " \n", 34 | " if not os.path.isfile(txt_path):\n", 35 | " #Layout preservation crucial to maintain clues about tabular data\n", 36 | " cmd = \"pdftotext -layout %s %s\" % (pdf_path, txt_path)\n", 37 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 38 | " os.system(cmd)\n", 39 | " else:\n", 40 | " print ('skipping %s, already exists.' 
% (pdf_path, ))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 147, 46 | "metadata": { 47 | "collapsed": false, 48 | "scrolled": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 56 | "HEADER 1 $95,885,000\n", 57 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 58 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 59 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 60 | "HEADER 1 $6,645,000\n", 61 | "HEADER 2 CITY OF PALM SPRINGS\n", 62 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 63 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 64 | "HEADER 1 $19,560,000\n", 65 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 66 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 92-1\n", 67 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 68 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 69 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 70 | "TOTAL SOURCES $22,302,178.29\n", 71 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 72 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 73 | "TOTAL USES $22,302,178.29\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "#Existing Version\n", 79 | "for file in os.listdir('txt'):\n", 80 | " \n", 81 | " print (\"--------\" + file + \"--------\")\n", 82 | " \n", 83 | " printline = 0\n", 84 | " linesleft = 0\n", 85 | " blanklines = 0\n", 86 | " \n", 87 | " topfound = 0\n", 88 | " headerline = 0 \n", 89 | " \n", 90 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 91 | " for i, line in enumerate(f):\n", 92 | "\n", 93 | " strippedline = line.upper().strip()\n", 94 | "\n", 95 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 96 | " headerline = 1\n", 97 | " topfound = 1\n", 98 | "\n", 99 | " if 1 <= headerline <= 3:\n", 100 | " caption = \"HEADER \" + str(headerline)\n", 101 | " value = strippedline\n", 102 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 103 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 104 | " headerline = headerline + 1\n", 105 | " continue\n", 106 | "\n", 107 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 108 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 109 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 110 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 111 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 112 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 113 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\":\n", 114 | " printline = 1\n", 115 | " linesleft = 25\n", 116 | "\n", 117 | " if printline == 1:\n", 118 | " dollar_amount_regex = re.compile(\"[\\$]{0,1}[\\s]{0,6}[0-9,]{0,15}(\\.[0-9]{1,2})$\")\n", 119 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 120 | " if dollar_amount_match:\n", 121 | " caption = strippedline[:dollar_amount_match.start(0)].strip()\n", 122 | " value = strippedline[dollar_amount_match.start(0):].strip()\n", 123 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 124 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 125 | " if len(line.strip()) < 5 and linesleft < 10:\n", 126 | " blanklines = blanklines + 1\n", 127 | " linesleft = linesleft - 1\n", 128 | "\n", 129 | " if linesleft == 0:\n", 130 | " 
printline = 0" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "#Issues:\n", 142 | "## Doesn't pick up caption in EP1059361 --> add USES OF FUNDS but then no SOURCES OF PAYMENTS\n", 143 | "## Doesn't pick up line items in ER1075876 --> match sequences of .... to indicate tables as well, plus be more lenient with cents values\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 154, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 158 | "HEADER 1 $95,885,000\n", 159 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 160 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 161 | "PRINCIPAL AMOUNT $ 95,885,000\n", 162 | "BOND PREMIUM 12,984,339\n", 163 | "OTHER AVAILABLE FUNDS(1) 6,600,643 \n", 164 | "TOTAL SOURCES $115,469,982\n", 165 | "DEPOSIT TO ACQUISITION FUND $ 41,000,000\n", 166 | "RETIREMENT OF WATER REVENUE ANTICIPATION NOTES(2) 14,000,000\n", 167 | "DEPOSIT TO ESCROW FUND FOR REFUNDED 2008 BONDS 52,742,691\n", 168 | "DISCHARGE OF STATE LOAN 7,096,550 \n", 169 | "COSTS OF ISSUANCE(3) 630,741 \n", 170 | "TOTAL USES $115,469,982\n", 171 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 172 | "HEADER 1 $6,645,000\n", 173 | "HEADER 2 CITY OF PALM SPRINGS\n", 174 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 175 | "TRANSFER TO ESCROW BANK $6,086,693.08\n", 176 | "RESERVE FUND (1) 274,331.25\n", 177 | "COSTS OF ISSUANCE FUND (2) 152,404.72\n", 178 | "TOTAL USES $6,513,429.05\n", 179 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 180 | "HEADER 1 $19,560,000\n", 181 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 182 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 
92-1\n", 183 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 184 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 185 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 186 | "TOTAL SOURCES $22,302,178.29\n", 187 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 188 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 189 | "TOTAL USES $22,302,178.29\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "#New Version\n", 195 | "for file in os.listdir('txt'):\n", 196 | " \n", 197 | " print (\"--------\" + file + \"--------\")\n", 198 | " \n", 199 | " printline = 0\n", 200 | " linesleft = 0\n", 201 | " blanklines = 0\n", 202 | " \n", 203 | " topfound = 0\n", 204 | " headerline = 0 \n", 205 | " \n", 206 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 207 | " for i, line in enumerate(f):\n", 208 | "\n", 209 | " \n", 210 | " strippedline = line.upper().strip()\n", 211 | "\n", 212 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 213 | " headerline = 1\n", 214 | " topfound = 1\n", 215 | "\n", 216 | " if 1 <= headerline <= 3:\n", 217 | " caption = \"HEADER \" + str(headerline)\n", 218 | " value = strippedline\n", 219 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 220 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 221 | " headerline = headerline + 1\n", 222 | " continue\n", 223 | "\n", 224 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 225 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 226 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 227 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 228 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 229 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 230 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 231 | " or strippedline == \"ESTIMATED USES OF FUNDS\": #New\n", 232 | " printline = 1\n", 233 | " linesleft = 25\n", 234 | " #print (\"#### line:\", i, \"to\", i+linesleft)\n", 235 | "\n", 236 | " if printline == 1:\n", 237 | " #Include a minimum of preceding dots or whitespace\n", 238 | " #Group 1 = preceding whitespace\n", 239 | " #Group 2 = Dollar value\n", 240 | " #Group 3 = $Cents value if existing\n", 241 | " dollar_amount_regex = ur\"([\\.]{4,}|[\\s]{4,})[\\s]*\" + \\\n", 242 | " ur\"([\\$]{0,1}[\\s]{0,6}[0-9,]{2,15})(\\.[0-9]{1,2})?$\"\n", 243 | " dollar_amount_regex = re.compile(dollar_amount_regex)\n", 244 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 245 | " \n", 246 | " #Check whether we found something tabular and a dollar value\n", 247 | " if dollar_amount_match and dollar_amount_match.group(2):\n", 248 | " caption = strippedline[:dollar_amount_match.start(1)].strip()\n", 249 | " value = strippedline[dollar_amount_match.start(2):].strip()\n", 250 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 251 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 252 | " if len(line.strip()) < 5 and linesleft < 10:\n", 253 | " blanklines = blanklines + 1\n", 254 | " linesleft = linesleft - 1\n", 255 | "\n", 256 | " if linesleft == 0:\n", 257 | " printline = 0" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 150, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | 
"--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 272 | "27 - issuance of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 273 | "\n", 274 | "229 - ESTIMATED SOURCES AND USES OF FUNDS ................................................................................... 7\n", 275 | "\n", 276 | "370 - of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 277 | "\n", 278 | "653 - ESTIMATED SOURCES AND USES OF FUNDS\n", 279 | "\n", 280 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 281 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 282 | "223 - ESTIMATED SOURCES AND USES OF FUNDS .................................................................................. 13 \n", 283 | "\n", 284 | "429 - Bonds. See “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 285 | "\n", 286 | "715 - “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 287 | "\n", 288 | "983 - ESTIMATED SOURCES AND USES OF FUNDS\n", 289 | "\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "#Some exploration\n", 295 | "max_distance_below = 25\n", 296 | "max_distance_above = 5\n", 297 | "context_identifier = u\"SOURCES AND USES OF FUNDS|SOURCES AND USES OF FUNDS*|ESTIMATED SOURCES AND USES OF FUNDS|\" + \\\n", 298 | " \"ESTIMATED SOURCES AND USES OF FUNDS*|SOURCES AND USES OF FUNDS(1)|\" + \\\n", 299 | " \"ESTIMATED SOURCES AND USES OF FUNDS(1)|PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\"\n", 300 | "context_identifier = context_identifier.split(u\"|\")\n", 301 | "\n", 302 | "for file in os.listdir('txt'):\n", 303 | " \n", 304 | " print (\"--------\" + file + \"--------\")\n", 305 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 306 | " for i, line in enumerate(f):\n", 307 | " \n", 308 | " #Print Candidates\n", 309 | " id_found = reduce(lambda x,y: x or y, ( (id in line) for id in context_identifier ))\n", 310 | " if id_found:\n", 311 | " print(i, '-', line)\n", 312 | " " 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 2", 328 | "language": "python", 329 | "name": "python2" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 2 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython2", 341 | "version": "2.7.6" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 0 346 | } 347 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/TableParser4-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#TabulaRazr - specific to calculate - TABLE Parser\n", 12 | "#Infers a table with arbitrary number of columns from reoccuring patterns in text lines\n", 13 | "#(c) Alexander Hirner 2016, no redistribution without permission\n", 14 | "#Contributions: ____ (refactoring), UI styling (), ....\n", 15 | "\n", 16 | "\n", 17 | "#Main assumptions Table identificatin:\n", 18 | "#1) each row is either in one line or not a row at all\n", 19 | "#2) each column features at least one number (=dollar amount)\n", 20 | "#2a) 
each column features at least one date-like string [for time-series only]\n", 21 | "#3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] \n", 22 | "#4) each column is separated by more than x consecutive whitespace indicators (e.g. ' ' or '..')\n", 23 | "\n", 24 | "#Feature List Todo:\n", 25 | "#1) Acknowledge footnotes / make lower meta-data available\n", 26 | "#2) make delimiter length smartly dependent on number of columns (possible iterative approach)\n", 27 | "#3) improve captioning: expand non canonical values in tables [DONE] .. but not to the extent how types match up --> use this to further\n", 28 | "## delineate between caption and headers\n", 29 | "#4) UI: parameterize extraction on the show page on the fly\n", 30 | "#5) deeper type inference on token level: type complex [DONE], subtype header (centered, capitalized), \n", 31 | "## subtype page nr., type free flow [DONE, need paragraph]\n", 32 | "#5a) re\n", 33 | "#6) Respect negative values with potential '-' for numerical values\n", 34 | "#7)\n", 35 | "#8) classify tables with keywords (Muni Bonds) and unsupervised clustering (Hackathon)\n", 36 | "#9) Restructure folder and URI around MD5 hash (http://stackoverflow.com/questions/24570066/calculate-md5-from-werkzeug-datastructures-filestorage-without-saving-the-object)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from backend import *\n", 48 | "import os" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import json\n", 60 | "from flask import Flask, request, redirect, url_for, send_from_directory\n", 61 | "from werkzeug import secure_filename\n", 62 | "from flask import jsonify, render_template, make_response\n", 63 | "\n", 64 | "import matplotlib.pyplot as plt" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "TITLE = \"TabulaRazr (XIRR for muni_bonds)\"\n", 76 | "\n", 77 | "scripts = []\n", 78 | "css = [\n", 79 | " \"./bower_components/bootstrap/dist/css/bootstrap.min.css\",\n", 80 | " \"./css/main.css\",\n", 81 | " \"./css/style.css\"\n", 82 | "]\n", 83 | "\n", 84 | "\n", 85 | "UPLOAD_FOLDER = './static/ug'\n", 86 | "ALLOWED_EXTENSIONS = set(['txt', 'pdf'])\n", 87 | "\n", 88 | "TITLE = \"TabulaRazr\"\n", 89 | "\n", 90 | "app = Flask(__name__)\n", 91 | "app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER\n", 92 | "\n", 93 | "def get_extension(filename):\n", 94 | " return '.' 
in filename and \\\n",
95 | " filename.rsplit('.', 1)[1] \n",
96 | "\n",
97 | "def allowed_file(filename):\n",
98 | " return get_extension(filename) in ALLOWED_EXTENSIONS\n",
99 | "\n",
100 | "@app.route('/', methods=['GET', 'POST'])\n",
101 | "def upload_file():\n",
102 | "\n",
103 | " if request.method == 'POST':\n",
104 | " \n",
105 | " file = request.files['file']\n",
106 | " project = request.form['project']\n",
107 | " \n",
108 | " if file and allowed_file(file.filename):\n",
109 | " filename = secure_filename(file.filename)\n",
110 | " extension = get_extension(file.filename)\n",
111 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
112 | " \n",
113 | " file.save(os.path.join(app.config['UPLOAD_FOLDER'], project, filename))\n",
114 | " \n",
115 | " if extension == \"pdf\":\n",
116 | " txt_path = path+'.txt'\n",
117 | " filename += '.txt' \n",
118 | " if not os.path.isfile(txt_path):\n",
119 | " #Layout preservation crucial to preserve clues about tabular data\n",
120 | " cmd = \"pdftotext -enc UTF-8 -layout %s %s \" % (path, txt_path)\n",
121 | " os.system(cmd) \n",
122 | "\n",
123 | " return redirect(url_for('analyze', filename=filename, project=project))\n",
124 | "\n",
125 | " return render_template('index.html',\n",
126 | " title=TITLE ,\n",
127 | " css=css)\n",
128 | "\n",
129 | "@app.route('/analyze/<filename>', methods=['GET', 'POST'])\n",
130 | "def analyze(filename): \n",
131 | "\n",
132 | " project = request.args.get('project')\n",
133 | " txt_path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
134 | " \n",
135 | " if not os.path.isfile(txt_path):\n",
136 | " return {'error' : txt_path+' not found' }\n",
137 | " \n",
138 | " tables = return_tables(txt_path)\n",
139 | " \n",
140 | " #Export tables\n",
141 | " with codecs.open(txt_path + '.tables.json', 'w', \"utf-8\") as file:\n",
142 | " json.dump(tables, file)\n",
143 | "\n",
144 | " #Export chart\n",
145 | " lines_per_page = 80\n",
146 | " nr_data_rows = []\n",
147 | " #for t in tables.values():\n",
148 | " # print t\n",
149 | " for key, t in tables.iteritems():\n",
150 | " e = t['end_line']\n",
151 | " b = t['begin_line']\n",
152 | " for l in range(b, e):\n",
153 | " page = l / lines_per_page\n",
154 | " if len(nr_data_rows) <= page:\n",
155 | " nr_data_rows += ([0]*(page-len(nr_data_rows)+1))\n",
156 | " nr_data_rows[page] += 1\n",
157 | " dr = pd.DataFrame()\n",
158 | " dr['value'] = nr_data_rows\n",
159 | " dr['page'] = range(0, len(dr))\n",
160 | " \n",
161 | " #plot the row density\n",
162 | " chart = filename+\".png\"\n",
163 | " fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(8,3) ) # create figure & 1 axis\n",
164 | " ax.set_xlabel('page nr.')\n",
165 | " ax.set_ylabel('number of data rows')\n",
166 | " ax.set_title('Distribution of Rows with Data')\n",
167 | " ax.plot(dr['page'], dr['value'], )\n",
168 | " fig.savefig(txt_path + '.png') # save the figure to file\n",
169 | " plt.close(fig) # close the figure\n",
170 | "\n",
171 | " if request.method == 'POST':\n",
172 | " return json.dumps(tables)\n",
173 | " \n",
174 | " return redirect(url_for('uploaded_file', filename=filename, project=project))\n",
175 | " \n",
176 | "\n",
177 | "@app.route('/show/<filename>')\n",
178 | "def uploaded_file(filename):\n",
179 | "\n",
180 | " project = request.args.get('project') \n",
181 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
182 | " \n",
183 | " tables_path = path + '.tables.json'\n",
184 | " chart_path = path+\".png\"\n",
185 | " \n",
186 | " if not os.path.isfile(tables_path):\n",
187 | " analyze(path)\n",
188 | "\n",
189 | " with codecs.open(tables_path) as file:\n",
190 | " tables = json.load(file) \n",
191 | "\n",
192 | " #Create HTML\n",
193 | " notices = ['Extraction Results for ' + filename, 'Ordered by lines'] \n",
194 | " dfs = (table_to_df(table).to_html() for table in tables.values())\n",
195 | " headers = []\n",
196 | " for t in tables.values():\n",
197 | " if 'header' in t:\n",
198 | " headers.append(t['header'])\n",
199 | " else:\n",
200 | " headers.append('-')\n",
201 | " meta_data = [{'begin_line' : t['begin_line'], 'end_line' : t['end_line']} for t in tables.values()]\n",
202 | "\n",
203 | " return render_template('viewer.html',\n",
204 | " title=TITLE + ' - ' + filename,\n",
205 | " base_scripts=scripts, filename=filename, project=project,\n",
206 | " css=css, notices = notices, tables = dfs, headers=headers, meta_data=meta_data, chart=chart_path)\n",
207 | "\n",
208 | "@app.route('/inspector/<filename>')\n",
209 | "def inspector(filename):\n",
210 | " extension = 'txt'\n",
211 | " path = os.path.join(app.config['UPLOAD_FOLDER'], extension, filename)\n",
212 | " begin_line = int(request.args.get('data_begin'))\n",
213 | " end_line = int(request.args.get('data_end'))\n",
214 | " margin_top = config[\"meta_info_lines_above\"]\n",
215 | " margin_bottom = margin_top\n",
216 | " \n",
217 | " notices = ['showing data lines from %i to %i with %i meta-lines above and below' % (begin_line, end_line, margin_top)]\n",
218 | " with codecs.open(path, \"r\", \"utf-8\") as file:\n",
219 | " lines = [l.encode('utf-8') for l in file][begin_line - margin_top:end_line + margin_bottom]\n",
220 | " top_lines = lines[:margin_top]\n",
221 | " table_lines = lines[margin_top:margin_top+end_line-begin_line]\n",
222 | " bottom_lines = lines[margin_top+end_line-begin_line:]\n",
223 | " \n",
224 | " offset = begin_line-margin_top\n",
225 | " table_id = begin_line\n",
226 | " \n",
227 | " return render_template('inspector.html',\n",
228 | " title=TITLE,\n",
229 | " base_scripts=scripts, css=css, notices = notices, filename=filename, top_lines=top_lines, \n",
230 | " table_lines=table_lines, bottom_lines=bottom_lines, offset=offset, table_id=begin_line)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "name": "stderr",
242 | "output_type": "stream",
243 | "text": [
244 | "ERROR:__main__:Exception on / [POST]\n",
245 | "Traceback (most recent call last):\n",
246 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1817, in wsgi_app\n",
247 | " response = self.full_dispatch_request()\n",
248 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1477, in full_dispatch_request\n",
249 | " rv = self.handle_user_exception(e)\n",
250 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1381, in handle_user_exception\n",
251 | " reraise(exc_type, exc_value, tb)\n",
252 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1475, in full_dispatch_request\n",
253 | " rv = self.dispatch_request()\n",
254 | " File \"/Library/Python/2.7/site-packages/flask/app.py\", line 1461, in dispatch_request\n",
255 | " return self.view_functions[rule.endpoint](**req.view_args)\n",
256 | " File \"\", line 36, in upload_file\n",
257 | " path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)\n",
258 | "NameError: global name 'os' is not defined\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "def run_from_ipython():\n",
264 | " try:\n",
265 | " __IPYTHON__\n",
266 | " return True\n",
267 | " except NameError:\n",
268 | " return False\n",
269 | "\n",
270 | "if run_from_ipython():\n",
271 | " app.run(host='0.0.0.0', port = 8080)\n",
272 | "else:\n",
273 | " PORT = int(os.getenv('PORT', 8080))\n",
274 | " app.run(debug=True, host='0.0.0.0', port = PORT)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {
281 | "collapsed": true
282 | },
283 | "outputs": [],
284 | "source": []
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "Python 2",
290 | "language": "python",
291 | "name": "python2"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 2
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython2",
303 | "version": "2.7.10"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 0
308 | }
309 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # TabulaRazr
2 | **Extract and browse tabular data from legacy financial documents with ease**.
3 | 
4 | This repository is a partial release from prior work and the Top 5 submission at [DeveloperWeek 2016](http://accelerate.im/projects/362) ([video presentation](https://www.youtube.com/watch?v=Snqul2fJT5c)). The more elaborate version builds semantic links between tables to efficiently compare deals and aggregate otherwise disconnected knowledge from a large collection of documents.
5 | 
6 | Issues, forks and heavy usage welcome. Distributed under AGPL v3.
7 | 
8 | # Usage
9 | After uploading a `.txt` or `.pdf` document, all identified tables are presented as well as where they occur in the document.
10 | ![View on Document](/../xirr-specific/design/screenshot_show_example.png?raw=true "Municipal Bond of Flint")
11 | The screenshot shows a bond used to construct **public buildings in Jurupa's school district**, Riverside County.
12 | Additional information, such as inferred data types and positional features of table cells, is cached in `.json` files on the local filesystem.
13 | 
14 | Once the data is structured and annotated, it is relatively easy to automatically calculate domain-specific key figures. This customized version includes an experimental calculation of the [internal rate of return](http://www.investopedia.com/terms/i/irr.asp) for Municipal Bonds (a sketch of the computation is given at the end of this README). Often, auxiliary information such as unemployment rates is surfaced, which again can be used as a basis to aggregate hidden knowledge.
15 | 
16 | # Setup and run
17 | 
18 | ### Initial setup and run
19 | 
20 | npm install -g bower
21 | pip install -r requirements.txt
22 | bower install
23 | python server.py
24 | 
25 | 
26 | 
27 | Navigate to `http://localhost:7081` and upload an example document (see below).
28 | You may set the PORT environment variable to use a port other than 7081.
29 | 
30 | ### Updating
31 | 
32 | git pull
33 | pip install -r requirements.txt
34 | bower install
35 | 
36 | 
37 | 
38 | # Folder structure
39 | - /templates ... Jinja2 html templates
40 | - /static ... all stylesheets and media go there
41 | - /static/ug/ ... user uploaded data and analysis files (graphs, json)
42 | 
43 | # Example documents
44 | 
45 | One running instance with Municipal Bonds and other document categories lives at: http://tabularazr.eastus.cloudapp.azure.com:7081
46 | 
47 | | Document | Category |
48 | |----------|---------:|
49 | |**Municipal Bond of the City of Flint:** [Debt Service Schedule](http://tabularazr.eastus.cloudapp.azure.com:7081/show/muni_bonds/ER544111-ER421289-ER823264.pdf.txt#1581)|Municipal Bond|
50 | |**Deep Learning Paper:** [Empirical Findings](http://tabularazr.eastus.cloudapp.azure.com:7081/show/_other/sentence_entailment_attention_LSTM.pdf.txt)|other|
51 | |**Annual Report Bosch 2014:** [Sales Figures](http://tabularazr.eastus.cloudapp.azure.com:7081/show/business_reports/Bosch_Annual_Report_2014_Financial_Report.pdf.txt#2238)|Business Report|
52 | |**Annual Report Oakland:** [Income per Sector from 2006 to 2010](http://tabularazr.eastus.cloudapp.azure.com:7081/show/muni_bonds/ER544111-ER421289-ER823264.pdf.txt#3533)|(Business) Report|
53 | |**EY's Biotech Report 2015:** [Europe's Top IPOs in 2014](http://tabularazr.eastus.cloudapp.azure.com:7081/show/business_reports/EY-beyond-borders-2015.pdf.txt#2946)|Business Report|
54 | 
55 | # Other documents
56 | Choose any financial document, research paper or annual report to upload yourself, or browse these sources.
57 | 
58 | ### Example pdfs from public data (municipal bonds, audit reports, financial reviews)
59 | 
60 | - http://emma.msrb.org/EP753324-ER508056-ER910760.pdf
61 | - http://emma.msrb.org/EP407966-EP321048-EP717328.pdf
62 | - http://emma.msrb.org/ER544111-ER421289-ER823264.pdf (very high cost of issuance)
63 | - http://emma.msrb.org/MS132788-MS108096-MD209140.pdf (1997 bond issue)
64 | 
65 | #### Works with XIRR calculation feature
66 | 
67 | These documents can be successfully processed by the XIRR feature:
68 | 
69 | - http://emma.msrb.org/ER588705-ER457598-ER860368.pdf
70 | 
71 | ### Other documents that may be of interest:
72 | 
73 | - https://treas-secure.state.mi.us/LAFDocSearch/tl41R01.aspx?&lu_id=1349&doc_yr=2015&doc_code=AUD (2015 Audit)
74 | - https://treas-secure.state.mi.us/LAFDocSearch/tl41R01.aspx?&lu_id=1349&doc_yr=2014&doc_code=AUD (2014 Audit)
75 | - http://www.michigan.gov/documents/treasury/Flint-ReviewTeamReport-11-7-11_417437_7.pdf (Review Team Report used to determine that the city faced a financial emergency)
-------------------------------------------------------------------------------- /backend.py: --------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import re
4 | 
5 | import codecs
6 | import string
7 | 
8 | from collections import Counter, OrderedDict
9 | 
10 | config = { "min_delimiter_length" : 3, "min_columns": 2, "min_consecutive_rows" : 3, "max_grace_rows" : 4,
11 | "caption_assign_tolerance" : 15.0, "meta_info_lines_above" : 10, "count_ws_header" : False,
12 | "threshold_caption_extension" : 0.45, "number_compatibility" : 0.5,
13 | "header_good_candidate_length" : 3, "complex_leftover_threshold" : 3, "min_canonical_rows" : 0.1,
14 | "min_fuzzy_ratio" : 0.75 }
15 | 
16 | import numpy as np
17 | import pandas as pd
18 | 
19 | ### Tokenize and Tag
20 | 
21 | #Regex tester online: https://regex101.com
22 | #Contrast with Basic table parsing capabilities of http://docs.astropy.org/en/latest/io/ascii/index.html
23 | 
24 | tokenize_pattern = ur"[.]{%i,}|[\ \$]{%i,}|" % ((config['min_delimiter_length'],)*2)
25 | tokenize_pattern = ur"[.\ \$]{%i,}" % (config['min_delimiter_length'],)
26 | footnote_indicator = ur"[^,_!a-zA-Z0-9.]"
27 | 
28 | column_pattern = OrderedDict()
29 | #column_pattern['large_num'] = ur"\d{1,3}(,\d{3})*(\.\d+)?"
30 | column_pattern['large_num'] = ur"(([0-9]{1,3})(,\d{3})+(\.[0-9]{2})?)"
31 | column_pattern['small_float'] = ur"[0-9]+\.[0-9]+"
32 | column_pattern['integer'] = ur"^\s*[0-9]+\s*$"
33 | #column_pattern['delimiter'] = "[_=]{6,}"
34 | #column_pattern['other'] = ur"([a-zA-Z0-9]{2,}\w)"
35 | column_pattern['other'] = ur".+"
36 | 
37 | subtype_indicator = OrderedDict()
38 | subtype_indicator['dollar'] = ur".*\$.*"
39 | subtype_indicator['rate'] = ur"[%]"
40 | #enter full set of date patterns here if we want refinement early on
41 | subtype_indicator['year'] = ur"(20[0-9]{2})|(19[0-9]{2})"
42 | 
43 | 
44 | 
45 | import dateutil.parser as date_parser
46 | from datetime import date
47 | #Implement footnote handling from leftovers
48 | def tag_token(token, ws):
49 | for t, p in column_pattern.iteritems():
50 | result = re.search(p, token)
51 | if result:
52 | leftover = token[:result.start()], token[result.end():]
53 | lr = "".join(leftover)
54 | value = token[result.start():result.end()]
55 | 
56 | if len(lr) >= config['complex_leftover_threshold']:
57 | return "complex", "unknown", token, leftover
58 | elif len(lr) == 0:
59 | leftover = None
60 | 
61 | subtype = "none"
62 | #First match on left-overs
63 | for sub, indicator in subtype_indicator.iteritems():
64 | if re.match(indicator, lr): subtype = sub
65 | #Only if no indicator matched there, try on full token
66 | if subtype == "none":
67 | for sub, indicator in subtype_indicator.iteritems():
68 | if re.match(indicator, token): subtype = sub
69 | #Only if no indicator matched again, try on whitespace
70 | if subtype == "none":
71 | for sub, indicator in subtype_indicator.iteritems():
72 | if re.match(indicator, ws): subtype = sub
73 | 
74 | if subtype == "none" and t == "other":
75 | #No leftovers possible because fuzzy_token not implemented despite documented
76 | today = date.today()
77 | v_ascii = value.encode("ascii", errors="ignore")
78 | try:
79 | dt = date_parser.parse(v_ascii, fuzzy=True, default=today)
80 | if dt != today:
81 | return t, "date", value, leftover
82 | except:
83 | pass
84 | 
85 | return t, subtype, value, leftover
86 | #Try date at last:
87 | 
88 | return "unknown", "none", token, ""
89 | 
90 | def row_feature(line):
91 | matches = re.finditer(tokenize_pattern, line)
92 | start_end = [ (match.start(), match.end()) for match in matches]
93 | #No delimiter found so it's free flowing text, i.e. part of a paragraph
94 | if len(start_end) < 1:
95 | if len(line) == 0:
96 | return ()
97 | else:
98 | return [{'start' : 0, 'value' : line, 'type' : 'freeflow', 'subtype' : 'none'}]
99 | 
100 | tokens = re.split(tokenize_pattern, line)
101 | if tokens[0] == "":
102 | tokens = tokens[1:]
103 | else:
104 | start_end = [(0,0)] + start_end
105 | 
106 | features = []
107 | for se, token in zip(start_end, tokens):
108 | t, subtype, value, leftover = tag_token(token, line[se[0]:se[1]])
109 | feature = {"start" : se[1], "value" : value, "type" : t, "subtype" : subtype}
110 | if leftover: feature["leftover"] = leftover
111 | features.append(feature)
112 | return features
113 | 
114 | 
115 | #Establish whether the number of columns reaches the minimum and whether there is at least one number
116 | def row_qualifies(row):
117 | return row is not None and len(row) >= config['min_columns'] and \
118 | sum( 1 if c['type'] in ['large_num', 'small_float', 'integer'] else 0 for c in row) > 0
119 | 
120 | 
121 | ### Scope tables
122 | 
123 | #Non-qualified rows trigger a consistency check but are tolerated for up to max_grace_rows (whitespace, breakline, junk)
124 | def filter_row_spans_new(row_features, row_qualifies=row_qualifies):
125 | 
126 | min_consecutive = config["min_consecutive_rows"]
127 | grace_rows = config['max_grace_rows']
128 | 
129 | last_qualified = None
130 | consecutive = 0
131 | underqualified = 0
132 | consistency_check = False
133 | i = 0
134 | 
135 | for j, row in enumerate(row_features):
136 | qualifies = row_qualifies(row)
137 | if consistency_check:
138 | #print "BENCHMARKING %s AGAINST:" % row_to_string(row), row_to_string(row_features[last_qualified], 'type')
139 | if not row_type_compatible(row_features[last_qualified], row):
140 | qualifies = False
141 | consistency_check = False
142 | #print qualifies, row_to_string(row)
143 | 
144 | if qualifies:
145 | if last_qualified is None:
146 | last_qualified = i
147 | consecutive = 1
148 | else:
149 | consecutive += 1
150 | else:
151 | underqualified += 1
152 | if underqualified > grace_rows:
153 | if consecutive >= min_consecutive:
154 | #TODO: do post splitting upon type check and benchmark
155 | #print "YIELDED from", last_qualified, "to", i-underqualified+1
156 | yield last_qualified, i-underqualified+1
157 | 
158 | last_qualified = None
159 | consecutive = 0
160 | underqualified = 0
161 | consistency_check = False
162 | else:
163 | if last_qualified:
164 | consistency_check = True
165 | #print i, last_qualified, consecutive, consistency_check, row_to_string(row)
166 | i += 1
167 | 
168 | if consecutive >= min_consecutive:
169 | yield last_qualified, i-underqualified
170 | 
171 | def row_to_string(row, key='value', sep='|'):
172 | return sep.join(c[key] for c in row)
173 | 
174 | def row_type_compatible(row_canonical, row_test):
175 | #Test whether to break because types differ too much
176 | no_fit = 0.0
177 | for c in row_test:
178 | dist = (abs(c['start']-lc['start']) for lc in row_canonical)
179 | val, idx = min((val, idx) for (idx, val) in enumerate(dist))
180 | if c['type'] != row_canonical[idx]['type']:
181 | no_fit += 1.0
182 | number = ('large_num', 'small_float', 'integer')
183 | if c['type'] in number and row_canonical[idx]['type'] in number:
184 | no_fit -= config["number_compatibility"]
185 | 
186 | fraction_no_fit = no_fit / float(len(row_test))
187 | #print "test row", row_to_string(row_test), ") against types (", row_to_string(row_canonical, 'type'), ") has %f unmatching types" % fraction_no_fit, "yields", fraction_no_fit <
config["threshold_caption_extension"] 188 | return fraction_no_fit < config["threshold_caption_extension"] 189 | 190 | def filter_row_spans(row_features, row_qualifies): 191 | 192 | min_consecutive = config["min_consecutive_rows"] 193 | grace_rows = config['max_grace_rows'] 194 | 195 | last_qualified = None 196 | consecutive = 0 197 | underqualified = 0 198 | underqualified_rows = [] #Tuples of row number and the row 199 | 200 | i = 0 201 | 202 | for j, row in enumerate(row_features): 203 | if row_qualifies(row): 204 | underqualified = 0 205 | if last_qualified is None: 206 | last_qualified = i 207 | consecutive = 1 208 | else: 209 | consecutive += 1 210 | else: 211 | underqualified += 1 212 | underqualified_rows.append((j, row) ) 213 | if underqualified > grace_rows: 214 | if consecutive >= min_consecutive: 215 | yield last_qualified, i-underqualified+1 216 | 217 | last_qualified = None 218 | consecutive = 0 219 | underqualified = 0 220 | #print i, underqualified, last_qualified, consecutive#, "" or row 221 | i += 1 222 | 223 | if consecutive >= min_consecutive: 224 | yield last_qualified, i-underqualified 225 | 226 | ### Structure and convert tables 227 | 228 | 229 | def row_to_string(row, key='value', sep='|'): 230 | return sep.join(c[key] for c in row) 231 | 232 | 233 | def readjust_cols(feature_row, slots): 234 | 235 | feature_new = [{'value' : 'NaN'}] * len(slots) 236 | for v in feature_row: 237 | dist = (abs((float(v['start'])) - s) for s in slots) 238 | val , idx = min((val, idx) for (idx, val) in enumerate(dist)) 239 | if val <= config['caption_assign_tolerance']: feature_new[idx] = v 240 | 241 | return feature_new 242 | 243 | 244 | def normalize_rows(rows_in, structure): 245 | slots = [c['start'] for c in structure] 246 | nrcols = len(structure) 247 | 248 | for r in rows_in: 249 | if len(r) != nrcols: 250 | if len(r)/float(nrcols) > config['threshold_caption_extension']: 251 | yield readjust_cols(r, slots) 252 | else: 253 | yield r 254 | 255 | #TODO: make side-effect free 256 | def structure_rows(row_features, meta_features): 257 | #Determine maximum nr. 
of columns 258 | lengths = Counter(len(r) for r in row_features) 259 | nrcols = config['min_columns'] 260 | for l in sorted(lengths.keys(), reverse=True): 261 | nr_of_l_rows = lengths[l] 262 | if nr_of_l_rows/float(len(row_features)) > config['min_canonical_rows']: 263 | nrcols = l 264 | break 265 | 266 | canonical = filter(lambda r: len(r) == nrcols , row_features) 267 | 268 | #for c in canonical: print len(c), row_to_string(c) 269 | 270 | structure = [] 271 | for i in range(nrcols): 272 | col = {} 273 | col['start'] = float (sum (c[i]['start'] for c in canonical )) / len(canonical) 274 | 275 | types = Counter(c[i]['type'] for c in canonical) 276 | col['type'] = types.most_common(1)[0][0] 277 | subtypes = Counter(c[i]['subtype'] for c in canonical if c[i]['subtype'] != "none") 278 | subtype = "none" if len(subtypes) == 0 else subtypes.most_common(1)[0][0] 279 | col['subtype'] = subtype 280 | structure.append(col) 281 | 282 | #Test how far up the types are compatible and by that are data vs caption 283 | for r in row_features: 284 | #if r in canonical: 285 | if len(r) and row_type_compatible(structure, r): 286 | break 287 | else: 288 | meta_features.append(r) 289 | row_features.remove(r) 290 | 291 | meta_features.reverse() 292 | #for m in meta_features: print "META", row_to_string(m) 293 | 294 | captions = [''] * nrcols 295 | single_headers = [] 296 | #latest_caption_len = 1 297 | fill_up_captions_blocked = False 298 | caption_begin = False 299 | slots = [c['start'] for c in structure] 300 | first_captions = [] 301 | for mf in meta_features: 302 | #if we have at least one token as data and the closest tokens have not been exhausted yet for captions, consider them 303 | nr_meta_tokens = len(mf) 304 | if nr_meta_tokens > 0 and not fill_up_captions_blocked: 305 | #Find closest slot the caption could fit 306 | #TODO = allow doubling of captions if it is centered above more than one slot 307 | for c in mf: 308 | dist = (abs((float(c['start'])) - s) for s in slots) 309 | dists = sorted((val, idx) for (idx, val) in enumerate(dist)) 310 | val, idx = dists[0] 311 | #val2, idx2 = val, idx 312 | span_len = len(c['value']) 313 | if len(dists) > 1: 314 | val2, idx2 = dists[1] 315 | span_len = int(abs(val2-val)) 316 | #print span_len, row_to_string(mf) 317 | if val <= config['caption_assign_tolerance'] and (len(c['value']) - span_len) < config['caption_assign_tolerance']: 318 | captions[idx] = c['value'] + ' ' + captions[idx] 319 | if idx == 0: first_captions.append(c['value']) 320 | caption_begin = True 321 | else: 322 | single_headers.append(c['value']) 323 | fill_up_captions_blocked = True 324 | #latest_caption_len = nr_meta_tokens 325 | 326 | #In case of blank line, test if adding more captions should be locked 327 | else: 328 | if caption_begin: 329 | fill_up_captions_blocked = True 330 | #Accept both orphan tokens and freeflow text as headers 331 | #Todo: make separate data field for freeflow 332 | if nr_meta_tokens == 1 : single_headers.append(mf[0]['value']) 333 | 334 | #If all meta features were aggregated into the first column caption, consider the table to have no column labels 335 | if len(single_headers) == 0 and len(captions[0]) and sum(len(c) for c in captions[1:]) == 0: 336 | single_headers = first_captions 337 | captions[0] = u'' 338 | 339 | #Assign captions as the value in structure 340 | for i, c in enumerate(captions): 341 | structure[i]['value'] = c 342 | #Expand all the non canonical rows with NaN values (Todo: if types are very similar) 343 | normalized_data = [r for r in 
normalize_rows(row_features, structure)]
344 | 
345 | return structure, normalized_data, single_headers
346 | 
347 | 
348 | def convert_to_table(rows, b, e, above):
349 | table = {'begin_line' : b, 'end_line' : e, 'meta_begin_line' : b-above}
350 | 
351 | data_rows = rows[b:e]
352 | meta_rows = rows[b-above:b]
353 | 
354 | structure, data, headers = structure_rows(data_rows, meta_rows)
355 | 
356 | captions = [col['value'] for col in structure]
357 | 
358 | table['captions'] = captions
359 | table['data'] = data
360 | table['headers'] = headers
361 | table['types'] = [col['type'] if 'type' in col else "NaN" for col in structure]
362 | table['subtypes'] = [col['subtype'] if 'subtype' in col else "NaN" for col in structure]
363 | return table
364 | 
365 | def indexed_tables_from_rows(row_features):
366 | 
367 | #Uniquely identify tables by their first row
368 | tables = OrderedDict()
369 | last_end = 0
370 | for b,e in filter_row_spans(row_features, row_qualifies):
371 | #Slice out the next table and limit the context rows to have no overlaps
372 | #Todo: manage the lower meta lines
373 | max_lines_above = config['meta_info_lines_above']
374 | if not config["count_ws_header"]:
375 | meta_counter = 0
376 | above = 0
377 | while meta_counter < max_lines_above and (above <= b-last_end):
378 | mf = row_features[b-above]
379 | if len(mf): meta_counter += 1
380 | above += 1
381 | max_lines_above = above
382 | tables[b] = convert_to_table(row_features, b, e, min(max_lines_above, b-last_end))
383 | last_end = tables[b]['end_line']
384 | return tables
385 | 
386 | def return_tables(txt_path):
387 | 
388 | #Uniquely identify tables by their first row
389 | tables = OrderedDict()
390 | 
391 | with codecs.open(txt_path, "r", "utf-8") as f:
392 | lines = [l.replace(u'\n', '').replace(u'\r', '') for l in f]
393 | rows = [row_feature(l) for l in lines]
394 | 
395 | return indexed_tables_from_rows(rows)
396 | 
397 | def table_to_df(table):
398 | df = pd.DataFrame()
399 | for i in range(len(table['captions'])):
400 | values = []
401 | for r in table['data']:
402 | values.append(r[i]['value'])
403 | df[i] = values
404 | df.columns = table['captions']
405 | return df
406 | 
407 | 
19 | "import sys\n", 20 | "import glob\n", 21 | "import codecs\n", 22 | "import json\n", 23 | "\n", 24 | "import string\n", 25 | "\n", 26 | "sys.path.insert(0, os.path.pardir)\n", 27 | "\n", 28 | "from backend import *\n", 29 | "from data_query import *\n", 30 | "\n", 31 | "UPLOAD_FOLDER = os.path.join('..', 'static', 'ug')\n", 32 | "FILTER_FOLDER = os.path.join('..', 'static', 'filters')\n", 33 | "PROJECT = 'muni_bonds_bulk_2'\n", 34 | "FILTER = 'funds'\n", 35 | "\n", 36 | "path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')\n", 37 | "table_files = glob.glob(path)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "Processing with filter {u'headers': {u'threshold': 0.35, u'terms': [u'USES OF FUNDS']}, u'name': u'Estimated use and sources of funds'}\n", 52 | "Procssing with value dictionary {'underwriter_discount': 'Underwriter Discount', 'premium': 'Issue Premium', 'premium_or_discount': 'Premium Discount', 'discount': 'Issue Discount', 'face_value': ['Principal Amount', 'Par Amount', 'Face Amount'], 'cost_of_issuance': 'Costs of Issuance'}\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "def clean_string(s):\n", 58 | " lc = s.encode('ascii', errors='ignore').lower()#.translate(remove_punctuation_map)\n", 59 | " return lc.translate(None, string.punctuation + '0123456789').strip()\n", 60 | " \n", 61 | "from collections import Counter\n", 62 | "\n", 63 | "table_counter = Counter()\n", 64 | "tables_looked_at = 0\n", 65 | "confidences = []\n", 66 | "no_table_files = []\n", 67 | "no_ud_tables = []\n", 68 | "no_fv_tables = []\n", 69 | "funny_tables = {}\n", 70 | "\n", 71 | "salient_values = {}\n", 72 | "\n", 73 | "# Get those line items sufficient for IRR estimation\n", 74 | "# remark: improved query terms from TF analysis and annotation\n", 75 | "irr_estimate_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'], \n", 76 | " 'premium' : 'Issue Premium',\n", 77 | " 'discount': 'Issue Discount',\n", 78 | " 'premium_or_discount' : 'Premium Discount', #will match line items that signify either at high confidence on the token level\n", 79 | " 'underwriter_discount' : 'Underwriter Discount',\n", 80 | " 'cost_of_issuance' : 'Costs of Issuance'}\n", 81 | "\n", 82 | "filter_file = os.path.join(FILTER_FOLDER, FILTER+'.json')\n", 83 | "with codecs.open(filter_file, \"r\", \"utf-8\", errors=\"replace\") as file:\n", 84 | " _filter = json.load(file) \n", 85 | "\n", 86 | "print (\"Processing with filter %s\" % str(_filter))\n", 87 | "print (\"Procssing with value dictionary %s\" % str(irr_estimate_dict))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": false, 95 | "scrolled": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "#Get all tables\n", 100 | "for i,f in enumerate(table_files):\n", 101 | "\n", 102 | " with codecs.open(f, 'r', 'utf-8') as file:\n", 103 | " tables = json.load(file)\n", 104 | " tables_looked_at += len(tables)\n", 105 | " \n", 106 | " filename = f.split(r'/')[-1].replace('.tables.json', '')\n", 107 | " \n", 108 | " filter_results = []\n", 109 | " for t in filter_tables(tables.values(), _filter):\n", 110 | " if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results):\n", 111 | " filter_results.append(t)\n", 112 | " \n", 113 | " table_counter[len(filter_results)] += 1 \n", 114 | " if len(filter_results):\n", 
115 | "\n", 116 | " #Only keep first one\n", 117 | " confidence, table, _, _ = max( sorted( filter_results, key = lambda t: t[1]['begin_line'] ), \n", 118 | " key = lambda t: t[0])\n", 119 | " confidences.append(confidence)\n", 120 | " if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':\n", 121 | " funny_tables[filename] = table['begin_line']\n", 122 | " \n", 123 | " else:\n", 124 | " values = get_key_values(table, irr_estimate_dict, raw_cell=True)\n", 125 | " #invert line item if in brackets\n", 126 | " if values['premium_or_discount']:\n", 127 | " r = values['premium_or_discount'][1]\n", 128 | " if 'leftover' in r and '(' in r['leftover'][0] and ')' in r['leftover'][1]:\n", 129 | " values['premium_or_discount'][0] = values['premium_or_discount'][0]\n", 130 | " \n", 131 | " #strip raw rows\n", 132 | " values = {k : (v[0] if v else None) for k,v in values.iteritems()}\n", 133 | " key = filename+'#'+str(table['begin_line'])\n", 134 | " \n", 135 | " if not values['face_value']: \n", 136 | " no_fv_tables.append(key)\n", 137 | "\n", 138 | " if not values['underwriter_discount']: \n", 139 | " no_ud_tables.append(key)\n", 140 | " \n", 141 | " #maybe problem with ordering guarantee\n", 142 | " salient_values[key] = values.values()\n", 143 | "\n", 144 | " else:\n", 145 | " no_table_files.append(filename)\n", 146 | " \n", 147 | " if ( (i+1) % 100 ) == 0:\n", 148 | " print (\"%i files and %i tables processed... with %i best matches\" % \\\n", 149 | " (i+1, tables_looked_at, len(confidences)))\n", 150 | "\n", 151 | " \n", 152 | "results = {'high_confidence_candidates' : table_counter.most_common(),\n", 153 | " 'tables_looked_at' : tables_looked_at,\n", 154 | " 'tables_canonical' : len(confidences),\n", 155 | " 'confidence_mean' : sum(confidences) / len(confidences),\n", 156 | " 'confidences' : confidences, \n", 157 | " 'no_table_files' : no_table_files,\n", 158 | " 'no_ud_tables' : no_ud_tables,\n", 159 | " 'no_fv_tables' : no_fv_tables,\n", 160 | " 'funny_tables' : funny_tables,\n", 161 | " 'salient_values' : salient_values\n", 162 | " }" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "#Save intermediate results\n", 174 | "with codecs.open(\"IRR_estimate.results.json\", \"w\", \"utf-8\") as file:\n", 175 | " json.dump(results, file)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "#Work from intermediate results\n", 187 | "with codecs.open(\"IRR_estimate.results.json\", \"r\", \"utf-8\") as file:\n", 188 | " results = json.load(file)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "import xlwt\n", 200 | "\n", 201 | "bold = xlwt.Style.easyxf(\"font: bold on\")\n", 202 | "\n", 203 | "def write_table(sheet, keys, values, row, c_offset = 0, column_style = bold):\n", 204 | " for j, k in enumerate(keys):\n", 205 | " sheet.write(row, c_offset+j, k, column_style)\n", 206 | " row += 1\n", 207 | " for v in values:\n", 208 | " for j, vv in enumerate(v):\n", 209 | " sheet.write(row, c_offset+j, vv)\n", 210 | " row +=1\n", 211 | " return row" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 10, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | 
"source": [ 222 | "url_prefix = \"http://tabularazr.eastus.cloudapp.azure.com:7081/show/\"+PROJECT+'/'" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 11, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "def to_xls_url(url, link = None):\n", 234 | " f = 'HYPERLINK(\"'+url+'\"' + ('; \"'+link+'\")' if link else ')')\n", 235 | " return xlwt.Formula(f)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "wkb = xlwt.Workbook(encoding='utf-8')\n", 247 | "s_summary, s_funding_values, s_confidence, s_no_table, s_no_fv_tables, s_no_ud_tables, s_funny_tables = \\\n", 248 | " (wkb.add_sheet(s) for s in ['summary', 'funding_values', 'confidence', 'no_table', \n", 249 | " 'no_face_value_tables', 'no_underwriter_discount_tables', 'funny_tables'])" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 13, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "i = 0\n", 261 | "s_summary.write(i,0, 'Filter used', bold)\n", 262 | "s_summary.write(i,1, str(_filter))\n", 263 | "i+=1\n", 264 | "s_summary.write(i,0, 'Value extraction dictionary used', bold)\n", 265 | "s_summary.write(i,1, str(irr_estimate_dict))\n", 266 | "i+=2\n", 267 | "s_summary.write(i,0, 'Distribution of good table matches per document', bold)\n", 268 | "i+=1\n", 269 | "i = write_table(s_summary, ['Nr. of Table Candidates', 'Nr. of Documents'], \n", 270 | " results[\"high_confidence_candidates\"], i)\n", 271 | "\n", 272 | "i+=1\n", 273 | "s_summary.write(i, 2, 'Total nr. of Table Candidates')\n", 274 | "s_summary.write(i, 3, 'out of..')\n", 275 | "i+=1\n", 276 | "s_summary.write(i, 2, results['tables_canonical'])\n", 277 | "s_summary.write(i, 3, results['tables_looked_at'])\n", 278 | "\n", 279 | "i = write_table(s_confidence, ['Confidence in best Table found'], ([c] for c in results['confidences']), 0)\n", 280 | "i = write_table(s_no_table, ['Files with no suitable table found', 'URL'], \n", 281 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_table_files'] ), 0)\n", 282 | "i = write_table(s_no_ud_tables, ['Tables with no Underwriter Discount found', 'URL'], \n", 283 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_ud_tables'] ), 0)\n", 284 | "i = write_table(s_no_fv_tables, ['Tables with no Face Value found', 'URL'], \n", 285 | " ( ([c], to_xls_url(url_prefix+c)) for c in results['no_fv_tables'] ), 0)\n", 286 | "\n", 287 | "\n", 288 | "s_funny_tables.write(0,4, \"[as returned by filter but with <> 2 rows, and/or no $ value in the 2nd column]\")\n", 289 | "i = write_table(s_funny_tables, ['Funny Tables in File', 'Table ID', 'URL'], \n", 290 | " ( ( f, t, to_xls_url(url_prefix+f+'#'+str(t)) ) for f, t in results['funny_tables'].iteritems() ), 0)\n", 291 | "\n", 292 | "header_funding_values = ['Filename/Table', 'URL'] + irr_estimate_dict.keys()\n", 293 | "i = write_table(s_funding_values, header_funding_values, \n", 294 | " (( [k, to_xls_url(url_prefix+k)] + v) for k, v in results['salient_values'].iteritems()), 0)\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 14, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "wkb.save('IRR_estimate.results.xls')" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 2", 312 | "language": 
"python", 313 | "name": "python2" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 2 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython2", 325 | "version": "2.7.10" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 0 330 | } 331 | -------------------------------------------------------------------------------- /bulk_processing/bulk_proc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #FILES = regex for folder 4 | #PROJECT = project for POST request 5 | 6 | for f in $FILES 7 | do 8 | echo "Processing $f ..." 9 | curl -X POST -H "Content-Type: application/json" http://localhost:7081/analyze/"$PROJECT"/"$f" > /dev/null 10 | echo "Done" 11 | 12 | done 13 | echo "FINISHED" 14 | -------------------------------------------------------------------------------- /data_query.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import dateutil.parser as date_parser 3 | 4 | from backend import config 5 | from fuzzywuzzy import fuzz 6 | 7 | from itertools import product 8 | 9 | # Cascades: 10 | # 1) case sensitive partial ratio on character level with penalty 11 | # 2) case insensitive partial ratio on character level with penalty 12 | # 3) token sorted case insensitive ratio with penalty 13 | FUZZY_INV_CASCADES = 1.0 / 3.0 14 | def fuzzy_str_match(query, string): 15 | 16 | score = 1.0 17 | inv_cascades = FUZZY_INV_CASCADES 18 | min_fuzzy_ratio = config["min_fuzzy_ratio"] 19 | 20 | query = query.encode('ascii', errors='ignore') 21 | string = string.encode('ascii', errors='ignore') 22 | 23 | #Penalize shorter target strings and early exit on null length strings 24 | len_query = len(query) 25 | len_string = len(string.strip()) 26 | if not len_string: return None 27 | if not len_query: return score 28 | penalty = min(len_string / float(len_query), 1.0) 29 | 30 | fuzzy_partial = (fuzz.partial_ratio(query, string)/100.0) * penalty 31 | #print ("fuzzy_partial of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 32 | if fuzzy_partial > min_fuzzy_ratio: 33 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 34 | return f_score 35 | score -= inv_cascades 36 | 37 | q_l = query.lower() 38 | s_l = string.lower() 39 | 40 | fuzzy_partial = (fuzz.partial_ratio(q_l, s_l)/100.0) * penalty 41 | #print ("fuzzy_partial lower_case of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 42 | 43 | if fuzzy_partial > min_fuzzy_ratio: 44 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 45 | return f_score 46 | score -= inv_cascades 47 | 48 | fuzzy_partial = (fuzz.partial_token_sort_ratio(q_l, s_l)/100.0) * penalty 49 | #print ("fuzzy_partial token_sort_lower_case of %s vs %s * penalty %.2f" % (query, string, penalty), fuzzy_partial) 50 | if fuzzy_partial > min_fuzzy_ratio: 51 | f_score = score - (1.0 - (fuzzy_partial - (1.0 - min_fuzzy_ratio)) / min_fuzzy_ratio) * inv_cascades 52 | return f_score 53 | 54 | return None 55 | 56 | #Flatmap from tables to sequence of tuples (confidence, table, row or None, value or None) 57 | def filter_tables(tables, filter_dict, treshold = 0.0, only_max = False): 58 | row = None 59 | value = None 60 | 61 | for t in tables: 62 | 63 | if 'headers' in 
filter_dict: 64 | 65 | max_conf, index, best_term = None, None, None 66 | terms = filter_dict['headers']['terms'] 67 | _threshold = max(treshold, filter_dict['headers']['threshold']) 68 | for term in terms: 69 | if t['headers']: 70 | current_max_conf = (max_conf if only_max else _threshold) or _threshold 71 | scores_indices = ((val, idx) for (idx, val) in enumerate(fuzzy_str_match(term, h) for h in t['headers'] ) ) 72 | 73 | conf, idx = max(scores_indices) 74 | 75 | if conf > max_conf: 76 | max_conf = conf 77 | index = idx 78 | best_term = term 79 | best_header = "" 80 | 81 | #Todo: other filter criteria like column names, rows etc. and combinatorial confidence score 82 | if max_conf: 83 | yield max_conf, t, row, value 84 | 85 | 86 | def get_fuzzy_date(string): 87 | today = date.today() 88 | v_ascii = string.encode("ascii", errors="ignore") 89 | try: 90 | dt = date_parser.parse(v_ascii, fuzzy=True, default=today) 91 | if dt != today: 92 | return dt 93 | except: 94 | return None 95 | 96 | def get_first_date(lines, query_string, threshold = 0.4): 97 | for i, l in enumerate(lines): 98 | if fuzzy_str_match(query_string, l) > threshold: 99 | dt = get_fuzzy_date(l) 100 | if dt: 101 | return dt, i, l 102 | 103 | def find_row(table, query_string, threshold = 0.4): 104 | #Find first 'other' typed row 105 | try: 106 | index = table['types'].index('other') 107 | except ValueError: 108 | print "no column consisting of mainly string data found" 109 | return None 110 | 111 | strings = (s[index]['value'] for s in table['data']) 112 | 113 | #query_string can either be a single one or an iterable 114 | if isinstance(query_string, basestring): 115 | query_string = [query_string] 116 | 117 | scores_indices = ((val, idx) for (idx, val) in ( (s[0], fuzzy_str_match(qs, s[1])) \ 118 | for qs, s in product(query_string, enumerate(strings))) ) 119 | val, idx = max(scores_indices) 120 | if val >= threshold: 121 | return table['data'][idx] 122 | else: 123 | return None 124 | 125 | 126 | def closest_row_numeric_value(table, query_string, threshold = 0.4, raw_cell = False): 127 | row = find_row(table, query_string, threshold) 128 | if row: 129 | for c in row: 130 | if 'type' in c: 131 | if c['type'] in ('integer'): 132 | v = int(c['value']) 133 | return (v, c) if raw_cell else v 134 | elif c['type'] in ('large_num', 'small_float'): 135 | v = float(c['value'].replace(",", "")) 136 | return (v, c) if raw_cell else v 137 | 138 | def get_key_values(table, key_queries, threshold = 0.4, raw_cell = False): 139 | return { k : closest_row_numeric_value(table, kk, threshold, raw_cell) for k, kk in key_queries.iteritems() } 140 | 141 | 142 | def find_column(table, query_string, types=None, subtypes=None, threshold = 0.4): 143 | #Find first column with specific types 144 | columns = [] 145 | for i, t in enumerate(zip(table['types'], table['subtypes'])): 146 | t, st = t[0], t[1] 147 | if t in (types or t) and st in (subtypes or st): 148 | if fuzzy_str_match(query_string, table['captions'][i]) > threshold: return i 149 | 150 | def filter_time_series(table, query_string, subtypes = ['dollar'], threshold = 0.4): 151 | time_index = find_column(table, "", subtypes=['date', 'year'], threshold=threshold) 152 | value_index = find_column(table, query_string, subtypes=subtypes, threshold=threshold) 153 | 154 | for r in table['data']: 155 | dt = get_fuzzy_date(r[time_index]['value']) 156 | if dt: 157 | c = r[value_index] 158 | v = None 159 | if c['type'] in ('integer'): 160 | v = int(c['value']) 161 | elif c['type'] in ('large_num', 
'small_float'): 162 | v = float(c['value'].replace(",", "")) 163 | if v: yield dt, v 164 | -------------------------------------------------------------------------------- /deprecated/GetMuniBondData.cfg: -------------------------------------------------------------------------------- 1 | [FileLocations] 2 | OutputFileName = D:\munidocs\VA\BondData.csv 3 | OutputColumnSeparator = , 4 | InputPath = D:\munidocs\VA\?????????-IRIS-*.pdf -------------------------------------------------------------------------------- /deprecated/GetMuniBondData.py: -------------------------------------------------------------------------------- 1 | 2 | import ConfigParser 3 | import gc 4 | import glob 5 | import io 6 | import os 7 | import cStringIO 8 | import re 9 | import subprocess 10 | import string 11 | import sys 12 | import numpy as np 13 | import pandas as pd 14 | 15 | def GetConfigParm(section): 16 | dict1 = {} 17 | options = Config.options(section) 18 | for option in options: 19 | try: 20 | dict1[option] = Config.get(section, option) 21 | if dict1[option] == -1: 22 | DebugPrint("skip: %s" % option) 23 | except: 24 | print("exception on %s!" % option) 25 | dict1[option] = None 26 | return dict1 27 | 28 | # Main Process 29 | # Read Configuration Parameters 30 | config = ConfigParser.RawConfigParser() 31 | config.read('GetMuniBondData.cfg') 32 | OutputFileName = config.get("FileLocations","OutputFileName") 33 | OutputColumnSeparator = config.get("FileLocations","OutputColumnSeparator") 34 | InputPath = config.get("FileLocations","InputPath") 35 | 36 | # Initialize Data Frame 37 | df = pd.DataFrame(np.zeros(0 , dtype=[('file', 'a99'),('caption', 'a99'),('value', 'a99')])) 38 | 39 | for file in glob.glob(InputPath): 40 | 41 | printline = 0 42 | linesleft = 0 43 | blanklines = 0 44 | 45 | intxtfilename = file + ".txt" 46 | 47 | out, err = subprocess.Popen(["pdftotext", "-layout", file, file + ".txt" ]).communicate() 48 | 49 | try: 50 | intxtfile = io.open(intxtfilename, mode='rb') 51 | except: 52 | print "Unable to extract text from " + file 53 | continue 54 | 55 | lines = intxtfile.readlines() 56 | 57 | topfound = 0 58 | headerline = 0 59 | 60 | for line in lines: 61 | 62 | strippedline = line.upper().strip() 63 | 64 | if topfound == 0 and string.find(line," $") > 0: 65 | headerline = 1 66 | topfound = 1 67 | 68 | if 1 <= headerline <= 3: 69 | caption = "HEADER " + str(headerline) 70 | value = strippedline 71 | df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True) 72 | headerline = headerline + 1 73 | continue 74 | 75 | if strippedline == "SOURCES AND USES OF FUNDS" \ 76 | or strippedline == "SOURCES AND USES OF FUNDS*" \ 77 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS" \ 78 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS*" \ 79 | or strippedline == "SOURCES AND USES OF FUNDS(1)" \ 80 | or strippedline == "ESTIMATED SOURCES AND USES OF FUNDS(1)" \ 81 | or strippedline == "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS": 82 | printline = 1 83 | linesleft = 25 84 | 85 | if printline == 1: 86 | dollar_amount_regex = re.compile("[\$]{0,1}[\s]{0,6}[0-9,]{0,15}(\.[0-9]{1,2})$") 87 | dollar_amount_match = re.search(dollar_amount_regex,strippedline) 88 | if dollar_amount_match: 89 | caption = strippedline[:dollar_amount_match.start(0)].strip() 90 | value = strippedline[dollar_amount_match.start(0):].strip() 91 | df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True) 92 | if len(line.strip()) < 5 and linesleft < 10: 93 | 
blanklines = blanklines + 1 94 | linesleft = linesleft - 1 95 | 96 | if linesleft == 0: 97 | printline = 0 98 | 99 | del lines 100 | gc.collect() 101 | 102 | df.to_csv(OutputFileName,OutputColumnSeparator,index=False) 103 | -------------------------------------------------------------------------------- /deprecated/GetMuniBondsTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "skipping pdf/ER862677-ER674128-ER1075876.pdf, already exists.\n", 15 | "skipping pdf/EP849915-EP657701-EP1059361.pdf, already exists.\n", 16 | "skipping pdf/ER866175-ER676833-ER1078611.pdf, already exists.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "from __future__ import print_function\n", 22 | "import re\n", 23 | "import os\n", 24 | "import codecs\n", 25 | "import string\n", 26 | "\n", 27 | "def create_path(path):\n", 28 | " try: \n", 29 | " os.makedirs(path)\n", 30 | " except OSError:\n", 31 | " if not os.path.isdir(path):\n", 32 | " raise \n", 33 | "\n", 34 | "#Convert all pdfs\n", 35 | "files = os.listdir('pdf')\n", 36 | "create_path('txt')\n", 37 | "for i,f in enumerate(files):\n", 38 | "\n", 39 | " pdf_path = os.path.join('pdf', f)\n", 40 | " txt_path = os.path.join('txt', f+'.txt')\n", 41 | " \n", 42 | " if not os.path.isfile(txt_path):\n", 43 | " #Layout preservation crucial to maintain clues about tabular data\n", 44 | " cmd = \"pdftotext -layout %s %s\" % (pdf_path, txt_path)\n", 45 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 46 | " os.system(cmd)\n", 47 | " else:\n", 48 | " print ('skipping %s, already exists.' % (pdf_path, ))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 147, 54 | "metadata": { 55 | "collapsed": false, 56 | "scrolled": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 64 | "HEADER 1 $95,885,000\n", 65 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 66 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 67 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 68 | "HEADER 1 $6,645,000\n", 69 | "HEADER 2 CITY OF PALM SPRINGS\n", 70 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 71 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 72 | "HEADER 1 $19,560,000\n", 73 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 74 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 
92-1\n", 75 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 76 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 77 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 78 | "TOTAL SOURCES $22,302,178.29\n", 79 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 80 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 81 | "TOTAL USES $22,302,178.29\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "#Existing Version\n", 87 | "for file in os.listdir('txt'):\n", 88 | " \n", 89 | " print (\"--------\" + file + \"--------\")\n", 90 | " \n", 91 | " printline = 0\n", 92 | " linesleft = 0\n", 93 | " blanklines = 0\n", 94 | " \n", 95 | " topfound = 0\n", 96 | " headerline = 0 \n", 97 | " \n", 98 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 99 | " for i, line in enumerate(f):\n", 100 | "\n", 101 | " strippedline = line.upper().strip()\n", 102 | "\n", 103 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 104 | " headerline = 1\n", 105 | " topfound = 1\n", 106 | "\n", 107 | " if 1 <= headerline <= 3:\n", 108 | " caption = \"HEADER \" + str(headerline)\n", 109 | " value = strippedline\n", 110 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 111 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 112 | " headerline = headerline + 1\n", 113 | " continue\n", 114 | "\n", 115 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 116 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 117 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 118 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 119 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 120 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 121 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\":\n", 122 | " printline = 1\n", 123 | " linesleft = 25\n", 124 | "\n", 125 | " if printline == 1:\n", 126 | " dollar_amount_regex = re.compile(\"[\\$]{0,1}[\\s]{0,6}[0-9,]{0,15}(\\.[0-9]{1,2})$\")\n", 127 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 128 | " if dollar_amount_match:\n", 129 | " caption = strippedline[:dollar_amount_match.start(0)].strip()\n", 130 | " value = strippedline[dollar_amount_match.start(0):].strip()\n", 131 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 132 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 133 | " if len(line.strip()) < 5 and linesleft < 10:\n", 134 | " blanklines = blanklines + 1\n", 135 | " linesleft = linesleft - 1\n", 136 | "\n", 137 | " if linesleft == 0:\n", 138 | " printline = 0" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "#Issues:\n", 150 | "## Doesn't pick up caption in EP1059361 --> add USES OF FUNDS but then no SOURCES OF PAYMENTS\n", 151 | "## Doesn't pick up line items in ER1075876 --> match sequences of .... 
to indicate tables as well, plus be more lenient with cents values\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 154, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 166 | "HEADER 1 $95,885,000\n", 167 | "HEADER 2 CALIFORNIA MUNICIPAL FINANCE AUTHORITY\n", 168 | "HEADER 3 REVENUE BONDS, SERIES 2015-A\n", 169 | "PRINCIPAL AMOUNT $ 95,885,000\n", 170 | "BOND PREMIUM 12,984,339\n", 171 | "OTHER AVAILABLE FUNDS(1) 6,600,643 \n", 172 | "TOTAL SOURCES $115,469,982\n", 173 | "DEPOSIT TO ACQUISITION FUND $ 41,000,000\n", 174 | "RETIREMENT OF WATER REVENUE ANTICIPATION NOTES(2) 14,000,000\n", 175 | "DEPOSIT TO ESCROW FUND FOR REFUNDED 2008 BONDS 52,742,691\n", 176 | "DISCHARGE OF STATE LOAN 7,096,550 \n", 177 | "COSTS OF ISSUANCE(3) 630,741 \n", 178 | "TOTAL USES $115,469,982\n", 179 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 180 | "HEADER 1 $6,645,000\n", 181 | "HEADER 2 CITY OF PALM SPRINGS\n", 182 | "HEADER 3 LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS\n", 183 | "TRANSFER TO ESCROW BANK $6,086,693.08\n", 184 | "RESERVE FUND (1) 274,331.25\n", 185 | "COSTS OF ISSUANCE FUND (2) 152,404.72\n", 186 | "TOTAL USES $6,513,429.05\n", 187 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 188 | "HEADER 1 $19,560,000\n", 189 | "HEADER 2 RNR SCHOOL FINANCING AUTHORITY\n", 190 | "HEADER 3 COMMUNITY FACILITIES DISTRICT NO. 92-1\n", 191 | "PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS $19,560,000.00\n", 192 | "PLUS: NET ORIGINAL ISSUE PREMIUM 2,550,554.30\n", 193 | "PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS 367,663.99\n", 194 | "TOTAL SOURCES $22,302,178.29\n", 195 | "DEPOSIT INTO ESCROW FUND (1) $21,893,691.38\n", 196 | "DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2) 408,486.91\n", 197 | "TOTAL USES $22,302,178.29\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "#New Version\n", 203 | "for file in os.listdir('txt'):\n", 204 | " \n", 205 | " print (\"--------\" + file + \"--------\")\n", 206 | " \n", 207 | " printline = 0\n", 208 | " linesleft = 0\n", 209 | " blanklines = 0\n", 210 | " \n", 211 | " topfound = 0\n", 212 | " headerline = 0 \n", 213 | " \n", 214 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 215 | " for i, line in enumerate(f):\n", 216 | "\n", 217 | " \n", 218 | " strippedline = line.upper().strip()\n", 219 | "\n", 220 | " if topfound == 0 and string.find(line,\" $\") > 0:\n", 221 | " headerline = 1\n", 222 | " topfound = 1\n", 223 | "\n", 224 | " if 1 <= headerline <= 3:\n", 225 | " caption = \"HEADER \" + str(headerline)\n", 226 | " value = strippedline\n", 227 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 228 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 229 | " headerline = headerline + 1\n", 230 | " continue\n", 231 | "\n", 232 | " if strippedline == \"SOURCES AND USES OF FUNDS\" \\\n", 233 | " or strippedline == \"SOURCES AND USES OF FUNDS*\" \\\n", 234 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 235 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS*\" \\\n", 236 | " or strippedline == \"SOURCES AND USES OF FUNDS(1)\" \\\n", 237 | " or strippedline == \"ESTIMATED SOURCES AND USES OF FUNDS(1)\" \\\n", 238 | " or strippedline == \"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\" \\\n", 239 | " or strippedline == \"ESTIMATED USES OF FUNDS\": 
#New\n", 240 | " printline = 1\n", 241 | " linesleft = 25\n", 242 | " #print (\"#### line:\", i, \"to\", i+linesleft)\n", 243 | "\n", 244 | " if printline == 1:\n", 245 | " #Include a minimum of preceding dots or whitespace\n", 246 | " #Group 1 = preceding whitespace\n", 247 | " #Group 2 = Dollar value\n", 248 | " #Group 3 = $Cents value if existing\n", 249 | " dollar_amount_regex = ur\"([\\.]{4,}|[\\s]{4,})[\\s]*\" + \\\n", 250 | " ur\"([\\$]{0,1}[\\s]{0,6}[0-9,]{2,15})(\\.[0-9]{1,2})?$\"\n", 251 | " dollar_amount_regex = re.compile(dollar_amount_regex)\n", 252 | " dollar_amount_match = re.search(dollar_amount_regex,strippedline)\n", 253 | " \n", 254 | " #Check whether we found something tabular and a dollar value\n", 255 | " if dollar_amount_match and dollar_amount_match.group(2):\n", 256 | " caption = strippedline[:dollar_amount_match.start(1)].strip()\n", 257 | " value = strippedline[dollar_amount_match.start(2):].strip()\n", 258 | " #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)\n", 259 | " print (u\"{:60s} {:10s}\".format(caption, value))\n", 260 | " if len(line.strip()) < 5 and linesleft < 10:\n", 261 | " blanklines = blanklines + 1\n", 262 | " linesleft = linesleft - 1\n", 263 | "\n", 264 | " if linesleft == 0:\n", 265 | " printline = 0" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 150, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "--------ER862677-ER674128-ER1075876.pdf.txt--------\n", 280 | "27 - issuance of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 281 | "\n", 282 | "229 - ESTIMATED SOURCES AND USES OF FUNDS ................................................................................... 7\n", 283 | "\n", 284 | "370 - of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 285 | "\n", 286 | "653 - ESTIMATED SOURCES AND USES OF FUNDS\n", 287 | "\n", 288 | "--------EP849915-EP657701-EP1059361.pdf.txt--------\n", 289 | "--------ER866175-ER676833-ER1078611.pdf.txt--------\n", 290 | "223 - ESTIMATED SOURCES AND USES OF FUNDS .................................................................................. 13 \n", 291 | "\n", 292 | "429 - Bonds. 
See “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 293 | "\n", 294 | "715 - “ESTIMATED SOURCES AND USES OF FUNDS.”\n", 295 | "\n", 296 | "983 - ESTIMATED SOURCES AND USES OF FUNDS\n", 297 | "\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "#Some exploration\n", 303 | "max_distance_below = 25\n", 304 | "max_distance_above = 5\n", 305 | "context_identifier = u\"SOURCES AND USES OF FUNDS|SOURCES AND USES OF FUNDS*|ESTIMATED SOURCES AND USES OF FUNDS|\" + \\\n", 306 | " \"ESTIMATED SOURCES AND USES OF FUNDS*|SOURCES AND USES OF FUNDS(1)|\" + \\\n", 307 | " \"ESTIMATED SOURCES AND USES OF FUNDS(1)|PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS\"\n", 308 | "context_identifier = context_identifier.split(u\"|\")\n", 309 | "\n", 310 | "for file in os.listdir('txt'):\n", 311 | " \n", 312 | " print (\"--------\" + file + \"--------\")\n", 313 | " with codecs.open('txt/'+file, \"r\", \"utf-8\") as f:\n", 314 | " for i, line in enumerate(f):\n", 315 | " \n", 316 | " #Print Candidates\n", 317 | " id_found = reduce(lambda x,y: x or y, ( (id in line) for id in context_identifier ))\n", 318 | " if id_found:\n", 319 | " print(i, '-', line)\n", 320 | " " 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 2", 336 | "language": "python", 337 | "name": "python2" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 2 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython2", 349 | "version": "2.7.10" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 0 354 | } 355 | -------------------------------------------------------------------------------- /deprecated/Procfile: -------------------------------------------------------------------------------- 1 | web: python server.py 2 | -------------------------------------------------------------------------------- /deprecated/TableParser.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | #TABLE Parser 7 | #Infers a table with arbitrary number of columns from recurring patterns in text lines 8 | 9 | #Main assumptions for table identification: 10 | #1) each row is either in one line or not a row at all [DONE] 11 | #2) each column features at least one number (=dollar amount) [MISSING] 12 | #2a) each column features at least one date-like string 13 | #3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] 14 | #4) each column is separated by more than 2 consecutive whitespace indicators (e.g. ' ' or '..') 15 | 16 | #Feature List: 17 | #1) Acknowledge Footnotes / make lower meta-data available 18 | #2) make delimiter length smartly dependent on number of columns (iteration) 19 | #3) expand non-canonical values in tables [DONE] .. but only to the extent that type matches 20 | #4) UI: parameterize extraction on the show page on the fly 21 | #5) more type inference (e.g. 
date) 22 | 23 | 24 | # In[128]: 25 | 26 | import re 27 | import os 28 | import codecs 29 | import string 30 | from collections import OrderedDict 31 | 32 | config = { "min_delimiter_length" : 3, "min_columns": 2, "min_consecutive_rows" : 3, "max_grace_rows" : 2, 33 | "caption_reorder_tolerance" : 10.0, "meta_info_lines_above" : 8, "aggregate_captions_missing" : 0.5} 34 | 35 | 36 | # In[129]: 37 | 38 | import json 39 | import sys 40 | 41 | from flask import Flask, request, redirect, url_for, send_from_directory 42 | from werkzeug import secure_filename 43 | 44 | from flask import jsonify, render_template, make_response 45 | import numpy as np 46 | import pandas as pd 47 | 48 | from pyxley import UILayout 49 | from pyxley.filters import SelectButton 50 | from pyxley.charts.mg import LineChart, Figure, ScatterPlot, Histogram 51 | from pyxley.charts.datatables import DataTable 52 | 53 | 54 | # In[3]: 55 | 56 | #Regex tester online: https://regex101.com 57 | #Contrast with Basic table parsing capabilities of http://docs.astropy.org/en/latest/io/ascii/index.html 58 | 59 | tokenize_pattern = "[.]{%i,}|[\ \$]{%i,}|" % ((config['min_delimiter_length'],)*2) 60 | tokenize_pattern = "[.\ \$]{%i,}" % (config['min_delimiter_length'],) 61 | 62 | column_pattern = OrderedDict() 63 | #column_pattern['large_num'] = ur"\d{1,3}(,\d{3})*(\.\d+)?" 64 | column_pattern['large_num'] = ur"(([0-9]{1,3})(,\d{3})+(\.[0-9]{2})?)" 65 | column_pattern['small_float'] = ur"[0-9]+\.[0-9]+" 66 | column_pattern['integer'] = ur"^\s*[0-9]+\s*$" 67 | column_pattern['other'] = ur"([a-zA-Z0-9]{2,}\w)" 68 | column_pattern['other'] = ur".+" 69 | 70 | subtype_indicator = OrderedDict() 71 | subtype_indicator['dollar'] = r".*\$.*" 72 | subtype_indicator['rate'] = r"[%]" 73 | subtype_indicator['year'] = "(20[0-9]{2})|(19[0-9]{2})" 74 | 75 | 76 | # In[4]: 77 | 78 | #import dateutil.parser as date_parser 79 | #(type, subtype, value, leftover) 80 | def tag_token(token, ws): 81 | for t, p in column_pattern.iteritems(): 82 | result = re.search(p, token) 83 | if result: 84 | leftover = token[:result.start()] + token[result.end():] 85 | value = token[result.start():result.end()] 86 | 87 | #First match on left-overs 88 | subtype = "none" 89 | for sub, indicator in subtype_indicator.iteritems(): 90 | if re.match(indicator, leftover): subtype = sub 91 | #Only if no indicator matched there, try on full token 92 | if subtype == "none": 93 | for sub, indicator in subtype_indicator.iteritems(): 94 | if re.match(indicator, token): subtype = sub 95 | #Only if no indicator matched again, try on whitespace 96 | if subtype == "none": 97 | for sub, indicator in subtype_indicator.iteritems(): 98 | if re.match(indicator, ws): subtype = sub 99 | #print token, ":", ws, ":", subtype 100 | 101 | return t, subtype, value, leftover 102 | return "unknown", "none", token, "" 103 | 104 | def row_feature(line): 105 | features = [] 106 | matches = re.finditer(tokenize_pattern, line) 107 | start_end = [ (match.start(), match.end()) for match in matches] 108 | if len(start_end) < 1: 109 | return features 110 | 111 | tokens = re.split(tokenize_pattern, line) 112 | if tokens[0] == "": 113 | tokens = tokens[1:] 114 | else: 115 | start_end = [(0,0)] + start_end 116 | 117 | for se, token in zip(start_end, tokens): 118 | t, subtype, value, _ = tag_token(token, line[se[0]:se[1]]) 119 | feature = {"start" : se[1], "value" : value, "type" : t, "subtype" : subtype} 120 | features.append(feature) 121 | return features 122 | 123 | #date_parser.parse("asdf") 124 | 125 | 126 | # In[5]: 
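#Editor's sketch (not part of the original script): what row_feature() above yields for one
#table-like line, assuming the config, tokenize_pattern and tag_token definitions directly
#above; the sample line and expected tags are illustrative only. 'Total Sources' should come
#out as type 'other', both numbers as 'large_num', and the first number should get the
#'dollar' subtype because the '$' is swallowed by the delimiter run in front of it.
example_line = "Total Sources ................    $ 22,302,178.29      12,345.67"
for cell in row_feature(example_line):
    print cell['start'], cell['type'], cell['subtype'], cell['value']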
127 | 128 | #Establish whether amount of rows is above a certain threshold and whether there is at least one number 129 | def row_qualifies(row): 130 | return len(row) >= config['min_columns'] and sum( 1 if c['type'] in ['large_num', 'small_float', 'integer'] else 0 for c in row) > 0 131 | 132 | def row_equal_types(row1, row2): 133 | max_len = max(len(row1), len(row2) ) 134 | same_types = sum (map(lambda t: 1 if t[0]==t[1] else 0, ((c1['type'], c2['type']) for c1, c2 in zip(row1, row2)))) 135 | return same_types == max_len 136 | 137 | 138 | # In[6]: 139 | 140 | def filter_row_spans(row_features, row_qualifies): 141 | 142 | min_consecutive = config["min_consecutive_rows"] 143 | grace_rows = config['max_grace_rows'] 144 | 145 | last_qualified = None 146 | consecutive = 0 147 | underqualified = 0 148 | i = 0 149 | 150 | for row in row_features: 151 | if row_qualifies(row): 152 | underqualified = 0 153 | if last_qualified is None: 154 | last_qualified = i 155 | consecutive = 1 156 | else: 157 | consecutive += 1 158 | 159 | else: 160 | underqualified += 1 161 | if underqualified > grace_rows: 162 | if consecutive >= min_consecutive: 163 | yield last_qualified, i-underqualified+1 164 | last_qualified = None 165 | consecutive = 0 166 | else: 167 | last_qualified = None 168 | consecutive = 0 169 | underqualified = 0 170 | #print i, underqualified, last_qualified, consecutive#, "" or row 171 | i += 1 172 | 173 | if consecutive >= min_consecutive: 174 | yield last_qualified, i-underqualified 175 | 176 | 177 | # In[126]: 178 | 179 | from collections import Counter 180 | 181 | def readjust_cols(feature_row, slots): 182 | feature_new = [{'value' : 'NaN'}] * len(slots) 183 | for v in feature_row: 184 | dist = [ abs((float(v['start'])) - s) for s in slots ] 185 | val , idx = min((val, idx) for (idx, val) in enumerate(dist)) 186 | if val <= config['caption_reorder_tolerance']: feature_new[idx] = v 187 | return feature_new 188 | 189 | def normalize_rows(rows_in, structure): 190 | 191 | slots = [c['start'] for c in structure] 192 | nrcols = len(structure) 193 | 194 | for r in rows_in: 195 | if len(r) != nrcols: 196 | if len(r)/float(nrcols) > config['aggregate_captions_missing']: 197 | yield readjust_cols(r, slots) 198 | else: 199 | yield r 200 | 201 | #TODO: make side-effect free 202 | def structure_rows(row_features, meta_features): 203 | #Determine maximum nr. 
of columns 204 | lengths = [len(r) for r in row_features] 205 | nrcols = max(lengths) 206 | canonical = filter(lambda r: len(r) == nrcols , row_features) 207 | 208 | #print canonical 209 | 210 | structure = [] 211 | values = [] 212 | for i in range(nrcols): 213 | col = {} 214 | col['start'] = float (sum (c[i]['start'] for c in canonical )) / len(canonical) 215 | 216 | types = Counter(c[i]['type'] for c in canonical) 217 | col['type'] = types.most_common(1)[0][0] 218 | subtypes = Counter(c[i]['subtype'] for c in canonical if c[i]['subtype'] is not "none") 219 | subtype = "none" if len(subtypes) == 0 else subtypes.most_common(1)[0][0] 220 | col['subtype'] = subtype 221 | structure.append(col) 222 | 223 | #Add the first non canonical rows to the meta_features above data 224 | for r in row_features: 225 | if r in canonical: 226 | break 227 | else: 228 | meta_features.append(r) 229 | row_features.remove(r) 230 | 231 | #Try to find caption from first rows above the data, skip one empty row if necessary 232 | #Todo: make two steps process cleaner and more general 233 | if len(meta_features[-1]) == 0: meta_features = meta_features[:-1] 234 | caption = meta_features[-1] if len(meta_features[-1])/float(nrcols) > config['aggregate_captions_missing'] else None 235 | if caption: 236 | slots = [c['start'] for c in structure] 237 | meta_features = meta_features[:-1] 238 | if len(caption) != nrcols: caption = readjust_cols(caption, slots) 239 | if len(meta_features[-1])/float(nrcols) > config['aggregate_captions_missing']: 240 | caption2 = readjust_cols(meta_features[-1], slots) 241 | for c,c2 in zip(caption, caption2): 242 | if c2['value'] != 'NaN': 243 | c['value'] = c2['value'] + ' ' + c['value'] 244 | meta_features = meta_features[:-1] 245 | 246 | #Assign captions as the value in structure 247 | for i, c in enumerate(caption): 248 | structure[i]['value'] = c['value'] 249 | 250 | headers = [] 251 | for h in meta_features: 252 | if len(h) == 1: 253 | headers.append(h[0]['value']) 254 | 255 | #Expand all the non canonical rows with NaN values (Todo: if type matches) 256 | normalized_data = [r for r in normalize_rows(row_features, structure)] 257 | 258 | return structure, normalized_data, headers 259 | 260 | 261 | # In[115]: 262 | 263 | def output_table_html(txt_path): 264 | out = [] 265 | out.append("--------" + txt_path + "--------") 266 | 267 | with codecs.open(txt_path, "r", "utf-8") as f: 268 | 269 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 270 | rows = [row_feature(l) for l in lines] 271 | 272 | for b,e in filter_row_spans(rows, row_qualifies): 273 | out.append("TABLE STARTING FROM LINE %i to %i" % (b,e)) 274 | table = rows[b:e] 275 | structure, data, headers = structure_rows(table, rows[b-config['meta_info_lines_above']:b]) 276 | 277 | for h in headers: out.append(h) 278 | if caption: 279 | out.append("\t".join(caption)) 280 | else: 281 | out.append('NO COLUMN NAMES DETECTED') 282 | 283 | for f in rows[b:e]: 284 | cols = "\t|\t".join([col['value']+" (%s, %s)" % (col['type'], col['subtype']) for col in f]) 285 | out.append("%i %s" % (len(f), cols) ) 286 | return out 287 | 288 | def return_tables(txt_path): 289 | 290 | #Uniquely identify tables by their first row 291 | tables = OrderedDict() 292 | 293 | with codecs.open(txt_path, "r", "utf-8") as f: 294 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 295 | rows = [row_feature(l) for l in lines] 296 | 297 | for b,e in filter_row_spans(rows, row_qualifies): 298 | table = {'begin_line' : b, 'end_line' : e} 
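        #Each qualifying span (b, e) yielded by filter_row_spans() is half-open:
        #rows[b:e] hold the data rows, while the lines just above feed caption and
        #header detection in structure_rows() below.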
 299 | 300 | data_rows = rows[b:e] 301 | meta_rows = rows[b-config['meta_info_lines_above']:b] 302 | 303 | structure, data, headers = structure_rows(data_rows, meta_rows) 304 | 305 | #Construct df 306 | captions = [(col['value'] if 'value' in col.keys() else "---") +" (%s, %s)" % (col['type'], col['subtype']) for col in structure] 307 | 308 | table['captions'] = captions 309 | table['data'] = data 310 | table['header'] = " | ".join(headers) 311 | 312 | tables[b] = table 313 | 314 | return tables 315 | 316 | 317 | # ## Web App ## 318 | 319 | # In[124]: 320 | 321 | TITLE = "docX - Table View" 322 | 323 | scripts = [ 324 | "./bower_components/jquery/dist/jquery.min.js", 325 | "./bower_components/datatables/media/js/jquery.dataTables.js", 326 | "./bower_components/d3/d3.min.js", 327 | "./bower_components/metrics-graphics/dist/metricsgraphics.js", 328 | "./require.min.js", 329 | "./bower_components/react/react.js", 330 | "./bower_components/react-bootstrap/react-bootstrap.min.js", 331 | "./bower_components/pyxley/build/pyxley.js", 332 | ] 333 | 334 | css = [ 335 | "./bower_components/bootstrap/dist/css/bootstrap.min.css", 336 | "./bower_components/metrics-graphics/dist/metricsgraphics.css", 337 | "./bower_components/datatables/media/css/jquery.dataTables.min.css", 338 | "./css/main.css" 339 | ] 340 | 341 | 342 | UPLOAD_FOLDER = './' 343 | ALLOWED_EXTENSIONS = set(['txt', 'pdf']) 344 | 345 | app = Flask(__name__) 346 | app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER 347 | 348 | def get_extension(filename): 349 | return '.' in filename and filename.rsplit('.', 1)[1] 350 | 351 | def allowed_file(filename): 352 | return get_extension(filename) in ALLOWED_EXTENSIONS 353 | 354 | @app.route('/', methods=['GET', 'POST']) 355 | def upload_file(): 356 | if request.method == 'POST': 357 | file = request.files['file'] 358 | min_columns = request.form['min_columns'] 359 | if file and allowed_file(file.filename): 360 | filename = secure_filename(file.filename) 361 | extension = get_extension(file.filename) 362 | file.save(os.path.join(app.config['UPLOAD_FOLDER'], extension, filename)) 363 | return redirect(url_for('uploaded_file', 364 | filename=filename, min_columns=min_columns)) 365 | return ''' 366 | <!doctype html> 367 | <title>docX - Table Extractor</title> 368 | <h1>Upload a pdf or txt file</h1> 369 | <form action="" method=post enctype=multipart/form-data> 370 | <p><input type=file name=file> 371 | <input type=submit value=Upload> 372 | </p> 373 | <p>Select the minimum amount of columns tables should have 374 | <select name=min_columns> 375 | <option value=2>2</option> 376 | <option value=3>3</option> 377 | <option value=4>4</option> 378 | <option value=5>5</option> 379 | </select></form>
 380 | ''' 381 | 382 | all_charts = {} 383 | all_uis = {} 384 | 385 | @app.route('/show/<filename>') 386 | def uploaded_file(filename): 387 | extension = get_extension(filename) 388 | path = os.path.join(app.config['UPLOAD_FOLDER'], extension, filename) 389 | txt_path = os.path.join(app.config['UPLOAD_FOLDER'], 'txt', filename) 390 | if extension == "pdf": 391 | txt_path += '.txt' 392 | if not os.path.isfile(txt_path): 393 | #Layout preservation crucial to preserve clues about tabular data 394 | cmd = "pdftotext -layout %s %s" % (path, txt_path) 395 | os.system(cmd) 396 | 397 | min_columns = request.args.get('min_columns') 398 | tables = return_tables(txt_path) 399 | 400 | #Construct histogram 401 | lines_per_page = 80 402 | nr_data_rows = [] 403 | for b, t in tables.iteritems(): 404 | e = t['end_line'] 405 | #print b, e 406 | for l in range(b, e): 407 | page = l / lines_per_page 408 | if len(nr_data_rows) <= page: 409 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 410 | nr_data_rows[page] += 1 411 | dr = pd.DataFrame() 412 | dr['value'] = nr_data_rows 413 | dr['page'] = range(0, len(dr)) 414 | 415 | js_layout = filename+".js" 416 | 417 | ui_show = UILayout( 418 | "FilterChart", 419 | "../static/bower_components/pyxley/build/pyxley.js", 420 | "component_id", 421 | filter_style="''") 422 | 423 | if filename in all_charts: 424 | print "old/update ui", filename 425 | path_to_fig = '/show/line/'+filename 426 | #del all_charts[filename] 427 | #hFig = Figure(path_to_fig, "line") 428 | #bc = LineChart(dr, hFig, "page", ["page"], "Rows containing Data per Page") 429 | elif True: 430 | print "new ui", filename 431 | 432 | # Make a Button 433 | cols = ["page"] 434 | btn = SelectButton("Data", cols, "Data", "Data Rows per Page") 435 | 436 | # Make a FilterFrame and add the button to the UI 437 | ui_show.add_filter(btn) 438 | 439 | # Now make a FilterFrame for the histogram 440 | path_to_fig = '/show/line/'+filename 441 | hFig = Figure(path_to_fig, "line") 442 | hFig.layout.set_size(width=1000, height=300) 443 | hFig.layout.set_margin(left=80, right=80) 444 | #hFig.graphics.animate_on_load() 445 | 446 | bc = LineChart(dr, hFig, "page", ["value"], "Rows containing Data per Page") 447 | ui_show.add_chart(bc) 448 | all_charts[filename] = bc 449 | 450 | sb = ui_show.render_layout(app, "./static/ug/"+js_layout) 451 | 452 | _scripts = ["ug/"+js_layout] 453 | notices = ['Extraction Results for ' + filename, 'Ordered by lines'] 454 | 455 | dfs = (table_to_df(table).to_html() for table in tables.values()) 456 | headers = [] 457 | for t in tables.values(): 458 | if 'header' in t: 459 | headers.append(t['header']) 460 | else: 461 | headers.append('-') 462 | 463 | line_nrs = [('line %i-%i' % (t['begin_line'], t['end_line'])) for t in tables.values() ] 464 | #headers = ['aslkdfjas', ' alsdkfjasoedf'] 465 | 466 | return render_template('index.html', 467 | title=TITLE + ' - ' + filename, 468 | base_scripts=scripts, 469 | page_scripts=_scripts, 470 | css=css, notices = notices, tables = dfs, headers=headers, line_nrs=line_nrs) 471 | 472 | 473 | # In[ ]: 474 | 475 | app.run(debug=True, host='0.0.0.0') 476 | 477 | 478 | # 479 | # ## Tests ## 480 | 481 | # In[123]: 482 | 483 | def table_to_df(table): 484 | df = pd.DataFrame() 485 | 486 | for i, c in enumerate(table['captions']): 487 | values = [] 488 | for r in table['data']: 489 | values.append(r[i]['value']) 490 | df[c] = values 491 | 492 | return df 493 | 494 | for file in os.listdir('txt'): 495 | 496 | print ("--------" + file + "--------") 497 | tables = 
return_tables('txt/'+file) 498 | 499 | #print tables 500 | 501 | #Construct histogram 502 | lines_per_page = 80 503 | nr_data_rows = [] 504 | for b, t in tables.iteritems(): 505 | e = t['end_line'] 506 | #print b, e 507 | for l in range(b, e): 508 | page = l / lines_per_page 509 | if len(nr_data_rows) <= page: 510 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 511 | nr_data_rows[page] += 1 512 | dr = pd.DataFrame() 513 | dr['value'] = nr_data_rows 514 | dr['page'] = range(0, len(dr)) 515 | #print dr.head() 516 | 517 | line_nrs = [('line %i-%i' % (t['begin_line'], t['end_line'])) for t in tables.values() ] 518 | print line_nrs 519 | 520 | for k, table in tables.iteritems(): 521 | df = table_to_df(table) 522 | print k, ' !!! ', table['header'], ' -----------------' 523 | print df.head() 524 | 525 | 526 | #print dr 527 | 528 | # Make a Button 529 | cols = [c for c in df.columns if c != "Date"] 530 | btn = SelectButton("Data", cols, "Data", "Steps") 531 | 532 | # Make a FilterFrame and add the button to the UI 533 | ui.add_filter(btn) 534 | 535 | # Now make a FilterFrame for the histogram 536 | hFig = Figure("/mghist/", "myhist") 537 | hFig.layout.set_size(width=450, height=200) 538 | hFig.layout.set_margin(left=40, right=40) 539 | hFig.graphics.animate_on_load() 540 | # Make a histogram with 20 bins 541 | hc = Histogram(sf, hFig, "value", 20, init_params={"Data": "Steps"}) 542 | ui.add_chart(hc) 543 | 544 | # Let's play with our input 545 | df["Date"] = pd.to_datetime(df["Date"]) 546 | df["week"] = df["Date"].apply(lambda x: x.isocalendar()[1]) 547 | gf = df.groupby("week").agg({ 548 | "Date": [np.min, np.max], 549 | "Steps": np.sum, 550 | "Calories Burned": np.sum, 551 | "Distance": np.sum 552 | }).reset_index() 553 | f = lambda x: '_'.join(x) if (len(x[1]) > 0) and x[1] != 'sum' else x[0] 554 | gf.columns = [f(c) for c in gf.columns] 555 | gf = gf.sort_index(by="week", ascending=False) 556 | gf["Date_amin"] = gf["Date_amin"].apply(lambda x: x.strftime("%Y-%m-%d")) 557 | gf["Date_amax"] = gf["Date_amax"].apply(lambda x: x.strftime("%Y-%m-%d")) 558 | 559 | cols = OrderedDict([ 560 | ("week", {"label": "Week"}), 561 | ("Date_amin", {"label": "Start Date"}), 562 | ("Date_amax", {"label": "End Date"}), 563 | ("Calories Burned", {"label": "Calories Burned"}), 564 | ("Steps", {"label": "Steps"}), 565 | ("Distance", {"label": "Distance (mi)", "format": "%5.2f"}) 566 | ]) 567 | 568 | tb = DataTable("mytable", "/mytable/", gf, columns=cols, paging=True, pageLength=5) 569 | ui.add_chart(tb) 570 | 571 | sb = ui.render_layout(app, "./static/layout.js") 572 | # In[ ]: 573 | 574 | test_string =""" 575 | The following table sets forth statistical information relating to the Water System during the five 576 | Fiscal Years shown. 577 | TABLE 1 578 | WATER SYSTEM STATISTICS 579 | Fiscal Year Ended June 30 580 | 2014 2013 2012 2011 2010 581 | Anaheim Population Served .................................. 348,305 346,161 343,793 341,034 336,265 582 | Population Served Outside City (Est.) ................... 8,457 9,000 9,000 9,000 9,000 583 | Total Population Served ........................... 356,762 355,161 352,793 350,034 345,265 584 | 585 | Total Water Sales (Million Gallons) ................... 20,740 20,465 19,672 19,526 20,488 586 | 587 | Capacity (Million Gallons Per Day) 588 | From MWD Connections ................................... 110 110 110 110 110 589 | From Water System Wells (Average) ............... 79 86 88 81 75 590 | Total Supply Capacity ............................. 
189 196 198 191 185 591 | 592 | Treatment Plant Capacity .................................. 15 15 15 15 15 593 | 594 | Peak Day Distribution (Million Gallons) ............... 82.2 78.7 79.2 87.2 87.2 595 | Average Daily Distribution (Million Gallons) ....... 60.3 58.9 57.3 59.4 56.1 596 | Average Daily Sales Per Capita (Gallons) ............. 159.3 157.9 152.8 152.8 162.6 597 | __________________ 598 | Source: Anaheim 599 | 600 | Existing Facilities 601 | 602 | """.decode('ascii', 'ignore').split("\n") 603 | 604 | 605 | # In[ ]: 606 | 607 | rows = [row_feature(l) for l in test_string] 608 | 609 | tables = [rows[b:e] for b,e in filter_row_spans(rows, row_qualifies)] 610 | table = tables[0] 611 | s = structure_rows(table, rows[b-4:b]) 612 | print s[0] 613 | 614 | 615 | # In[ ]: 616 | 617 | test_string =""" 618 | CALIFORNIA MUNICIPAL FINANCE AUTHORITY 619 | Revenue Bonds, Series 2015-A 620 | (City of Anaheim Water System Project) 621 | 622 | MATURITY SCHEDULE 623 | 624 | $58,205,000 Serial Bonds 625 | 626 | Maturity Date Principal Interest 627 | (October 1) Amount Rate Yield CUSIP† 628 | 2015 $ 775,000 2.000% 0.100% 13048TTV5 629 | 2016 1,575,000 2.000 0.300 13048TTW3 630 | 2017 1,620,000 3.000 0.660 13048TTX1 631 | 2018 1,675,000 4.000 0.930 13048TTY9 632 | 2019 2,045,000 5.000 1.150 13048TTZ6 633 | 2020 2,155,000 5.000 1.320 13048TUA9 634 | 2021 2,250,000 4.000 1.520 13048TUB7 635 | 2022 2,610,000 5.000 1.670 13048TUC5 636 | 2023 2,730,000 4.000 1.810 13048TUD3 637 | 2024 2,875,000 5.000 1.920 13048TUE1 638 | 2025 3,025,000 5.000 2.030(c) 13048TUF8 639 | 2026 3,190,000 5.000 2.200(c) 13048TUG6 640 | 2027 3,355,000 5.000 2.320(c) 13048TUH4 641 | 2028 3,520,000 5.000 2.450(c) 13048TUJ0 642 | 2029 3,700,000 5.000 2.520(c) 13048TUK7 643 | 2030 3,880,000 5.000 2.600(c) 13048TUL5 644 | 2031 4,055,000 4.000 3.140(c) 13048TUM3 645 | 2032 4,220,000 4.000 3.190(c) 13048TUN1 646 | 2033 4,390,000 4.000 3.230(c) 13048TUP6 647 | 2034 4,560,000 4.000 3.270(c) 13048TUQ4 648 | 649 | $24,535,000 4.000% Term Bonds due October 1, 2040 – Yield: 3.400%(c); CUSIP†: 13048TUR2 650 | $13,145,000 5.250% Term Bonds due October 1, 2045 – Yield: 2.970%(c); CUSIP†: 13048TUS0 651 | 652 | """.decode('ascii', 'ignore').split("\n") 653 | 654 | 655 | # In[ ]: 656 | 657 | for file in os.listdir('txt'): 658 | 659 | print ("--------" + file + "--------") 660 | 661 | with codecs.open('txt/'+file, "r", "utf-8") as f: 662 | 663 | lines = [l.encode('ascii', 'ignore').replace('\n', '') for l in f] 664 | rows = [row_feature(l) for l in lines] 665 | 666 | for b,e in filter_row_spans(rows, row_qualifies): 667 | print "TABLE STARTING AT LINE", b 668 | table = rows[b:e] 669 | structure, data, headers = structure_rows(table, rows[b-config['meta_info_lines_above']:b]) 670 | print headers 671 | captions = [(col['value'] if 'value' in col.keys() else "---") +" (%s, %s)" % (col['type'], col['subtype']) for col in structure] 672 | print captions 673 | for r in data: 674 | cols = [col['value']+" (%s, %s)" % (col['type'], col['subtype']) for col in r] 675 | print len(cols), cols 676 | 677 | 678 | 679 | # In[ ]: 680 | 681 | rstr =""" 682 | Population Served Outside City (Est.) ................... 8,457 9,000 9,000 9,000 9,000 683 | Total Population Served ........................... 
356,762 355,161 352,793 350,034 345,265 684 | """.decode('ascii', 'ignore').split("\n") 685 | for r in rstr: 686 | print "split", re.split(tokenize_pattern, r) 687 | print "token", [v['value'] for v in row_feature(r)], row_feature(r) 688 | 689 | 690 | # In[ ]: 691 | 692 | #subtype_indicator['test'] = r'.*\$.*' 693 | for sub, indicator in subtype_indicator.iteritems(): 694 | print sub, indicator, re.match(indicator, " .......................................................... $ ") 695 | 696 | 697 | # In[ ]: 698 | 699 | 700 | 701 | -------------------------------------------------------------------------------- /deprecated/deploy_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SERVLET = "server" 4 | 5 | ipython nbconvert --to python $1 6 | OFILEP=${1%%ipynb} 7 | 8 | rm "$SERVLET"".py" 9 | mv "$OFILEP"".py" "$SERVLET"".py" 10 | 11 | nohup python "$SERVLET"".py" &> nohup.out.log& 12 | -------------------------------------------------------------------------------- /deprecated/manifest.yml: -------------------------------------------------------------------------------- 1 | applications: 2 | - path: . 3 | memory: 128M 4 | instances: 1 5 | domain: mybluemix.net 6 | name: TabulaRazr 7 | host: tabularazr 8 | disk_quota: 1024M 9 | 10 | -------------------------------------------------------------------------------- /deprecated/pdf2text_bulk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!pwd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 7, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "0/4 pdftotext -enc UTF-8 -layout pdf/EA716610-EA562590-EA958701.pdf txt/EA716610-EA562590-EA958701.pdf.txt\n", 26 | "1/4 pdftotext -enc UTF-8 -layout pdf/EP753324-ER508056-ER910760.pdf txt/EP753324-ER508056-ER910760.pdf.txt\n", 27 | "2/4 pdftotext -enc UTF-8 -layout pdf/ER544111-ER421289-ER823264.pdf txt/ER544111-ER421289-ER823264.pdf.txt\n", 28 | "3/4 pdftotext -enc UTF-8 -layout pdf/ER588705-ER457598-ER860368.pdf txt/ER588705-ER457598-ER860368.pdf.txt\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "from __future__ import print_function\n", 34 | "import re\n", 35 | "import os\n", 36 | "import codecs\n", 37 | "import string\n", 38 | "\n", 39 | "PDF_SUBFOLDER = 'pdf'\n", 40 | "TXT_SUBFOLDER = 'txt'\n", 41 | "\n", 42 | "def create_path(path):\n", 43 | " try: \n", 44 | " os.makedirs(path)\n", 45 | " except OSError:\n", 46 | " if not os.path.isdir(path):\n", 47 | " raise \n", 48 | "\n", 49 | "#Convert all pdfs\n", 50 | "files = os.listdir(PDF_SUBFOLDER)\n", 51 | "create_path(os.path.join(TXT_SUBFOLDER))\n", 52 | "\n", 53 | "for i,f in enumerate(files):\n", 54 | "\n", 55 | " pdf_path = os.path.join(PDF_SUBFOLDER, f)\n", 56 | " txt_path = os.path.join(TXT_SUBFOLDER, f+'.txt')\n", 57 | " \n", 58 | " if not os.path.isfile(txt_path):\n", 59 | " #Layout preservation crucial to maintain clues about tabular data\n", 60 | " cmd = \"pdftotext -enc UTF-8 -layout %s %s\" % (pdf_path, txt_path)\n", 61 | " print ('%d/%d %s' % (i, len(files), cmd))\n", 62 | " os.system(cmd)\n", 63 | " else:\n", 64 | " print ('skipping %s, already exists.' 
% (pdf_path, ))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 2", 80 | "language": "python", 81 | "name": "python2" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 2 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython2", 93 | "version": "2.7.10" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 0 98 | } 99 | -------------------------------------------------------------------------------- /design/1_Home.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/1_Home.jpg -------------------------------------------------------------------------------- /design/2_Show.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/2_Show.jpg -------------------------------------------------------------------------------- /design/3_Browse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/3_Browse.jpg -------------------------------------------------------------------------------- /design/browse_similar_data_tables_feature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/browse_similar_data_tables_feature.png -------------------------------------------------------------------------------- /design/screenshot_show_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/design/screenshot_show_example.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask 2 | Jinja2 3 | pandas 4 | matplotlib 5 | fuzzywuzzy 6 | python-Levenshtein 7 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | from __future__ import print_function 5 | from backend import * 6 | from data_query import filter_tables 7 | import os 8 | import sys 9 | from collections import OrderedDict 10 | 11 | import json 12 | from flask import Flask, request, redirect, url_for, send_from_directory 13 | from werkzeug import secure_filename 14 | from flask import jsonify, render_template, make_response 15 | import urllib 16 | from urlparse import urlparse 17 | 18 | import matplotlib 19 | #Use the Agg backend so files can be processed without a display (prevents a runtime error) 20 | matplotlib.use('Agg') 21 | 22 | import matplotlib.pyplot as plt 23 | 24 | ####### 25 | 26 | TITLE = "TabulaRazr (XIRR for muni_bonds)" 27 | 28 | scripts = [ 29 | "bower_components/jquery/dist/jquery.min.js", 30 | "bower_components/materialize/dist/js/materialize.js" 31 
| ] 32 | css = [ 33 | "./css/main.css", 34 | "./css/style.css", 35 | "bower_components/materialize/dist/css/materialize.css" 36 | ] 37 | 38 | UPLOAD_FOLDER = './static/ug' 39 | ALLOWED_EXTENSIONS = set(['txt', 'pdf']) 40 | 41 | TITLE = "TabulaRazr" 42 | app = Flask(__name__) 43 | app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER 44 | 45 | 46 | 47 | def get_extension(filename): 48 | return '.' in filename and filename.rsplit('.', 1)[1] 49 | 50 | def allowed_file(filename): 51 | return get_extension(filename) in ALLOWED_EXTENSIONS 52 | 53 | def create_path(path): 54 | try: 55 | os.makedirs(path) 56 | except OSError: 57 | if not os.path.isdir(path): 58 | raise 59 | 60 | @app.route('/', methods=['GET', 'POST']) 61 | def upload_file(): 62 | 63 | if request.method == 'POST': 64 | 65 | file = request.files['file'] 66 | project = request.form['project'] 67 | url = request.form['url'] 68 | path = os.path.join(app.config['UPLOAD_FOLDER'], project) 69 | 70 | filename = None 71 | 72 | if url: 73 | url_fragments = urlparse(url) 74 | filename_temp = url_fragments.path.split(r'/')[-1] 75 | if url_fragments.scheme in ('http', 'ftp') and path and allowed_file(filename_temp): 76 | filename = secure_filename(filename_temp) 77 | create_path(path) 78 | path = os.path.join(path, filename) 79 | urllib.urlretrieve(url, path) 80 | 81 | elif file and allowed_file(file.filename): 82 | filename = secure_filename(file.filename) 83 | create_path(path) 84 | 85 | path = os.path.join(path, filename) 86 | file.save(path) 87 | 88 | if filename: 89 | return redirect(url_for('analyze', project=project, filename=filename)) 90 | 91 | return render_template('index.html', 92 | title=TITLE, 93 | css=css) 94 | 95 | 96 | def analyze_file(filename, project): 97 | 98 | if not project or project in ("/", "-"): 99 | project = "" 100 | 101 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 102 | extension = get_extension(filename) 103 | 104 | txt_path = path 105 | if extension == "pdf": 106 | txt_path += '.txt' 107 | filename += '.txt' 108 | if not os.path.isfile(txt_path): 109 | #Layout preservation crucial to preserve clues about tabular data 110 | cmd = "pdftotext -enc UTF-8 -layout %s %s " % (path, txt_path) 111 | os.system(cmd) 112 | 113 | if not os.path.isfile(txt_path): 114 | return None, filename, jsonify({'error' : txt_path+' not found' }) 115 | 116 | #Export tables 117 | tables = return_tables(txt_path) 118 | 119 | with codecs.open(txt_path + '.tables.json', "w", "utf-8") as file: 120 | json.dump(tables, file) 121 | 122 | #Export chart 123 | lines_per_page = 80 124 | nr_data_rows = [] 125 | #for t in tables.values(): 126 | # print t 127 | for key, t in tables.iteritems(): 128 | e = t['end_line'] 129 | b = t['begin_line'] 130 | for l in range(b, e): 131 | page = l / lines_per_page 132 | if len(nr_data_rows) <= page: 133 | nr_data_rows += ([0]*(page-len(nr_data_rows)+1)) 134 | nr_data_rows[page] += 1 135 | dr = pd.DataFrame() 136 | dr['value'] = nr_data_rows 137 | dr['page'] = range(0, len(dr)) 138 | 139 | #plot the row density 140 | chart = filename+".png" 141 | fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(7,2.5) ) # create figure & 1 axis 142 | ax.set_xlabel('page nr.') 143 | ax.set_ylabel('number of data rows') 144 | ax.set_title('Distribution of Rows with Data') 145 | ax.plot(dr['page'], dr['value'], ) 146 | fig.savefig(txt_path + '.png') # save the figure to file 147 | plt.close(fig) # close the figure 148 | 149 | return tables, filename, None 150 | 151 | #Todo: accept URLs 152 | @app.route('/analyze/<project>/<filename>', 
methods=['GET', 'POST']) 153 | def analyze(filename, project): 154 | 155 | tables, filename_new, error = analyze_file(filename, project) 156 | if error: 157 | return error 158 | 159 | if request.method == 'POST': 160 | return jsonify(tables) 161 | 162 | return redirect(url_for('show_one_file', filename=filename_new, project=project)) 163 | 164 | 165 | #Todo: factor out table rendering, overview etc., i.e. make functions more composable 166 | @app.route('/show/<project>/<filename>') 167 | def show_one_file(filename, project): 168 | 169 | if not project or project in ("/", "-"): 170 | project = "" 171 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 172 | 173 | tables_path = path + '.tables.json' 174 | chart_path_html = os.path.join('ug', project, filename + '.png') 175 | if not os.path.isfile(tables_path): 176 | analyze(filename, project) 177 | 178 | with codecs.open(tables_path, "r", "utf-8") as file: 179 | tables = json.load(file) 180 | tables = OrderedDict(sorted(tables.iteritems(), key=lambda kv: int(kv[0]))) 181 | #Todo: actually do the filtering 182 | filter_arg = request.args.get('filter_arg') 183 | 184 | #Create HTML 185 | notices = ['Extraction Results for ' + filename, 'Ordered by lines', 'Applied filter: %s' % filter_arg] 186 | dfs = (table_to_df(table).to_html() for table in tables.values()) 187 | 188 | headers = [] 189 | for t in tables.values(): 190 | if 'headers' in t: 191 | headers.append(" | ".join(h for h in t['headers'])) 192 | else: 193 | headers.append('NO HEADER') 194 | meta_data = [{'begin_line' : t['begin_line'], 'end_line' : t['end_line'], 195 | 'margin_top' : (t['begin_line']-t['meta_begin_line']) if 'meta_begin_line' in t else config["meta_info_lines_above"]} for t in tables.values()] 196 | 197 | filename_pdf = None 198 | 199 | path_pdf = path[:-4] 200 | if get_extension(path_pdf) == 'pdf' and os.path.isfile(path_pdf): 201 | filename_pdf = filename[:-4] 202 | 203 | return render_template('viewer.html', 204 | title=TITLE + ' - ' + filename, 205 | base_scripts=scripts, filename=filename, filename_pdf=filename_pdf, project=project, 206 | css=css, notices = notices, tables = dfs, headers=headers, meta_data=meta_data, chart=chart_path_html) 207 | 208 | @app.route('/inspector/<project>/<filename>') 209 | def inspector(filename, project): 210 | if not project or project in ("/", "-"): 211 | project = "" 212 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 213 | 214 | begin_line = int(request.args.get('data_begin')) 215 | end_line = int(request.args.get('data_end')) 216 | margin_top = int(request.args.get('margin_top', config["meta_info_lines_above"])) 217 | margin_bottom = margin_top 218 | 219 | #Todo: solve overlap as in more advanced branches 220 | notices = ['showing data lines from %i to %i with %i meta-lines above and below' % (begin_line, end_line, margin_top)] 221 | with codecs.open(path, "r", "utf-8", errors="replace") as file: 222 | lines = [l.encode("utf-8", errors="replace") for l in file][begin_line - margin_top:end_line + margin_bottom] 223 | top_lines = lines[:margin_top] 224 | table_lines = lines[margin_top:margin_top+end_line-begin_line] 225 | bottom_lines = lines[margin_top+end_line-begin_line:] 226 | 227 | offset = begin_line-margin_top 228 | table_id = begin_line 229 | 230 | return render_template('inspector.html', 231 | title=TITLE, 232 | base_scripts=scripts, css=css, notices = notices, filename=filename, top_lines=top_lines, project=project, 233 | table_lines=table_lines, bottom_lines=bottom_lines, offset=offset, table_id=begin_line) 234 | 235 | 
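# ------------------------------------------------------------------------
# Not part of the app: a minimal client sketch for the analyze route above.
# Assumptions (not in this repository): the server is running locally on
# its default port 7081, the requests package is installed, and a file
# named sample.pdf was previously uploaded into a project called muni_bonds.
#
#   import requests
#   # a POST to /analyze returns the extracted tables as JSON,
#   # keyed by each table's begin_line
#   tables = requests.post(
#       'http://localhost:7081/analyze/muni_bonds/sample.pdf').json()
#   for key, t in sorted(tables.items(), key=lambda kv: int(kv[0])):
#       print(key, t['begin_line'], t['end_line'])
# ------------------------------------------------------------------------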
@app.route('/project_analysis', methods=['POST']) 236 | def project_analysis(): 237 | project = request.form['project'] 238 | if not project or project in ("/", "-"): 239 | project = "" 240 | filter_arg = request.form['filter'] 241 | return redirect(url_for('filter_tables_web', project=project, filter=filter_arg)) 242 | 243 | @app.route('/filter_tables/<project>', methods=['GET', 'POST']) 244 | def filter_tables_web(project): 245 | if not project or project in ("/", "-"): 246 | project = "" 247 | path = os.path.join(app.config['UPLOAD_FOLDER'], project) 248 | 249 | filter_arg = request.args.get('filter') 250 | filter_file = os.path.join('static', 'filters', filter_arg +'.json') 251 | with codecs.open(filter_file, "r", "utf-8", errors="replace") as file: 252 | _filter = json.load(file) 253 | 254 | #Go through all .txt files in the project, grab tables and return filtered result 255 | files = os.listdir(path) 256 | results = {} 257 | files_analyzed = set() 258 | nr_tables = 0 259 | for i,f in enumerate(files): 260 | 261 | extension = get_extension(f) 262 | tables_path = os.path.join(path, f + '.tables.json') 263 | 264 | if extension == "txt": 265 | 266 | tables = None 267 | if not os.path.isfile(tables_path): 268 | #Analyze on the spot: 269 | tables, f_new, error = analyze_file(f, project) 270 | print ("on the spot", f_new, project, tables_path, error, len(tables) if tables else 0) 271 | if error: 272 | return error 273 | else: 274 | with codecs.open(tables_path, "r", "utf-8") as file: 275 | tables = json.load(file) 276 | files_analyzed.add(f) 277 | 278 | #Only keep highest results 279 | for t in filter_tables(tables.values(), _filter): 280 | if f not in results: 281 | results[f] = [t] 282 | else: 283 | max_c = max(r[0] for r in results[f]) 284 | if t[0] >= max_c: 285 | results[f].append(t) 286 | nr_tables += len(tables) 287 | #Keep all results 288 | #results[f] = [t for t in filter_tables(tables.values(), _filter)] 289 | 290 | #return jsonify(results) 291 | #Todo: create .csv for download 292 | for filename, extracted_results in results.iteritems(): 293 | for result in extracted_results: 294 | t_html = table_to_df(result[1]).to_html() 295 | result[1]['html'] = t_html 296 | 297 | total_best_tables = sum(len(results[r]) for r in results.keys()) 298 | notices = ["Project %s filtered by %s" % (project, filter_arg), 299 | "Total of %i tables exist in %i files" % (nr_tables, len(files_analyzed)), 300 | "%i best tables across %i files" % (total_best_tables, len(results)) ] 301 | 302 | return render_template('filtered_project.html', 303 | title=TITLE + ' - ' + project + ' filtered by ' + filter_arg, 304 | base_scripts=scripts, project=project, 305 | css=css, notices = notices, results=results) 306 | 307 | from xirr_calc import xirr 308 | import traceback 309 | @app.route('/calculate_xirr/<project>/<filename>') 310 | def calculate_xirr(filename, project): 311 | 312 | if not project or project in ("/", "-"): 313 | project = "" 314 | path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename) 315 | tables_path = path + '.tables.json' 316 | 317 | if not os.path.isfile(tables_path): 318 | analyze(filename, project) 319 | 320 | with codecs.open(tables_path, "r", "utf-8") as file: 321 | tables = json.load(file) 322 | 323 | #Todo: factor out into xirr_calc 324 | results = {"funds" : [], "maturity_schedule" : [] } 325 | try: 326 | #Todo: factor out into "take_one" function 327 | for k, filter_results in results.iteritems(): 328 | 329 | filter_file = os.path.join('static', 'filters', k+'.json') 330 | with codecs.open(filter_file, "r", "utf-8", errors="replace") as 
file: 331 | _filter = json.load(file) 332 | 333 | #Only keep highest results 334 | for t in filter_tables(tables.values(), _filter): 335 | if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results): 336 | filter_results.append(t) 337 | t_html = table_to_df(t[1]).to_html() 338 | filter_results[-1][1]['html'] = t_html 339 | 340 | log = [] 341 | # Get salient tables 342 | log.append("Found %i candidates for funds and %i for maturity schedule" % \ 343 | (len(results['funds']), len(results['maturity_schedule'])) ) 344 | funds_table = max( sorted( results['funds'], key = lambda t: t[1]['begin_line'] ), key = lambda t: t[0])[1] 345 | schedule_table = max( sorted( results['maturity_schedule'], key = lambda t: t[1]['begin_line'] ), key = lambda t: t[0])[1] 346 | log.append("Using table %i for funds and table %i for maturity schedule" 347 | % (funds_table['begin_line'], schedule_table['begin_line'])) 348 | 349 | with codecs.open(path, "r", "utf-8") as file: 350 | rate, log_list = xirr(file, funds_table, schedule_table) 351 | 352 | log += log_list 353 | if rate: 354 | log.append("Final Rate: %0.2f%%" % rate) 355 | 356 | except Exception as e: 357 | log.append("... failed with %s" % traceback.format_exception(*sys.exc_info())) 358 | return render_template('view_filtered.html', 359 | title=TITLE + ' - ' + filename + ' XIRR calculator with filters: ' + ", ".join(results.keys()), 360 | base_scripts=scripts, filename=filename, project=project, 361 | css=css, notices = log, results=results) 362 | 363 | 364 | def run_from_ipython(): 365 | try: 366 | __IPYTHON__ 367 | return True 368 | except NameError: 369 | return False 370 | 371 | if __name__ == "__main__": 372 | if run_from_ipython(): 373 | app.run(host='0.0.0.0', port = 7080) 374 | else: 375 | PORT = int(os.getenv('PORT', 7081)) 376 | app.run(debug=True, host='0.0.0.0', port = PORT) 377 | 378 | 379 | 380 | 381 | 382 | -------------------------------------------------------------------------------- /static/TabulaRazr_Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/TabulaRazr_Logo.png -------------------------------------------------------------------------------- /static/center_for_municipal_finance_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/center_for_municipal_finance_logo.png -------------------------------------------------------------------------------- /static/css/main.css: -------------------------------------------------------------------------------- 1 | 2 | .pyx-slider { 3 | width: 200px !important; 4 | } 5 | 6 | .custom-dt { 7 | width: 600px; 8 | } 9 | 10 | html { 11 | height: 100%; 12 | } 13 | 14 | body { 15 | height: 100%; 16 | font-style:normal; 17 | font-family:"BrandonText-Regular", sans-serif; 18 | } 19 | 20 | footer { 21 | position: absolute; 22 | bottom: 0; 23 | width: 100%; 24 | } 25 | 26 | #logos { 27 | line-height:100px; 28 | } 29 | 30 | #logos img { 31 | display: inline-block; 32 | vertical-align: middle; 33 | } 34 | 35 | .info-block-icon { 36 | display: inline-block; 37 | vertical-align: middle; 38 | font-size: 3.5em; 39 | margin: 0 3% 0 0; 40 | } 41 | 42 | .info-block-text { 43 | display: inline-block; 44 | vertical-align: middle; 45 | font-size: 1.5em; 46 | } 47 | 48 | @media only screen and (max-width: 1250px) { 49 | .info-block-icon { 50 | font-size: 3em; 51 | } 52 | .info-block-text { 53 | font-size: 1.25em; 54 | } 55 | } 56 | 57 | #mychart { 58 | display: inline-block; 59 | } 60 | 61 | #myhist { 62 | display: inline-block; 63 | } 64 | 65 | select { 66 | display: initial !important; 67 | } 68 | 69 | .icon-block .material-icons { 70 | font-size: inherit !important; 71 | } 72 | 73 | .full-width { 74 | width: 100%; 75 | } 76 | 77 | .card-btn { 78 | box-shadow: 0 2px 5px 0 rgba(0, 0, 0, 0.16), 0 2px 10px 0 rgba(0, 0, 0, 0.12); 79 | } 80 | 81 | .card-btn:hover { 82 | transition: all .25s; 83 | box-shadow: 0 5px 11px 0 rgba(0, 0, 0, 0.18), 0 4px 15px 0 rgba(0, 0, 0, 0.15); 84 | } 85 | 86 | .card-btn.blue:hover { 87 | background-color: #42a5f5 !important; 88 | } 89 | 90 | h5.truncate { 91 | cursor: pointer; 92 | } 93 | 94 | blockquote { 95 | border-color: #2196F3 !important; 96 | } 97 | 98 | .table-showhide { 99 | float: right; 100 | } 101 | 102 | .dataTables_scroll { 103 | clear: both; 104 | overflow: auto; 105 | } 106 | 107 | .mg-line1-color { 108 | stroke: #7bc9c1; 109 | } 110 | 111 | .mg-area1-color { 112 | fill: 
#7bc9c1; 113 | } 114 | 115 | .mg-histogram .mg-bar rect.active { 116 | fill: #7bc9c1; 117 | } 118 | 119 | .mg-histogram .mg-bar rect { 120 | fill: #7bc9c1; 121 | shape-rendering: auto; 122 | } 123 | 124 | 125 | .mg-x-axis text, 126 | .mg-y-axis text, 127 | .mg-histogram .axis text { 128 | fill: black; 129 | font-size: 1.2rem; 130 | opacity: 0.5; 131 | } 132 | 133 | .mg-x-axis .label, 134 | .mg-y-axis .label, 135 | .mg-axis .label { 136 | font-size: 1.2rem; 137 | text-transform: uppercase; 138 | font-weight: 400; 139 | } 140 | 141 | /* Temporary fix - hide footer when screen is not sufficiently tall */ 142 | @media only screen and (max-height: 730px) { 143 | .page-footer { 144 | display: none; 145 | } 146 | } 147 | 148 | .footer-image { 149 | width: 60px; 150 | box-shadow: 0 2px 5px 0 rgba(0, 0, 0, 0.16), 0 2px 10px 0 rgba(0, 0, 0, 0.12); 151 | } 152 | 153 | .footer-image:hover { 154 | transition: all .25s; 155 | box-shadow: 0 5px 11px 0 rgba(0, 0, 0, 0.18), 0 4px 15px 0 rgba(0, 0, 0, 0.15); 156 | } 157 | 158 | #find-us-list li { 159 | display: inline-block; 160 | margin-right: 0.5em; 161 | vertical-align: top; 162 | } -------------------------------------------------------------------------------- /static/css/source/index.html: -------------------------------------------------------------------------------- [Demo page for the table styles defined in static/css/style.css. Its HTML markup did not survive extraction; only the cell text of its fourteen sample tables remains. The page, titled "Top 10 Express Table Designs - Smashing Magazine Source", cycles through four small datasets: an employee roster (Employee / Salary / Bonus / Supervisor), a movie grid (Comedy / Adventure / Action / Children), quarterly company figures (Company / Q1 / Q2 / Q3 / Q4), and a country list (Nation / Capital / Language / Unique).] -------------------------------------------------------------------------------- /static/css/source/table-images/back.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/back.png -------------------------------------------------------------------------------- /static/css/source/table-images/blurry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/blurry.jpg -------------------------------------------------------------------------------- /static/css/source/table-images/botleft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/botleft.png -------------------------------------------------------------------------------- /static/css/source/table-images/botright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/botright.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradback.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradhead.png -------------------------------------------------------------------------------- /static/css/source/table-images/gradhover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/gradhover.png -------------------------------------------------------------------------------- /static/css/source/table-images/header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/header.jpg -------------------------------------------------------------------------------- /static/css/source/table-images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/left.png -------------------------------------------------------------------------------- /static/css/source/table-images/pattern-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern-head.png
-------------------------------------------------------------------------------- /static/css/source/table-images/pattern.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern.gif -------------------------------------------------------------------------------- /static/css/source/table-images/pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/pattern.png -------------------------------------------------------------------------------- /static/css/source/table-images/patternb-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/patternb-head.png -------------------------------------------------------------------------------- /static/css/source/table-images/patternb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/patternb.png -------------------------------------------------------------------------------- /static/css/source/table-images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/css/source/table-images/right.png -------------------------------------------------------------------------------- /static/css/style.css: -------------------------------------------------------------------------------- 1 | /* ------------------ 2 | styling for the tables 3 | ------------------ */ 4 | 5 | 6 | body 7 | { 8 | line-height: 1.6em; 9 | } 10 | 11 | 12 | .dataframe 13 | { 14 | font-family: "Myriad Pro", "Open Sans", Sans-Serif; 15 | font-size: 14px; 16 | background: #fff; 17 | margin: 10px; 18 | width: 100%; 19 | border-collapse: collapse; 20 | text-align: left; 21 | } 22 | .dataframe thead th 23 | { 24 | font-size: 14px; 25 | font-weight: normal; 26 | color: #039; 27 | padding: 10px 8px; 28 | border-bottom: 2px solid #6678b1; 29 | } 30 | 31 | .dataframe tbody tr{ 32 | border-bottom: 1px solid #B4C1E8; 33 | } 34 | .dataframe td 35 | { 36 | transition: all 0.3s; 37 | color: #669; 38 | padding: 9px 8px 9px 8px; 39 | } 40 | 41 | .dataframe tbody tr:nth-child(even) { 42 | background: #F5F5F5; 43 | } 44 | 45 | .dataframe tbody tr:hover{ 46 | background-color: #F1F1F9; 47 | } 48 | 49 | .dataframe tbody tr:hover td 50 | { 51 | color: #009; 52 | } 53 | 54 | .meta-line, .table-line 55 | { 56 | padding: 0px; 57 | margin: 0px; 58 | } 59 | 60 | .meta-line 61 | { 62 | color: grey; 63 | } 64 | 65 | 66 | #hor-minimalist-b 67 | { 68 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 69 | font-size: 12px; 70 | background: #fff; 71 | margin: 45px; 72 | width: 480px; 73 | border-collapse: collapse; 74 | text-align: left; 75 | } 76 | #hor-minimalist-b th 77 | { 78 | font-size: 14px; 79 | font-weight: normal; 80 | color: #039; 81 | padding: 10px 8px; 82 | border-bottom: 2px solid #6678b1; 83 | } 84 | #hor-minimalist-b td 85 | { 86 | border-bottom: 1px solid #ccc; 87 | color: #669; 88 | padding: 6px 8px; 89 | 
} 90 | #hor-minimalist-b tbody tr:hover td 91 | { 92 | color: #009; 93 | } 94 | 95 | 96 | #ver-minimalist 97 | { 98 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 99 | font-size: 12px; 100 | margin: 45px; 101 | width: 480px; 102 | text-align: left; 103 | border-collapse: collapse; 104 | } 105 | #ver-minimalist th 106 | { 107 | padding: 8px 2px; 108 | font-weight: normal; 109 | font-size: 14px; 110 | border-bottom: 2px solid #6678b1; 111 | border-right: 30px solid #fff; 112 | border-left: 30px solid #fff; 113 | color: #039; 114 | } 115 | #ver-minimalist td 116 | { 117 | padding: 12px 2px 0px 2px; 118 | border-right: 30px solid #fff; 119 | border-left: 30px solid #fff; 120 | color: #669; 121 | } 122 | 123 | 124 | #box-table-a 125 | { 126 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 127 | font-size: 12px; 128 | margin: 45px; 129 | width: 480px; 130 | text-align: left; 131 | border-collapse: collapse; 132 | } 133 | #box-table-a th 134 | { 135 | font-size: 13px; 136 | font-weight: normal; 137 | padding: 8px; 138 | background: #b9c9fe; 139 | border-top: 4px solid #aabcfe; 140 | border-bottom: 1px solid #fff; 141 | color: #039; 142 | } 143 | #box-table-a td 144 | { 145 | padding: 8px; 146 | background: #e8edff; 147 | border-bottom: 1px solid #fff; 148 | color: #669; 149 | border-top: 1px solid transparent; 150 | } 151 | #box-table-a tr:hover td 152 | { 153 | background: #d0dafd; 154 | color: #339; 155 | } 156 | 157 | 158 | #box-table-b 159 | { 160 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 161 | font-size: 12px; 162 | margin: 45px; 163 | width: 480px; 164 | text-align: center; 165 | border-collapse: collapse; 166 | border-top: 7px solid #9baff1; 167 | border-bottom: 7px solid #9baff1; 168 | } 169 | #box-table-b th 170 | { 171 | font-size: 13px; 172 | font-weight: normal; 173 | padding: 8px; 174 | background: #e8edff; 175 | border-right: 1px solid #9baff1; 176 | border-left: 1px solid #9baff1; 177 | color: #039; 178 | } 179 | #box-table-b td 180 | { 181 | padding: 8px; 182 | background: #e8edff; 183 | border-right: 1px solid #aabcfe; 184 | border-left: 1px solid #aabcfe; 185 | color: #669; 186 | } 187 | 188 | 189 | #hor-zebra 190 | { 191 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 192 | font-size: 12px; 193 | margin: 45px; 194 | width: 480px; 195 | text-align: left; 196 | border-collapse: collapse; 197 | } 198 | #hor-zebra th 199 | { 200 | font-size: 14px; 201 | font-weight: normal; 202 | padding: 10px 8px; 203 | color: #039; 204 | } 205 | #hor-zebra td 206 | { 207 | padding: 8px; 208 | color: #669; 209 | } 210 | #hor-zebra .odd 211 | { 212 | background: #e8edff; 213 | } 214 | 215 | 216 | #ver-zebra 217 | { 218 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 219 | font-size: 12px; 220 | margin: 45px; 221 | width: 480px; 222 | text-align: left; 223 | border-collapse: collapse; 224 | } 225 | #ver-zebra th 226 | { 227 | font-size: 14px; 228 | font-weight: normal; 229 | padding: 12px 15px; 230 | border-right: 1px solid #fff; 231 | border-left: 1px solid #fff; 232 | color: #039; 233 | } 234 | #ver-zebra td 235 | { 236 | padding: 8px 15px; 237 | border-right: 1px solid #fff; 238 | border-left: 1px solid #fff; 239 | color: #669; 240 | } 241 | .vzebra-odd 242 | { 243 | background: #eff2ff; 244 | } 245 | .vzebra-even 246 | { 247 | background: #e8edff; 248 | } 249 | #ver-zebra #vzebra-adventure, #ver-zebra #vzebra-children 250 | { 251 | background: #d0dafd; 252 | border-bottom: 1px solid #c8d4fd; 253 
| } 254 | #ver-zebra #vzebra-comedy, #ver-zebra #vzebra-action 255 | { 256 | background: #dce4ff; 257 | border-bottom: 1px solid #d6dfff; 258 | } 259 | 260 | 261 | #one-column-emphasis 262 | { 263 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 264 | font-size: 12px; 265 | margin: 45px; 266 | width: 480px; 267 | text-align: left; 268 | border-collapse: collapse; 269 | } 270 | #one-column-emphasis th 271 | { 272 | font-size: 14px; 273 | font-weight: normal; 274 | padding: 12px 15px; 275 | color: #039; 276 | } 277 | #one-column-emphasis td 278 | { 279 | padding: 10px 15px; 280 | color: #669; 281 | border-top: 1px solid #e8edff; 282 | } 283 | .oce-first 284 | { 285 | background: #d0dafd; 286 | border-right: 10px solid transparent; 287 | border-left: 10px solid transparent; 288 | } 289 | #one-column-emphasis tr:hover td 290 | { 291 | color: #339; 292 | background: #eff2ff; 293 | } 294 | 295 | 296 | #newspaper-a 297 | { 298 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 299 | font-size: 12px; 300 | margin: 45px; 301 | width: 480px; 302 | text-align: left; 303 | border-collapse: collapse; 304 | border: 1px solid #69c; 305 | } 306 | #newspaper-a th 307 | { 308 | padding: 12px 17px 12px 17px; 309 | font-weight: normal; 310 | font-size: 14px; 311 | color: #039; 312 | border-bottom: 1px dashed #69c; 313 | } 314 | #newspaper-a td 315 | { 316 | padding: 7px 17px 7px 17px; 317 | color: #669; 318 | } 319 | #newspaper-a tbody tr:hover td 320 | { 321 | color: #339; 322 | background: #d0dafd; 323 | } 324 | 325 | 326 | #newspaper-b 327 | { 328 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 329 | font-size: 12px; 330 | margin: 45px; 331 | width: 480px; 332 | text-align: left; 333 | border-collapse: collapse; 334 | border: 1px solid #69c; 335 | } 336 | #newspaper-b th 337 | { 338 | padding: 15px 10px 10px 10px; 339 | font-weight: normal; 340 | font-size: 14px; 341 | color: #039; 342 | } 343 | #newspaper-b tbody 344 | { 345 | background: #e8edff; 346 | } 347 | #newspaper-b td 348 | { 349 | padding: 10px; 350 | color: #669; 351 | border-top: 1px dashed #fff; 352 | } 353 | #newspaper-b tbody tr:hover td 354 | { 355 | color: #339; 356 | background: #d0dafd; 357 | } 358 | 359 | 360 | #newspaper-c 361 | { 362 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 363 | font-size: 12px; 364 | margin: 45px; 365 | width: 480px; 366 | text-align: left; 367 | border-collapse: collapse; 368 | border: 1px solid #6cf; 369 | } 370 | #newspaper-c th 371 | { 372 | padding: 20px; 373 | font-weight: normal; 374 | font-size: 13px; 375 | color: #039; 376 | text-transform: uppercase; 377 | border-right: 1px solid #0865c2; 378 | border-top: 1px solid #0865c2; 379 | border-left: 1px solid #0865c2; 380 | border-bottom: 1px solid #fff; 381 | } 382 | #newspaper-c td 383 | { 384 | padding: 10px 20px; 385 | color: #669; 386 | border-right: 1px dashed #6cf; 387 | } 388 | 389 | 390 | #rounded-corner 391 | { 392 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 393 | font-size: 12px; 394 | margin: 45px; 395 | width: 480px; 396 | text-align: left; 397 | border-collapse: collapse; 398 | } 399 | #rounded-corner thead th.rounded-company 400 | { 401 | background: #b9c9fe url('table-images/left.png') left -1px no-repeat; 402 | } 403 | #rounded-corner thead th.rounded-q4 404 | { 405 | background: #b9c9fe url('table-images/right.png') right -1px no-repeat; 406 | } 407 | #rounded-corner th 408 | { 409 | padding: 8px; 410 | font-weight: normal; 411 | font-size: 
13px; 412 | color: #039; 413 | background: #b9c9fe; 414 | } 415 | #rounded-corner td 416 | { 417 | padding: 8px; 418 | background: #e8edff; 419 | border-top: 1px solid #fff; 420 | color: #669; 421 | } 422 | #rounded-corner tfoot td.rounded-foot-left 423 | { 424 | background: #e8edff url('table-images/botleft.png') left bottom no-repeat; 425 | } 426 | #rounded-corner tfoot td.rounded-foot-right 427 | { 428 | background: #e8edff url('table-images/botright.png') right bottom no-repeat; 429 | } 430 | #rounded-corner tbody tr:hover td 431 | { 432 | background: #d0dafd; 433 | } 434 | 435 | 436 | #background-image 437 | { 438 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 439 | font-size: 12px; 440 | margin: 45px; 441 | width: 480px; 442 | text-align: left; 443 | border-collapse: collapse; 444 | background: url('table-images/blurry.jpg') 330px 59px no-repeat; 445 | } 446 | #background-image th 447 | { 448 | padding: 12px; 449 | font-weight: normal; 450 | font-size: 14px; 451 | color: #339; 452 | } 453 | #background-image td 454 | { 455 | padding: 9px 12px; 456 | color: #669; 457 | border-top: 1px solid #fff; 458 | } 459 | #background-image tfoot td 460 | { 461 | font-size: 11px; 462 | } 463 | #background-image tbody td 464 | { 465 | background: url('table-images/back.png'); 466 | } 467 | * html #background-image tbody td 468 | { 469 | /* 470 | ---------------------------- 471 | PUT THIS ON IE6 ONLY STYLE 472 | AS THE RULE INVALIDATES 473 | YOUR STYLESHEET 474 | ---------------------------- 475 | */ 476 | filter:progid:DXImageTransform.Microsoft.AlphaImageLoader(src='table-images/back.png',sizingMethod='crop'); 477 | background: none; 478 | } 479 | #background-image tbody tr:hover td 480 | { 481 | color: #339; 482 | background: none; 483 | } 484 | 485 | 486 | #gradient-style 487 | { 488 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 489 | font-size: 12px; 490 | margin: 45px; 491 | width: 480px; 492 | text-align: left; 493 | border-collapse: collapse; 494 | } 495 | #gradient-style th 496 | { 497 | font-size: 13px; 498 | font-weight: normal; 499 | padding: 8px; 500 | background: #b9c9fe url('table-images/gradhead.png') repeat-x; 501 | border-top: 2px solid #d3ddff; 502 | border-bottom: 1px solid #fff; 503 | color: #039; 504 | } 505 | #gradient-style td 506 | { 507 | padding: 8px; 508 | border-bottom: 1px solid #fff; 509 | color: #669; 510 | border-top: 1px solid #fff; 511 | background: #e8edff url('table-images/gradback.png') repeat-x; 512 | } 513 | #gradient-style tfoot tr td 514 | { 515 | background: #e8edff; 516 | font-size: 12px; 517 | color: #99c; 518 | } 519 | #gradient-style tbody tr:hover td 520 | { 521 | background: #d0dafd url('table-images/gradhover.png') repeat-x; 522 | color: #339; 523 | } 524 | 525 | 526 | #pattern-style-a 527 | { 528 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 529 | font-size: 12px; 530 | margin: 45px; 531 | width: 480px; 532 | text-align: left; 533 | border-collapse: collapse; 534 | background: url('table-images/pattern.png'); 535 | } 536 | #pattern-style-a thead tr 537 | { 538 | background: url('table-images/pattern-head.png'); 539 | } 540 | #pattern-style-a th 541 | { 542 | font-size: 13px; 543 | font-weight: normal; 544 | padding: 8px; 545 | border-bottom: 1px solid #fff; 546 | color: #039; 547 | } 548 | #pattern-style-a td 549 | { 550 | padding: 8px; 551 | border-bottom: 1px solid #fff; 552 | color: #669; 553 | border-top: 1px solid transparent; 554 | } 555 | #pattern-style-a tbody tr:hover td 556 | { 
557 | color: #339; 558 | background: #fff; 559 | } 560 | 561 | 562 | #pattern-style-b 563 | { 564 | font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; 565 | font-size: 12px; 566 | margin: 45px; 567 | width: 480px; 568 | text-align: left; 569 | border-collapse: collapse; 570 | background: url('table-images/patternb.png'); 571 | } 572 | #pattern-style-b thead tr 573 | { 574 | background: url('table-images/patternb-head.png'); 575 | } 576 | #pattern-style-b th 577 | { 578 | font-size: 13px; 579 | font-weight: normal; 580 | padding: 8px; 581 | border-bottom: 1px solid #fff; 582 | color: #039; 583 | } 584 | #pattern-style-b td 585 | { 586 | padding: 8px; 587 | border-bottom: 1px solid #fff; 588 | color: #669; 589 | border-top: 1px solid transparent; 590 | } 591 | #pattern-style-b tbody tr:hover td 592 | { 593 | color: #339; 594 | background: #cdcdee; 595 | } 596 | -------------------------------------------------------------------------------- /static/filters/funds.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "Estimated uses and sources of funds", 3 | "headers" : { 4 | "terms" : ["USES OF FUNDS"], 5 | "threshold" : 0.35 6 | } 7 | } -------------------------------------------------------------------------------- /static/filters/maturity_schedule.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "Maturity Schedule / Debt Service", 3 | "headers" : { 4 | "terms" : ["DEBT SERVICE", "Bonds Maturing"], 5 | "threshold" : 0.35 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /static/scrutiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/scrutiny.png -------------------------------------------------------------------------------- /static/xirr_calculator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahirner/TabulaRazr-OS/4e37af539be3a7de7beac74a24cdb097e2388908/static/xirr_calculator.png -------------------------------------------------------------------------------- /templates/filtered_project.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 | 17 | {% for s in base_scripts %} 18 | 19 | {% endfor %} 20 | 21 | {% for s in page_scripts %} 22 | 23 | {% endfor %} 24 | 25 | 26 | 27 |
    28 | {% for n in notices %} 29 |
  • {{n}}
  • 30 | {% endfor %} 31 |
32 | 33 |

34 |

35 |
    36 | {% for filename, results in results.iteritems() %} 37 |
  • 38 |

    {{filename}} has {{results|length}} 39 | highest ranking table(s)

    40 | {% for result in results %} 41 | {% set table = result[1] %} 42 |
    Confidence = {{result[0]}} in Table {{table["begin_line"]}}
    43 | {% for header in table["headers"] %} 44 | {{header}}
    45 | {% endfor %} 46 | {{table["html"]|safe}} 47 | {% endfor %} 48 |
  • 49 | 50 | {% endfor %} 51 | 52 |
53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | {% for j in scripts %} 16 | 17 | {% endfor %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 |
27 | Fork me on GitHub 28 |
29 | 30 | 31 |
32 |
33 |
34 |

Extract the real costs from Municipal Bonds

35 |
36 | Browse .pdf or .txt file
37 |
38 |
39 | File 40 | 41 |
42 |
43 | 44 |
45 |
46 | .. or paste URL
47 | Choose project
48 | 56 |

57 | 60 |
61 | 62 |
63 |
64 |

cloud_upload

65 |

Upload PDF documents

66 |
67 |
68 |

unarchive

69 |

Extract tabular data

70 |
71 |
72 |

visibility

73 |

Analyze and compare tables

74 |
75 |
76 | 100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
About TabulaRazr
108 |

Extract and browse tabular data from legacy financial documents with ease. 109 |
DeveloperWeek (Accelerate.im) Project Page 110 |

111 |
112 |
113 |
Find Us On
114 |
    115 |
  • 116 |
  • email
  • 117 |
118 |
119 |
120 |
121 | 126 |
127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /templates/inspector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 |
20 | 21 |
22 | 23 | add 24 | 25 | 28 |
29 | 30 |
31 | {% for s in base_scripts %} 32 | 33 | {% endfor %} 34 | 35 | {% for s in page_scripts %} 36 | 37 | {% endfor %} 38 | 39 |

Viewing Table @{{table_id}} from {{filename}}

40 | 41 | {% for s in page_scripts %} 42 | 43 | {% endfor %} 44 | 45 | {% for n in notices %} 46 |
{{n}} 47 | {% endfor %} 48 | 49 | 50 |

Context Rows Above

51 | {% for l in top_lines %} 52 |
{{loop.index0+offset}} {{l.decode('utf-8')}}
53 | {% endfor %} 54 |

Table Rows and Lines

55 | {% for l in table_lines %} 56 | 57 | {% endfor %} 58 |

Context Rows Below

59 | {% for l in bottom_lines %} 60 | 61 | {% endfor %} 62 |
63 | 64 | 65 | -------------------------------------------------------------------------------- /templates/view_filtered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | 12 | {% endfor %} 13 | 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 |
22 |
23 | 24 | add 25 | 26 | 29 |
30 |
31 | {% for s in base_scripts %} 32 | 33 | {% endfor %} 34 | 35 | {% for s in page_scripts %} 36 | 37 | {% endfor %} 38 | 39 |

Calculate Total Interest Cost for {{filename}}

40 |
41 |
42 |
    43 | {% for n in notices %} 44 |
  • {{n|safe}}
  • 45 | {% endfor %} 46 |
47 |
48 |
49 | View all tables 50 |
51 |
52 | 53 | {% for filter, results in results.iteritems() %} 54 | 55 |

Filter {{filter}} returned these tables with high confidence:

56 | 57 | {% for result in results %} 58 |
  • 59 |
    60 | 61 | 62 | {% set table = result[1] %} 63 | Confidence = {{result[0]}} in Table {{table["begin_line"]}}
    64 | {% for header in table["headers"] %} 65 | {{header}}
    66 | {% endfor %} 67 | 68 | {{table["html"]|safe}} 69 |
    70 |
  • 71 | {% endfor %} 72 | 73 | {% endfor %} 74 | 75 | 76 |
    77 | 78 | 79 | -------------------------------------------------------------------------------- /templates/viewer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | {% for s in css %} 10 | 11 | {% endfor %} 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
    21 | 22 | 23 |
    24 |
    25 | 26 | add 27 | 28 | 31 |
    32 |
    33 | {% for s in base_scripts %} 34 | 35 | {% endfor %} 36 | 37 | {% for s in page_scripts %} 38 | 39 | {% endfor %} 40 | 41 |
    42 | {% for n in notices %} 43 | {% if loop.index==1 %}

    {{n}}

    {% endif %} 44 | {% if loop.index!=1 %}
    {{n}}
    {% endif %} 45 | {% endfor %} 46 |
    47 | 48 |
    49 |
    50 | 51 |
    52 | 53 |
    54 | {% if filename_pdf %} 55 | View raw pdf file
    56 | {% endif %} 57 | View raw txt file 58 | {% if 'muni_bonds' in project %} 59 | 60 |
    61 | 62 | Calculate XIRR [BETA] 63 | 64 | 65 |
    66 | {% endif %} 67 |
    68 |
    69 | 70 | 71 | {% for table in tables %} 72 |
    73 | 74 | 75 |
    {{headers[loop.index-1]}}
    76 | 77 | lines {{meta_data[loop.index-1].begin_line}} to {{meta_data[loop.index-1].end_line}} 78 | Show/Hide 79 | {{ table|safe }} 80 |
    81 | {% endfor %} 82 |
83 | 84 | 96 | 97 | -------------------------------------------------------------------------------- /xirr_calc.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | #Calculations adapted from Marc Joffe, 2016 5 | 6 | import os 7 | import sys 8 | import json 9 | from backend import * 10 | from data_query import * 11 | 12 | import traceback 13 | import time 14 | 15 | from itertools import chain 16 | 17 | def calc_net_proceeds(table, first_cf_dict, log=None): 18 | v = get_key_values(table, first_cf_dict) 19 | #Avoid picking up discount twice 20 | if v['discount'] == v['underwriter_discount']: 21 | v['discount'] = 0.0 22 | 23 | #Avoid picking up underwriter discount if included in cost of issuance 24 | if v['underwriter_discount'] == v['cost_of_issuance']: 25 | v['underwriter_discount'] = 0.0 26 | 27 | if log: 28 | log.append("working with these values for calculating net proceeds: %s" % str(v)) 29 | if not (v['premium'] or v['discount'] or v['underwriter_discount']): 30 | log.append("Warning: neither a premium nor discounts found") 31 | 32 | net_proceeds_calc = + v['face_value'] \ 33 | + (v['premium'] or 0.) \ 34 | - (v['discount'] or 0.) \ 35 | - (v['underwriter_discount'] or 0.) \ 36 | - v['cost_of_issuance'] 37 | 38 | # Added by Marc 20160306 - Calculate and display cost of issuance and underwriter discount data 39 | total_cost_of_issuance = v['underwriter_discount'] + v['cost_of_issuance'] 40 | total_cost_of_issuance_pct_of_face = total_cost_of_issuance / v['face_value'] 41 | underwriter_discount_pct_of_face = v['underwriter_discount'] / v['face_value'] 42 | log.append("Underwriter Discount as Percent of Face Value: %s" % '{:5.4f}'.format(underwriter_discount_pct_of_face)) 43 | log.append("Total Cost of Issuance as Percent of Face Value: %s" % '{:5.4f}'.format(total_cost_of_issuance_pct_of_face)) 44 | log.append("Total Cost of Issuance (Including Underwriter Discount): %s" % '{:15,.2f}'.format(total_cost_of_issuance)) 45 | 46 | return net_proceeds_calc # (a toy worked example of this arithmetic follows the end of this file) 47 | 48 | #Todo: refactor into class 49 | debug_each_guess = True # Set to False to silence the per-guess output 50 | 51 | 52 | def newton(func, x0, fprime=None, args=(), tol=1.48e-8, maxiter=50): 53 | """Given a function of a single variable and a starting point, 54 | find a nearby zero using Newton-Raphson. 55 | 56 | fprime is the derivative of the function. If not given, the 57 | Secant method is used. 58 | 59 | # Source: http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.newton.html 60 | # File: scipy.optimize.minpack.py 61 | # License: BSD: http://www.scipy.org/License_Compatibility 62 | """ 63 | 64 | if fprime is not None: 65 | p0 = x0 66 | for iter in range(maxiter): 67 | myargs = (p0,)+args 68 | fval = func(*myargs) 69 | fpval = fprime(*myargs) 70 | if fpval == 0: 71 | print "Warning: zero-derivative encountered."
72 | return p0 73 | p = p0 - func(*myargs)/fprime(*myargs) 74 | if abs(p-p0) < tol: 75 | return p 76 | p0 = p 77 | else: # Secant method 78 | p0 = x0 79 | p1 = x0*(1+1e-4) 80 | q0 = func(*((p0,)+args)) 81 | q1 = func(*((p1,)+args)) 82 | for iter in range(maxiter): 83 | if q1 == q0: 84 | if p1 != p0: 85 | print "Tolerance of %s reached" % (p1-p0) 86 | return (p1+p0)/2.0 87 | else: 88 | p = p1 - q1*(p1-p0)/(q1-q0) 89 | if abs(p-p1) < tol: 90 | return p 91 | p0 = p1 92 | q0 = q1 93 | p1 = p 94 | q1 = func(*((p1,)+args)) 95 | raise RuntimeError, "Failed to converge after %d iterations, value is %s" % (maxiter,p) 96 | 97 | class xirr_calc(object): 98 | 99 | def __init__(self): 100 | self.guess_num = 0 101 | self.debug_each_guess = False 102 | self.guesses = [] 103 | 104 | def eir_func(self, rate, pmts, dates): 105 | """Loop through the dates and calculate a discounted cashflow total 106 | 107 | This is a simple process, but the debug messages clutter it up to 108 | make it seem more complex than it is. With the debug messages removed, 109 | it is very similar to eir_derivative_func, but with the EIR formula, 110 | rather than f'rate. (A standalone sketch of this discounting appears after the end of this file.) 111 | 112 | Credit: http://mail.scipy.org/pipermail/numpy-discussion/2009-May/042736.html 113 | """ 114 | 115 | # Globals used for debug printing 116 | 117 | print_debug_messages = self.debug_each_guess 118 | if rate not in self.guesses: 119 | self.guesses.append(rate) 120 | if print_debug_messages: 121 | print "-----------------------------------------------------------------------------------------------" 122 | print "Guess #%s: %s" % (self.guess_num, rate) 123 | print "" 124 | print " # DATE # DAYS CASHFLOW DISCOUNTED Formula: cf * (rate + 1)^(-days/365)" 125 | print " --------------------------------------------------------------------------------------------" 126 | self.guess_num += 1 127 | 128 | dcf=[] 129 | for i, cf in enumerate(pmts): 130 | d = dates[i] - dates[0] 131 | discounted_period = cf * (rate + 1)**(-d.days / 365.) 132 | dcf.append( discounted_period ) 133 | 134 | if print_debug_messages: 135 | cf = "%.2f" % cf 136 | cf = cf.rjust(9, " ") 137 | discounted_period = '%.8f' % discounted_period 138 | formula = '%s * ((%0.10f + 1)^(-%d /365)) ' % (cf, rate, d.days) 139 | discounted_period = discounted_period.rjust(15, " ") 140 | print " %2i %s %3.0d days %s %s =%s" % (i, dates[i], d.days, cf, discounted_period, formula ) 141 | 142 | discounted_cashflow = sum(dcf) 143 | 144 | if print_debug_messages: 145 | discounted_cashflow = "%.8f" % discounted_cashflow 146 | total = "total:".rjust(35, " ") 147 | print "%s %s" % (total, discounted_cashflow.rjust(15, " ")) 148 | print "" 149 | 150 | return discounted_cashflow 151 | 152 | def eir_derivative_func(rate, pmts, dates): 153 | """Find the derivative of the EIR function, used for calculating 154 | Newton's method: 155 | 156 | http://en.wikipedia.org/wiki/Newton's_method 157 | 158 | EIR = cf*(1+rate)^d 159 | f'rate = cf*d*(rate+1)^(d-1) 160 | 161 | Credit: http://mail.scipy.org/pipermail/numpy-discussion/2009-May/042736.html 162 | """ 163 | 164 | dcf=[] 165 | for i, cf in enumerate(pmts): 166 | d = dates[i] - dates[0] 167 | n = (-d.days / 365.)
168 | dcf.append( cf * n * (rate + 1)**(n - 1) ) 169 | return sum(dcf) 170 | 171 | def xirr(file_lines, funds_table, schedule_table): 172 | 173 | try: 174 | log = [] 175 | 176 | # Get due date 177 | due_date_query = 'deliver' 178 | log.append("Try fetching due date with first occurrence of fuzzy term: %s" % due_date_query) 179 | due_date, date_linenr, line_str = get_first_date(file_lines, due_date_query) 180 | 181 | log.append("... succeeded with date %s in line %i" % (str(due_date), date_linenr)) 182 | 183 | # Get first cash flow 184 | first_cf_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'], 185 | 'premium' : 'Issue Premium', 186 | 'discount': ['Issue Discount', 'Net Discount'], 187 | 'underwriter_discount' : 'Underwriter Discount', 'cost_of_issuance' : 'Costs of Issuance'} 188 | 189 | log.append("Try calculating first cashflow by fetching with these fuzzy terms: %s" % str(first_cf_dict.values())) 190 | net_proceeds = calc_net_proceeds(funds_table, first_cf_dict, log) 191 | log.append("... succeeded with first cashflow as net proceeds of %s" % '{:,.2f}'.format(net_proceeds)) 192 | 193 | # Get the rest of the time series 194 | payments_column = "Debt Service" 195 | log.append("Getting remaining time series by looking for first date column and a column of subtype 'dollar' named similar to '%s'" % payments_column) 196 | cf_time = chain( ((due_date, net_proceeds),) , 197 | ((d, -v) for d,v in filter_time_series(schedule_table, payments_column))) 198 | dates = {} 199 | payments = [] 200 | # Convert our sequence of dates and cashflows into random access iterables 201 | for i, cf_dt in enumerate(cf_time): 202 | date, cf = cf_dt[0], cf_dt[1] 203 | dates[i]=date 204 | payments.append(cf) 205 | log.append("... succeeded and yielded %i date / cashflow tuples" % len(payments)) 206 | 207 | except Exception as e: 208 | log.append("... failed with %s" % traceback.format_exception(*sys.exc_info())) 209 | return None, log 210 | 211 | # Begin Main Calculation 212 | guess = .05 213 | calculator = xirr_calc() 214 | 215 | maxiter=100 216 | timer_start = time.clock() 217 | if len(dates) > 1: 218 | f = lambda x: calculator.eir_func(x, payments, dates) 219 | derivative = lambda x: eir_derivative_func(x, payments, dates) 220 | try: 221 | rate = newton(f, guess, fprime=derivative, args=(), 222 | tol=0.00000000001, maxiter=maxiter) 223 | except RuntimeError: 224 | log.append("failed to converge after a maximum of %i iterations" % maxiter) 225 | return None, log 226 | 227 | timer_end = time.clock() 228 | # End Main Calculation 229 | 230 | elapsed_time = timer_end - timer_start 231 | final_rate = rate * 100 232 | 233 | if not calculator.debug_each_guess: 234 | log.append("") 235 | log.append('Cashflow and Dates') 236 | #log.append("-------------------------") 237 | for i, dte in enumerate(dates.values()): 238 | log.append("
%i | %s ... $ %s" % (i, str(dte), '{:,.2f}'.format(payments[i]))) 239 | 240 | log.append('Guesses Summary') 241 | 242 | for i, g in enumerate(calculator.guesses): 243 | log.append("%i guessed %0.10f" % (i + 1, g)) 244 | 245 | log.append("Calculation time: %s seconds" % elapsed_time) 246 | return final_rate, log 247 | 

--------------------------------------------------------------------------------