├── .gitignore ├── MANIFEST.in ├── README.markdown ├── deps.txt ├── docs ├── Makefile ├── alchemize.rst ├── conf.py ├── db_loader.rst ├── index.rst ├── make.bat └── sqlize.rst ├── scraptils ├── __init__.py ├── alchemize.py ├── datainspector │ ├── __init__.py │ ├── __main__.py │ ├── static │ │ ├── css │ │ │ └── style.css │ │ └── images │ │ │ ├── collapse.gif │ │ │ └── expand.gif │ └── templates │ │ ├── base.html │ │ ├── data.html │ │ ├── datasets.html │ │ └── index.html ├── db_loader.py ├── dump_schema.py ├── io.py ├── sqlize.py ├── tools │ ├── bbox.py │ ├── csvtojson.py │ ├── pdf2csv.py │ ├── pdfmask.sh │ ├── showcells.py │ └── split.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | .*swp 3 | *.pyc 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.markdown 2 | include scraptils/tools/bbox.py 3 | include scraptils/tools/csvtojson.py 4 | include scraptils/tools/pdf2csv.py 5 | include scraptils/tools/pdfmask.sh 6 | include scraptils/tools/showcells.py 7 | include scraptils/tools/split.py 8 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | SCRAPTILS 2 | ========= 3 | 4 | 5 | ### Scraping swiss army knife -------------------------------------------------------------------------------- /deps.txt: -------------------------------------------------------------------------------- 1 | sqlalchemy 2 | htsql 3 | flask 4 | 5 | pyPdf 6 | numpy 7 | scipy 8 | matplotlib 9 | PIL 10 | nltk 11 | pbs 12 | pdfminer 13 | python-dateutil 14 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
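# Usage sketch (assumes GNU make and sphinx-build on PATH): the variables
# above can be overridden per invocation, e.g.
#
#   make html BUILDDIR=/tmp/scraptils-docs
#   make latexpdf PAPER=a4
#   make html SPHINXOPTS="-W"      # treat Sphinx warnings as errors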
16 |
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 |
19 | help:
20 | 	@echo "Please use \`make <target>' where <target> is one of"
21 | 	@echo "  html       to make standalone HTML files"
22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
23 | 	@echo "  singlehtml to make a single large HTML file"
24 | 	@echo "  pickle     to make pickle files"
25 | 	@echo "  json       to make JSON files"
26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
29 | 	@echo "  epub       to make an epub"
30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
32 | 	@echo "  text       to make text files"
33 | 	@echo "  man        to make manual pages"
34 | 	@echo "  texinfo    to make Texinfo files"
35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
36 | 	@echo "  gettext    to make PO message catalogs"
37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
38 | 	@echo "  linkcheck  to check all external links for integrity"
39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
40 |
41 | clean:
42 | 	-rm -rf $(BUILDDIR)/*
43 |
44 | html:
45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | 	@echo
47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 |
49 | dirhtml:
50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | 	@echo
52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 |
54 | singlehtml:
55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | 	@echo
57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 |
59 | pickle:
60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61 | 	@echo
62 | 	@echo "Build finished; now you can process the pickle files."
63 |
64 | json:
65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66 | 	@echo
67 | 	@echo "Build finished; now you can process the JSON files."
68 |
69 | htmlhelp:
70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71 | 	@echo
72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
74 |
75 | qthelp:
76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77 | 	@echo
78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scraptils.qhcp"
81 | 	@echo "To view the help file:"
82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scraptils.qhc"
83 |
84 | devhelp:
85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86 | 	@echo
87 | 	@echo "Build finished."
88 | 	@echo "To view the help file:"
89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/scraptils"
90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scraptils"
91 | 	@echo "# devhelp"
92 |
93 | epub:
94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95 | 	@echo
96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97 |
98 | latex:
99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 |
--------------------------------------------------------------------------------
/docs/alchemize.rst:
--------------------------------------------------------------------------------
1 |
2 | alchemize
3 | =========
4 |
5 | Generates a SQLAlchemy model from JSON objects
6 |
7 | Usage
8 | -----
9 |
10 | ::
11 |
12 |     usage: python -m scraptils.alchemize [-h] [-i FILE] [-t {json,csv}] [-d DB]
13 |
14 |     Scraptils JSON SQLizer
15 |
16 |     optional arguments:
17 |       -h, --help            show this help message and exit
18 |       -i FILE, --input FILE
19 |                             Input file - default is STDIN
20 |       -t {json,csv}, --input-type {json,csv}
21 |                             Input file type
22 |       -d DB, --db DB        Database connection string - default is
23 |                             'sqlite:///data.sqlite'
24 |
25 |
26 | Examples
27 | --------
28 |
29 | ::
30 |
31 |     $ echo '{"_name": "person", "name": "john", "age": 42, "school": {"name": "sch1", "address": "N"}}' | python -m scraptils.alchemize
32 |
33 |
34 | output::
35 |
36 |     from sqlalchemy import create_engine, Column, Integer, String, Float, Boolean, ForeignKey, Table
37 |     from sqlalchemy.orm import relationship, scoped_session, sessionmaker, backref
38 |     from sqlalchemy.ext.declarative import declarative_base
39 |
40 |     engine = create_engine('sqlite:///data.sqlite')
41 |     session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
42 |
43 |     Base = declarative_base()
44 |     Base.metadata.bind = engine
45 |     Base.query = session.query_property()
46 |
47 |     person_school = Table('person_school', Base.metadata
48 |         ,Column('person_id', Integer, ForeignKey('person.id'))
49 |         ,Column('school_id', Integer, ForeignKey('school.id')))
50 |
51 |     class Person(Base):
52 |         __tablename__ = 'person'
53 |         id = Column(Integer, primary_key=True)
54 |         name = Column(String)
55 |         age = Column(Integer)
56 |         schools = relationship('School', secondary=person_school)
57 |
58 |         def __init__(self, age=None, name=None):
59 |             self.age = age
60 |             self.name = name
61 |
62 |     class School(Base):
63 |         __tablename__ = 'school'
64 |         id = Column(Integer, primary_key=True)
65 |         address = Column(String)
66 |         name = Column(String)
67 |         persons = relationship('Person', secondary=person_school)
68 |
69 |         def __init__(self, name=None, address=None):
70 |             self.name = name
71 |             self.address = address
72 |
73 |
74 |     if __name__ == '__main__':
75 |         Base.metadata.create_all(bind=engine)
76 |
77 |
78 | Module description
79 | ------------------
80 |
81 | .. automodule:: scraptils.alchemize
82 |     :members:
83 |     :undoc-members:
84 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # scraptils documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Apr  9 12:54:19 2012.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 |
14 | import sys, os
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
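# For example, the automodule directives in docs/*.rst need autodoc to be
# able to import scraptils; a sketch (uncomment and adjust the relative
# path to the repository root as needed):
#sys.path.insert(0, os.path.abspath('..'))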
19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'scraptils' 44 | copyright = u'2012, Adam Tauber' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 
109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | html_sidebars = { 136 | '**': ['localtoc.html', 'relations.html', 137 | 'sourcelink.html', 'searchbox.html'] 138 | } 139 | 140 | # Additional templates that should be rendered to pages, maps page names to 141 | # template names. 142 | #html_additional_pages = {} 143 | 144 | # If false, no module index is generated. 145 | #html_domain_indices = True 146 | 147 | # If false, no index is generated. 148 | #html_use_index = True 149 | 150 | # If true, the index is split into individual pages for each letter. 151 | #html_split_index = False 152 | 153 | # If true, links to the reST sources are added to the pages. 154 | #html_show_sourcelink = True 155 | 156 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 157 | #html_show_sphinx = True 158 | 159 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 160 | #html_show_copyright = True 161 | 162 | # If true, an OpenSearch description file will be output, and all pages will 163 | # contain a tag referring to it. The value of this option must be the 164 | # base URL from which the finished HTML is served. 165 | #html_use_opensearch = '' 166 | 167 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 168 | #html_file_suffix = None 169 | 170 | # Output file base name for HTML help builder. 171 | htmlhelp_basename = 'scraptilsdoc' 172 | 173 | 174 | # -- Options for LaTeX output -------------------------------------------------- 175 | 176 | latex_elements = { 177 | # The paper size ('letterpaper' or 'a4paper'). 178 | #'papersize': 'letterpaper', 179 | 180 | # The font size ('10pt', '11pt' or '12pt'). 181 | #'pointsize': '10pt', 182 | 183 | # Additional stuff for the LaTeX preamble. 184 | #'preamble': '', 185 | } 186 | 187 | # Grouping the document tree into LaTeX files. List of tuples 188 | # (source start file, target name, title, author, documentclass [howto/manual]). 189 | latex_documents = [ 190 | ('index', 'scraptils.tex', u'scraptils Documentation', 191 | u'Adam Tauber', 'manual'), 192 | ] 193 | 194 | # The name of an image file (relative to this directory) to place at the top of 195 | # the title page. 196 | #latex_logo = None 197 | 198 | # For "manual" documents, if this is true, then toplevel headings are parts, 199 | # not chapters. 200 | #latex_use_parts = False 201 | 202 | # If true, show page references after internal links. 
203 | #latex_show_pagerefs = False
204 |
205 | # If true, show URL addresses after external links.
206 | #latex_show_urls = False
207 |
208 | # Documents to append as an appendix to all manuals.
209 | #latex_appendices = []
210 |
211 | # If false, no module index is generated.
212 | #latex_domain_indices = True
213 |
214 |
215 | # -- Options for manual page output --------------------------------------------
216 |
217 | # One entry per manual page. List of tuples
218 | # (source start file, name, description, authors, manual section).
219 | man_pages = [
220 |     ('index', 'scraptils', u'scraptils Documentation',
221 |      [u'Adam Tauber'], 1)
222 | ]
223 |
224 | # If true, show URL addresses after external links.
225 | #man_show_urls = False
226 |
227 |
228 | # -- Options for Texinfo output ------------------------------------------------
229 |
230 | # Grouping the document tree into Texinfo files. List of tuples
231 | # (source start file, target name, title, author,
232 | #  dir menu entry, description, category)
233 | texinfo_documents = [
234 |     ('index', 'scraptils', u'scraptils Documentation',
235 |      u'Adam Tauber', 'scraptils', 'Scraping swiss army knife.',
236 |      'Miscellaneous'),
237 | ]
238 |
239 | # Documents to append as an appendix to all manuals.
240 | #texinfo_appendices = []
241 |
242 | # If false, no module index is generated.
243 | #texinfo_domain_indices = True
244 |
245 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
246 | #texinfo_show_urls = 'footnote'
247 |
248 |
249 | # -- Options for Epub output ---------------------------------------------------
250 |
251 | # Bibliographic Dublin Core info.
252 | epub_title = u'scraptils'
253 | epub_author = u'Adam Tauber'
254 | epub_publisher = u'Adam Tauber'
255 | epub_copyright = u'2012, Adam Tauber'
256 |
257 | # The language of the text. It defaults to the language option
258 | # or en if the language is not set.
259 | #epub_language = ''
260 |
261 | # The scheme of the identifier. Typical schemes are ISBN or URL.
262 | #epub_scheme = ''
263 |
264 | # The unique identifier of the text. This can be an ISBN number
265 | # or the project homepage.
266 | #epub_identifier = ''
267 |
268 | # A unique identification for the text.
269 | #epub_uid = ''
270 |
271 | # A tuple containing the cover image and cover page html template filenames.
272 | #epub_cover = ()
273 |
274 | # HTML files that should be inserted before the pages created by sphinx.
275 | # The format is a list of tuples containing the path and title.
276 | #epub_pre_files = []
277 |
278 | # HTML files that should be inserted after the pages created by sphinx.
279 | # The format is a list of tuples containing the path and title.
280 | #epub_post_files = []
281 |
282 | # A list of files that should not be packed into the epub file.
283 | #epub_exclude_files = []
284 |
285 | # The depth of the table of contents in toc.ncx.
286 | #epub_tocdepth = 3
287 |
288 | # Allow duplicate toc entries.
289 | #epub_tocdup = True
290 |
291 |
292 | # Example configuration for intersphinx: refer to the Python standard library.
293 | intersphinx_mapping = {'http://docs.python.org/': None}
294 |
--------------------------------------------------------------------------------
/docs/db_loader.rst:
--------------------------------------------------------------------------------
1 |
2 | db_loader
3 | =========
4 |
5 | Usage
6 | -----
7 |
8 | ::
9 |
10 |     usage: python -m scraptils.db_loader [-h] [-i FILE] [-t {json,csv}] FILE
11 |
12 |     Scraptils SQLAlchemy database insert script
13 |
14 |     positional arguments:
15 |       FILE                  Sqlalchemy model file (model.py)
16 |
17 |     optional arguments:
18 |       -h, --help            show this help message and exit
19 |       -i FILE, --input FILE
20 |                             Input file - default is STDIN
21 |       -t {json,csv}, --input-type {json,csv}
22 |                             Input file type
23 |
24 | Examples
25 | --------
26 |
27 | ::
28 |
29 |     $ python -m scraptils.db_loader model.py -i data.json
30 |
31 | ::
32 |
33 |     $ echo '{"_name": "person", "name": "john", "age": 42, "school": {"name": "sch1", "address": "N"}}' | python -m scraptils.db_loader model.py
34 |
35 | Module description
36 | ------------------
37 |
38 | .. automodule:: scraptils.db_loader
39 |     :members:
40 |     :undoc-members:
41 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Scraptils
2 | =========
3 |
4 | Scraping swiss army knife
5 |
6 | Utils
7 | -----
8 |
9 |
10 | .. toctree::
11 |    :maxdepth: 2
12 |
13 |    alchemize
14 |    db_loader
15 |    sqlize
16 |
17 |
18 |
19 | Indices and tables
20 | ------------------
21 |
22 | * :ref:`genindex`
23 | * :ref:`modindex`
24 | * :ref:`search`
25 |
26 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^<target^>` where ^<target^> is one of
21 | echo.  html       to make standalone HTML files
22 | echo.  dirhtml    to make HTML files named index.html in directories
23 | echo.  singlehtml to make a single large HTML file
24 | echo.  pickle     to make pickle files
25 | echo.  json       to make JSON files
26 | echo.  htmlhelp   to make HTML files and a HTML help project
27 | echo.  qthelp     to make HTML files and a qthelp project
28 | echo.  devhelp    to make HTML files and a Devhelp project
29 | echo.  epub       to make an epub
30 | echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo.  text       to make text files
32 | echo.  man        to make manual pages
33 | echo.  texinfo    to make Texinfo files
34 | echo.  gettext    to make PO message catalogs
35 | echo.  changes    to make an overview over all changed/added/deprecated items
36 | echo.  linkcheck  to check all external links for integrity
37 | echo.  doctest    to run all doctests embedded in the documentation if enabled
38 | goto end
39 | )
40 |
41 | if "%1" == "clean" (
42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
43 | del /q /s %BUILDDIR%\*
44 | goto end
45 | )
46 |
47 | if "%1" == "html" (
48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
49 | if errorlevel 1 exit /b 1
50 | echo.
51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
52 | goto end
53 | )
54 |
55 | if "%1" == "dirhtml" (
56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
57 | if errorlevel 1 exit /b 1
58 | echo.
59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
60 | goto end
61 | )
62 |
63 | if "%1" == "singlehtml" (
64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
65 | if errorlevel 1 exit /b 1
66 | echo.
67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
68 | goto end
69 | )
70 |
71 | if "%1" == "pickle" (
72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
73 | if errorlevel 1 exit /b 1
74 | echo.
75 | echo.Build finished; now you can process the pickle files.
76 | goto end
77 | )
78 |
79 | if "%1" == "json" (
80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
81 | if errorlevel 1 exit /b 1
82 | echo.
83 | echo.Build finished; now you can process the JSON files.
84 | goto end
85 | )
86 |
87 | if "%1" == "htmlhelp" (
88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
89 | if errorlevel 1 exit /b 1
90 | echo.
91 | echo.Build finished; now you can run HTML Help Workshop with the ^
92 | .hhp project file in %BUILDDIR%/htmlhelp.
93 | goto end
94 | )
95 |
96 | if "%1" == "qthelp" (
97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
98 | if errorlevel 1 exit /b 1
99 | echo.
100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
101 | .qhcp project file in %BUILDDIR%/qthelp, like this:
102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\scraptils.qhcp
103 | echo.To view the help file:
104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\scraptils.qhc
105 | goto end
106 | )
107 |
108 | if "%1" == "devhelp" (
109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110 | if errorlevel 1 exit /b 1
111 | echo.
112 | echo.Build finished.
113 | goto end
114 | )
115 |
116 | if "%1" == "epub" (
117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | if errorlevel 1 exit /b 1
119 | echo.
120 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | goto end
122 | )
123 |
124 | if "%1" == "latex" (
125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | goto end
130 | )
131 |
132 | if "%1" == "text" (
133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | if errorlevel 1 exit /b 1
135 | echo.
136 | echo.Build finished. The text files are in %BUILDDIR%/text.
137 | goto end
138 | )
139 |
140 | if "%1" == "man" (
141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | if errorlevel 1 exit /b 1
143 | echo.
144 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | goto end
146 | )
147 |
148 | if "%1" == "texinfo" (
149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | if errorlevel 1 exit /b 1
151 | echo.
152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | goto end
154 | )
155 |
156 | if "%1" == "gettext" (
157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | if errorlevel 1 exit /b 1
159 | echo.
160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | goto end
162 | )
163 |
164 | if "%1" == "changes" (
165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | if errorlevel 1 exit /b 1
167 | echo.
168 | echo.The overview file is in %BUILDDIR%/changes.
169 | goto end
170 | )
171 |
172 | if "%1" == "linkcheck" (
173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | if errorlevel 1 exit /b 1
175 | echo.
176 | echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | goto end
179 | )
180 |
181 | if "%1" == "doctest" (
182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | if errorlevel 1 exit /b 1
184 | echo.
185 | echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | goto end
188 | )
189 |
190 | :end
191 |
--------------------------------------------------------------------------------
/docs/sqlize.rst:
--------------------------------------------------------------------------------
1 |
2 | sqlize
3 | ======
4 |
5 | JSON to SQL converter
6 |
7 | Module description
8 | ------------------
9 |
10 | .. automodule:: scraptils.sqlize
11 |     :members:
12 |     :undoc-members:
13 |
--------------------------------------------------------------------------------
/scraptils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liberit/scraptils/13e6982eeb07a3e86790b94a6ec8485ffecc1d07/scraptils/__init__.py
--------------------------------------------------------------------------------
/scraptils/alchemize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -*- coding: utf-8 -*-
4 | # This file is part of liberit.
5 |
6 | # liberit is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 |
11 | # liberit is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 |
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with liberit. If not, see <http://www.gnu.org/licenses/>.
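# Usage sketch (illustrative shell session; file names are placeholders,
# see docs/alchemize.rst and docs/db_loader.rst):
#
#   $ echo '{"_name": "person", "name": "john", "age": 42}' | \
#       python -m scraptils.alchemize -d sqlite:///people.sqlite > model.py
#   $ python model.py                                          # creates the tables
#   $ python -m scraptils.db_loader model.py -i people.json    # loads data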
18 | 19 | # (C) 2012 Adam Tauber 20 | 21 | 22 | import sys 23 | import re 24 | from sys import stderr 25 | from scraptils.io import parse_json, parse_csv, readlines 26 | 27 | pytypes = (bool, int, float, unicode) 28 | sqltypes = ('Boolean', 'Integer', 'Float', 'String') 29 | 30 | disallowed_chars = re.compile('[^a-zA-Z_]', re.U) 31 | 32 | def clean(field, maxwidth=40): 33 | global disallowed_chars 34 | return disallowed_chars.sub('_', field).lower()[:maxwidth] 35 | 36 | def _mkstruct(): return {'_conns': []} 37 | 38 | def discover(table, data, schema=None): 39 | if not schema: 40 | schema = {table: _mkstruct()} 41 | elif not schema.has_key(table): 42 | schema[table] = _mkstruct() 43 | 44 | for key, value in data.items(): 45 | if key == '_name': 46 | continue 47 | key = clean(key) 48 | if isinstance(value, dict): 49 | sub = discover(key, value, schema) 50 | if len(sub): 51 | schema[table]['_conns'].append(key) 52 | elif isinstance(value, list): 53 | for i in value: 54 | # TODO 55 | pass 56 | elif type(value) in pytypes: 57 | if schema[table].has_key(key): 58 | if sqltypes.index(schema[table][key]) < pytypes.index(type(value)): 59 | schema[table][key] = sqltypes[pytypes.index(type(value))] 60 | else: 61 | schema[table][key] = sqltypes[pytypes.index(type(value))] 62 | else: 63 | print >> stderr, '[!] Error - cannot handle value "%r" - %r' % (value, type(value)) 64 | return schema 65 | 66 | def defcolumn(name, field_type, *args): 67 | attrs = ', '.join(args) 68 | field = ' %s = Column(%s' % (name, field_type) 69 | if attrs: 70 | field += ', ' + attrs 71 | field += ')' 72 | return field 73 | 74 | def assoc_table(t1, t2): 75 | return ['%s_%s = Table(\'%s_%s\', Base.metadata' % (t1, t2, t1, t2) 76 | ,' ,Column(\'%s_id\', Integer, ForeignKey(\'%s.id\'))' % (t1, t1) 77 | ,' ,Column(\'%s_id\', Integer, ForeignKey(\'%s.id\')))' % (t2, t2) 78 | ,'' 79 | ] 80 | 81 | def createschema(schema, connection_string): 82 | ret = ['from sqlalchemy import create_engine, Column, Integer, String, Float, Boolean, ForeignKey, Table' 83 | ,'from sqlalchemy.orm import relationship, scoped_session, sessionmaker, backref' 84 | ,'from sqlalchemy.ext.declarative import declarative_base' 85 | ,'' 86 | ,'engine = create_engine(\'%s\')' % connection_string 87 | ,'session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))' 88 | ,'' 89 | ,'Base = declarative_base()' 90 | ,'Base.metadata.bind = engine' 91 | ,'Base.query = session.query_property()' 92 | ,'' 93 | ] 94 | relations = set() 95 | for table_name, attrs in schema.items(): 96 | if attrs.get('_conns') != None: 97 | for rel_table in attrs['_conns']: 98 | relations.add(tuple(sorted((table_name, rel_table)))) 99 | schema[table_name].pop('_conns') 100 | elif attrs.get('_name'): 101 | schema[table_name].pop('_name') 102 | 103 | for t1, t2 in relations: 104 | ret.extend(assoc_table(t1, t2)) 105 | for table_name, attrs in schema.items(): 106 | ret.extend(('class %s(Base):' % table_name.capitalize() 107 | ,' __tablename__ = \'%s\'' % table_name 108 | ,' id = Column(Integer, primary_key=True)' 109 | )) 110 | for field_name, field_type in set(attrs.items()): 111 | ret.append(defcolumn(field_name, field_type)) 112 | for t1, t2 in relations: 113 | if t1 == table_name: 114 | ret.append(' %ss = relationship(\'%s\', secondary=%s_%s)' % (t2, t2.capitalize(), t1, t2)) 115 | elif t2 == table_name: 116 | ret.append(' %ss = relationship(\'%s\', secondary=%s_%s)' % (t1, t1.capitalize(), t1, t2)) 117 | ret.append('') 118 | ret.append(' def __init__(self, %s):' % ', 
'.join('%s=None' % x for x in attrs.keys())) 119 | ret.extend(' self.%s = %s' % (x, x) for x in attrs.keys()) 120 | ret.append('') 121 | 122 | ret.append('') 123 | ret.append('if __name__ == \'__main__\':') 124 | ret.append(' Base.metadata.create_all(bind=engine)') 125 | ret.append('') 126 | return '\n'.join(ret) 127 | 128 | def argparser(): 129 | import argparse 130 | argp = argparse.ArgumentParser(description='Scraptils JSON SQLizer') 131 | argp.add_argument('-i', '--input' 132 | ,help = 'Input file - default is STDIN' 133 | ,metavar = 'FILE' 134 | ,default = sys.stdin 135 | ,type = argparse.FileType('r') 136 | ) 137 | argp.add_argument('-t', '--input-type' 138 | ,help = 'Input file type' 139 | ,choices = ('json', 'csv') 140 | ,default = 'json' 141 | ) 142 | argp.add_argument('-d', '--db' 143 | ,help = 'Database connection string - default is \'sqlite:///data.sqlite\'' 144 | ,default = 'sqlite:///data.sqlite' 145 | ) 146 | return vars(argp.parse_args()) 147 | 148 | 149 | if __name__ == '__main__': 150 | #r = parse_json(json.dumps({'_name': 'data_table', 'test_int': 6, 'test_float': 4.4, 'test_str': 'asdf', 'conn_table': {'test_bool': True}})) 151 | #print parse_json(json.dumps({'_name': 'data_table', 'test_int': 6, 'test_float': 4.4, 'test_str': 'asdf', 'conn_table': {'test_bool': True}}), r) 152 | args = argparser() 153 | schema = {} 154 | if args['input_type'] == 'json': 155 | for line in readlines(args['input']): 156 | schema = discover(*parse_json(line), schema=schema) 157 | elif args['input_type'] == 'csv': 158 | schema = discover(*parse_csv(args['input']), schema=schema) 159 | print createschema(schema, args['db']) 160 | -------------------------------------------------------------------------------- /scraptils/datainspector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liberit/scraptils/13e6982eeb07a3e86790b94a6ec8485ffecc1d07/scraptils/datainspector/__init__.py -------------------------------------------------------------------------------- /scraptils/datainspector/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from flask import Flask, request, render_template, redirect, jsonify, Response, abort 5 | from htsql import HTSQL 6 | from os import listdir 7 | import cStringIO, csv, codecs, json 8 | 9 | 10 | DB_DIR = './data' 11 | DBS = {} 12 | def empty_meta(): 13 | return {'scraper' : {'source': '', 'license': ''} 14 | ,'title' : '' 15 | ,'description' : '' 16 | ,'license' : 'ODBL' 17 | ,'author' : {'title': 'liberit', 'url': 'http://liberit.hu'} 18 | ,'deftable' : 'data' 19 | ,'run_every' : '1d' 20 | } 21 | 22 | app = Flask(__name__) 23 | app.secret_key = 'yourveryverysecretkeystring' 24 | 25 | 26 | def connect(db_string): 27 | return HTSQL(db_string) 28 | 29 | def loadsqlites(path): 30 | global DBS, DB_DIR, EMPTY_META 31 | sqlites = dict((db[:-len('.sqlite')], {'connection': 'sqlite:%s/%s' % (path, db), 'meta': {}}) for db in listdir(path) if db.endswith('.sqlite')) 32 | 33 | for db in sqlites.keys(): 34 | sqlites[db]['name'] = db 35 | try: 36 | f=open('%s/%s.json' % (DB_DIR, db),'r') 37 | sqlites[db]['meta'] = json.load(f) 38 | f.close() 39 | except: 40 | sqlites[db]['meta'] = empty_meta() 41 | sqlites[db]['meta']['deftable'] = db 42 | sqlites[db]['meta']['title'] = db 43 | DBS.update(sqlites) 44 | return DBS 45 | 46 | 47 | 48 | 49 | #@app.route('/', methods=['GET']) 50 | #def index(): 51 | # return 
render_template('index.html') 52 | 53 | @app.route('/q', methods=['POST']) 54 | def query_redirect(): 55 | q_str = request.form.get('query') 56 | return redirect('/q/'+q_str) 57 | 58 | @app.route('/', methods=['GET']) 59 | def datasets(): 60 | global DBS 61 | return render_template('datasets.html', dbs=DBS.values()) 62 | 63 | @app.route('/reload', methods=['GET']) 64 | def reload(): 65 | global DB_DIR 66 | loadsqlites(DB_DIR) 67 | return redirect('/') 68 | 69 | @app.route('/q//', methods=['GET']) 70 | def html_query(db_name, q): 71 | global DBS 72 | db = DBS.get(db_name) 73 | if not db: 74 | abort(404) 75 | (data, columns, q) = query(request, db_name, q) 76 | columns = json.dumps([{'id': '_'.join(x.split()), 'name': x, 'field': x, 'sortable': True} for x in columns]) 77 | return render_template('data.html', data=json.dumps([dict(zip(x.__fields__, [y for y in x])) for x in data]), cols=columns, db=db['name'], query=q, meta=db['meta']) 78 | 79 | @app.route('/csv//', methods=['GET']) 80 | def csv_query(db, q): 81 | (data, columns, q) = query(request, db, q) 82 | fd = cStringIO.StringIO() 83 | writer = UnicodeWriter(fd) 84 | writer.writerow(columns) 85 | writer.writerows(data) 86 | fd.seek(0) 87 | return Response(response=fd.read(), mimetype="text/csv") 88 | 89 | @app.route('/sqlite/', methods=['GET']) 90 | def sqlite_download(db): 91 | try: 92 | f=open('%s/%s.sqlite' % (DB_DIR, db),'r') 93 | except: 94 | abort(404) 95 | return Response(response=f.read(), mimetype="application/sqlite") 96 | 97 | @app.route('/json//', methods=['GET']) 98 | def json_query(db, q): 99 | (data, columns, q) = query(request, db, q) 100 | return jsonify({'count': len(data.records), 'data': data.records}) 101 | 102 | def query(request, db, q): 103 | global DBS 104 | if not DBS.has_key(db): 105 | abort(404) 106 | filters = request.url.replace(request.base_url, '', 1) 107 | htsql = HTSQL(DBS[db]['connection']) 108 | if filters and ('?' in q): 109 | filters = '&%s' % filters.strip()[1:] 110 | q = '/%s%s' % (q, filters or '') 111 | data = htsql.produce(str(q)) 112 | return (data, data.records[0].__fields__, q) 113 | 114 | def __main__(): 115 | global DB_DIR 116 | loadsqlites(DB_DIR) 117 | app.run(debug = True 118 | ,use_debugger = True 119 | ,port = 5001 120 | ) 121 | 122 | class UnicodeWriter: 123 | """ 124 | A CSV writer which will write rows to CSV file "f", 125 | which is encoded in the given encoding. 126 | src: http://docs.python.org/library/csv.html#writer-objects 127 | """ 128 | 129 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): 130 | # Redirect output to a queue 131 | self.queue = cStringIO.StringIO() 132 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds) 133 | self.stream = f 134 | self.encoder = codecs.getincrementalencoder(encoding)() 135 | 136 | def writerow(self, row): 137 | self.writer.writerow([s.encode("utf-8") if isinstance(s, basestring) else s for s in row]) 138 | # Fetch UTF-8 output from the queue ... 139 | data = self.queue.getvalue() 140 | data = data.decode("utf-8") 141 | # ... 
and reencode it into the target encoding 142 | data = self.encoder.encode(data) 143 | # write to the target stream 144 | self.stream.write(data) 145 | # empty queue 146 | self.queue.truncate(0) 147 | 148 | def writerows(self, rows): 149 | for row in rows: 150 | self.writerow(row) 151 | 152 | if __name__ in ('__main__', 'datainspector'): 153 | __main__() 154 | -------------------------------------------------------------------------------- /scraptils/datainspector/static/css/style.css: -------------------------------------------------------------------------------- 1 | body, div, table, tr, td, h1, h2, h3, h4, pre, p, input { padding: 0; margin: 0; font-family: arial,Verdana; } 2 | 3 | table { border-spacing: 0; border-collapse: collapse; border: 1px solid #999999; margin: 0px 20px; } 4 | td, th { font-size: 12px; padding: 4px 8px; border-left: 1px solid #999999; border-right: 1px solid #999999;} 5 | th { border-bottom: 1px solid #000000; } 6 | tr:hover td { background: none repeat scroll 0 0 #333333; color: #FFFFFF } 7 | td:first-child { color: #999999; font-weight: bold; text-align: center; } 8 | 9 | #header { position: absolute; left: 0; top: 0; right: 0; height: 60px; line-height: 60px; background: #333333; color: #FFFFFF; clear: both; } 10 | #header h1 a { text-decoration: none; color: #FFF; } 11 | #header h1 { margin: 0 10px; } 12 | #main { margin-top: 110px; } 13 | 14 | .alt { background: none repeat scroll 0 0 #F2F2F2; } 15 | 16 | .right { float: right; } 17 | .left { float: left; } 18 | 19 | #infobox { position: absolute; left: 0; top: 60px; right: 0; height: 30px; background: none repeat scroll 0 0 #555; color: #FFFFFF; clear: both; text-align: right; padding: 5px; } 20 | #infobox li { list-style-type:none; display: inline; } 21 | #infobox ul { display: inline; padding: 0.5em; } 22 | #infobox a { color: #c2c2c2; } 23 | #infobox li:after { content: " | "; } 24 | -------------------------------------------------------------------------------- /scraptils/datainspector/static/images/collapse.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liberit/scraptils/13e6982eeb07a3e86790b94a6ec8485ffecc1d07/scraptils/datainspector/static/images/collapse.gif -------------------------------------------------------------------------------- /scraptils/datainspector/static/images/expand.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liberit/scraptils/13e6982eeb07a3e86790b94a6ec8485ffecc1d07/scraptils/datainspector/static/images/expand.gif -------------------------------------------------------------------------------- /scraptils/datainspector/templates/base.html: -------------------------------------------------------------------------------- 1 | {% block doctype %}{% endblock %} 2 | 3 | 4 | 5 | 6 | 7 | 8 | LiberitDataInspector {% block title %}{% endblock %} 9 | {% block scripts %} 10 | {% endblock %} 11 | 12 | {% block styles %} 13 | {% endblock %} 14 | 15 | 16 | {% block body %} 17 | 22 |
23 | {% block main %}
24 | {% endblock %}
25 |
26 | {% endblock %}
27 |
28 |
29 |
--------------------------------------------------------------------------------
/scraptils/datainspector/templates/data.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block styles %}
3 |
4 |
5 |
61 | {% endblock %}
62 | {% block scripts %}
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 | {% endblock %}
71 | {% block main %}
72 |
73 | {%if meta %}
74 | {{meta.title}}, Author: {{meta.author.title}}
75 | Download as:
76 |
83 | {{meta.description}}
84 |
Group by:
85 |
88 |
89 |
90 |
91 | {% endif %}
92 |
93 |
94 |
95 |
96 |
97 |
264 | {% endblock %}
265 |
--------------------------------------------------------------------------------
/scraptils/datainspector/templates/datasets.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block main %}
3 | Available Datasets
4 |
10 | {% endblock %}
11 |
--------------------------------------------------------------------------------
/scraptils/datainspector/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block main %}
3 | Ohai
4 | {% endblock %}
5 |
--------------------------------------------------------------------------------
/scraptils/db_loader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -*- coding: utf-8 -*-
4 | # This file is part of liberit.
5 |
6 | # liberit is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 |
11 | # liberit is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 |
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with liberit. If not, see <http://www.gnu.org/licenses/>.
18 |
19 | # (C) 2012 Adam Tauber
20 |
21 | from scraptils.io import parse_json, readlines
22 | from sys import stdin, stderr, exit
23 | import re
24 |
25 | disallowed_chars = re.compile('[^a-zA-Z_]', re.U)
26 |
27 | def clean(field, maxwidth=40):
28 |     global disallowed_chars
29 |     return disallowed_chars.sub('_', field).lower()[:maxwidth]
30 |
31 | def insert(model, table_name, data):
32 |     assert isinstance(data, dict)
33 |     ret = {}
34 |     relations = {}
35 |     for key, value in data.items():
36 |         if key == '_name':
37 |             continue
38 |         key = clean(key)
39 |         if isinstance(value, dict):
40 |             item = insert(model, key, value)
41 |             model['session'].add(item)
42 |             if relations.get(key):
43 |                 relations[key].append(item)
44 |             else:
45 |                 relations[key] = [item]
46 |         elif isinstance(value, list):
47 |             for i in value:
48 |                 item = insert(model, key, i)
49 |                 model['session'].add(item)
50 |                 if relations.get(key):
51 |                     relations[key].append(item)
52 |                 else:
53 |                     relations[key] = [item]
54 |         else:
55 |             ret[key] = value
56 |     obj = model.get(table_name.capitalize())(**ret)
57 |     for rel_name, rel in relations.items():
58 |         getattr(obj, rel_name+'s').extend(rel)
59 |     return obj
60 |
61 | def argparser():
62 |     import argparse
63 |     argp = argparse.ArgumentParser(description='Scraptils SQLAlchemy database insert script')
64 |     argp.add_argument('-i', '--input'
65 |                      ,help     = 'Input file - default is STDIN'
66 |                      ,metavar  = 'FILE'
67 |                      ,default  = stdin
68 |                      ,type     = argparse.FileType('r')
69 |                      )
70 |     argp.add_argument('-t', '--input-type'
71 |                      ,help     = 'Input file type'
72 |                      ,choices  = ('json', 'csv')
73 |                      ,default  = 'json'
74 |                      )
75 |     argp.add_argument(help     = 'Sqlalchemy model file (model.py)'
76 |                      ,metavar  = 'FILE'
77 |                      ,dest     = 'model'
78 |                      ,type     = argparse.FileType('r')
79 |                      )
80 |     return vars(argp.parse_args())
81 |
82 | if __name__ == '__main__':
83 |     args = argparser()
84 |     model = {}
85 |     try:
86 |         exec(args['model'].read(), model)
87 |     except:
88 |         exit('[!] cannot import model (%s)' % args['model'].name)
89 |
90 |     if args['input_type'] == 'json':
91 |         for line in readlines(args['input']):
92 |             item = insert(model, *parse_json(line))
93 |             model['session'].add(item)
94 |
95 |     elif args['input_type'] == 'csv':
96 |         print >>stderr, '[!] TODO CSV support'
97 |     model['session'].commit()
--------------------------------------------------------------------------------
/scraptils/dump_schema.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # This file is part of composite data analysis tools (cdat)
4 |
5 | # composite data analysis tools (cdat) is free software: you can
6 | # redistribute it and/or modify it under the terms of the GNU
7 | # Affero General Public License as published by the Free Software
8 | # Foundation, either version 3 of the License, or (at your option)
9 | # any later version.
10 |
11 | # composite data analysis tools (cdat) is distributed in the hope
12 | # that it will be useful, but WITHOUT ANY WARRANTY; without even
13 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A
14 | # PARTICULAR PURPOSE. See the GNU Affero General Public License
15 | # for more details.
16 |
17 | # You should have received a copy of the GNU Affero General Public
18 | # License along with composite data analysis tools (cdat). If not,
19 | # see <http://www.gnu.org/licenses/>.
20 |
21 | # (C) 2011 by Stefan Marsiske,
22 | # (C) 2013 by Adam Tauber,
23 |
24 | import sys
25 | import pprint
26 | from itertools import izip_longest
27 | from operator import itemgetter
28 |
29 | def unws(txt):
30 |     return u' '.join(txt.split())
31 |
32 | def dump_schema(items, skip=[], title=None, format="text"):
33 |     """
34 |     Dump schema: takes a list of data structures and computes a
35 |     probabilistic schema out of the samples, and prints the result
36 |     to the output.
37 |     @param items can be any iterable of dicts (e.g. a cursor that has no __len__)
38 |     @param skip is an optional list of keys to skip on the top structure
39 |     @param title is the name for the data structure to be displayed
40 |     @param format - 'text' is the default; 'full-html' adds a js/css header and a legend
41 |     """
42 |     ax={}
43 |     count=0
44 |     for item in items:
45 |         ax=scan(dict([(k,v) for k,v in item.items() if k not in skip]),ax)
46 |         count+=1
47 |     if format=='text':
48 |         print_schema(ax,0,count)
49 |         return
50 |     elif format=='full-html':
51 |         print '%s %s %s' % (_html_header(),
52 |                             '\n'.join([str(x) for x in html_schema(ax,0,count)]),
53 |                             _html_footer())
54 |     else:
55 |         print '%s
' % '\n'.join([str(x) for x in html_schema(ax,0,count)]) 56 | 57 | def type_name(o): 58 | return str(type(o)).split("'")[1] 59 | 60 | def scan(d, node): 61 | """ helper for dump_schema""" 62 | if not 'types' in node: 63 | node['types']={} 64 | if hasattr(d, 'keys'): 65 | for k, v in d.items(): 66 | if not 'items' in node: 67 | node['items']={} 68 | if not k in node['items']: 69 | node['items'][k]={'name':k} 70 | node['items'][k]=scan(v,node['items'][k]) 71 | elif isinstance(d,str): 72 | d=d.decode('utf8') 73 | elif hasattr(d, '__iter__'): 74 | if not 'elems' in node: 75 | node['elems']={} 76 | for v in d: 77 | stype = type_name(v) 78 | node['elems'][stype]=scan(v,node['elems'].get(stype,{})) 79 | if isinstance(d, unicode): 80 | d=unws(d) or None 81 | mtype=type_name(d) 82 | tmp=node['types'].get(mtype,{'count': 0, 'example': None}) 83 | tmp['count']+=1 84 | if d and not tmp['example'] and not isinstance(d,dict): 85 | tmp['example']=d 86 | node['types'][mtype]=tmp 87 | return node 88 | 89 | def merge_dict_lists(node): 90 | # ultra ugly. see test code in arch 91 | if ('elems' in node and 92 | 'items' in node and 93 | 'items' in node['elems'].values()[0] and 94 | sorted(node['items'].keys())==sorted(node['elems'].values()[0]['items'].keys())): 95 | 96 | node['types']["list"]['count']+=node['types']["dict"]['count'] 97 | node['elems']["dict"]['types']["dict"]['count']+=node['types']["dict"]['count'] 98 | del node['types']["dict"] 99 | 100 | for k,v in node['items'].items(): 101 | if not k in node['elems'].values()[0]['items']: 102 | node['elems'].values()[0]['items'][k]=v 103 | continue 104 | for tk, tv in v['types'].items(): 105 | if tk in node['elems'].values()[0]['items'][k]['types']: 106 | node['elems'].values()[0]['items'][k]['types'][tk]['count']+=tv['count'] 107 | else: 108 | node['elems'].values()[0]['items'][k]['types'][tk]=tv 109 | del node['items'] 110 | return node 111 | 112 | def print_schema(node,indent,parent,after_list=False): 113 | """ helper for dump_schema""" 114 | merge_dict_lists(node) 115 | for k,v in sorted(node['types'].items(),key=lambda x: x[1]['count'],reverse=True): 116 | nodestr = node.get('name', '') 117 | if nodestr: nodestr = '| '+nodestr.ljust(40-indent*2) 118 | # unicode after a list fix 119 | if after_list and nodestr == '' and k == 'unicode': 120 | print "{0:>5}%".format((v['count']*100)/parent), ' '*(indent*2+37), ('[%s]' % k).ljust(10), 121 | else: 122 | print "{0:>5}%".format((v['count']*100)/parent), ' '*indent, nodestr, ('<%s>' % k).ljust(10), 123 | if k=="list": 124 | print '' 125 | for x in node['elems'].values(): 126 | print_schema(x,indent+1,v['count'],True) 127 | elif k=="dict": 128 | print '' 129 | if 'items' in node: 130 | for x in node['items'].values(): 131 | print_schema(x,indent+1,v['count']) 132 | #TODO else: # empty dict 133 | elif k=="unicode": 134 | print v['example'].encode('utf8') 135 | else: 136 | print v['example'] 137 | 138 | schematpl="
{1} ({0}%) {2}"
139 | def html_schema(node,indent,parent):
140 |     """ helper for dump_schema"""
141 |     merge_dict_lists(node)
142 |     res=[]
143 |     for k,v in sorted(node['types'].items(),key=lambda x: x[1]['count'],reverse=True):
144 |         if k=="list":
145 |             data="    {0}".format(''.join(["  • {0}  • ".format(y) for x in node['elems'].values() for y in html_schema(x,indent+1,v['count'])]))
146 |             clss='contents'
147 |         elif k=="dict":
148 |             data="      {0}    ".format(''.join(["  • {0}  • ".format(y) for x in node['items'].values() for y in html_schema(x,indent+1,v['count'])]))
149 |             clss='contents'
150 |         elif k=="unicode":
151 |             data="Example: {0}".format(v['example'].encode('utf8'))
152 |             clss='example'
153 |         elif k=="str":
154 |             data="Example: {0}".format(v['example'])
155 |             clss='example'
156 |         else:
157 |             data="Example: {0}".format(v['example'])
158 |             clss='example'
159 |         res.append(schematpl.format(int(float(v['count'])/parent*100 if k!="list" else v['count']),
160 |                                     node.get('name','<listitem>'),
161 |                                     data,
162 |                                     256-int(64*(1 if v['count']>=parent else float(v['count'])/parent)),
163 |                                     clss,
164 |                                     ))
165 |     return res
166 |
167 | def _html_header():
168 |     """ helper for html_schema"""
169 |     return """
170 |
171 |
172 |
173 |
174 |
184 |
185 |
194 |
195 |
196 |     Click on the names to fold/expand levels. Percentages show probability of this field appearing under its parent. In case of lists, percentage also shows average length of list.
    197 | """ 198 | 199 | def _html_footer(): 200 | """ helper for html_schema""" 201 | return """ 202 | 203 | 204 | """ 205 | 206 | def diff(old, new, path=[]): 207 | """a handy comparison function for composite data structures""" 208 | if old==None and new!=None: 209 | return [{'type': 'added', 'data': new, 'path': path}] 210 | elif new==None and old!=None: 211 | return [{'type': 'deleted', 'data': old, 'path': path}] 212 | if type(old) == str: old=unicode(old,'utf8') 213 | if type(new) == str: new=unicode(new,'utf8') 214 | if not type(old)==type(new): 215 | return [{'type': 'changed', 'data': (old, new), 'path': path}] 216 | elif hasattr(old,'keys'): 217 | res=[] 218 | for k in set(old.keys() + (new or {}).keys()): 219 | r=diff(old.get(k),(new or {}).get(k), path+[k]) 220 | if r: 221 | res.extend(r) 222 | return res 223 | elif hasattr(old,'__iter__'): 224 | res=[] 225 | for item in filter(None,[diff(a,b,path+[(len(old) if len(old)%s' % '\n'.join(["
  • %s  • " % printdict(v) for v in d])
239 |     if not type(d)==dict:
240 |         return "%s" % unicode(d)
241 |     res=['']
242 |     for k,v in [(k,v) for k,v in d.items() if k not in ['mepref','comref']]:
243 |         res.append(u"    %s    %s    " % (k,printdict(v)))
244 |     return '%s' % u'\n'.join(res)
245 |
246 | def formatdiff(data):
247 |     """ formats diffs to html """
248 |     res=[]
249 |     for di in sorted(sorted(data,key=itemgetter('path'))):
250 |         if di['type']=='changed':
251 |             res.append(u'change%s%s%s' % ('/'.join([str(x) for x in di['path']]),printdict(di['data'][1]),printdict(di['data'][0])))
252 |             continue
253 |         if di['type']=='deleted':
254 |             res.append(u"%s%s%s" % (di['type'], '/'.join([str(x) for x in di['path']]), printdict(di['data'])))
255 |         if di['type']=='added':
256 |             res.append(u"%s%s%s" % (di['type'], '/'.join([str(x) for x in di['path']]), printdict(di['data'])))
257 |
258 |     return "%s type | change in | new | old
    " % '\n'.join(res) 259 | 260 | def test_diff(): 261 | d2={ 'a': [ {'aa': 2, 'bb': 3 }, { 'aa': 1, 'bb':3 }, {'AA': 1, 'BB': { 'asdf': { 'asdf': 'qwer'}}}, {'Mm': [ 'a','b','c','d'] } ], 262 | 'c': [ 0,1,2,3,4]} 263 | d1={ 'a': [ { 'aa': 1, 'bb':3 }, {'AA': 1, 'BB': { 'asdf': '2'}}, {'Mm': [ 'a','b','c','d'] } ], 264 | 'b': { 'z': 9, 'x': 8 }, 265 | 'c': [ 1,2,3,4]} 266 | d=diff(d1,d2) 267 | pprint.pprint(d) 268 | print formatdiff(d) 269 | 270 | def stripns(attr): 271 | ns = ['{http://www.w3.org/1999/xlink}', 272 | '{http://intragate.ec.europa.eu/transparencyregister/intws/20110715}'] 273 | for n in ns: 274 | if attr.startswith(n): 275 | return attr[len(n):] 276 | return attr 277 | 278 | def _xml2obj(elem,c=False): 279 | res={} 280 | if elem.text: 281 | #if c: print "text", stripns(elem.tag) 282 | res[stripns(elem.tag)]=unws(elem.text) 283 | if len(elem.attrib)>0: 284 | if stripns(elem.tag) in [stripns(x) for x in elem.attrib]: 285 | print >>sys.stderr, "attribute clashes with element", stripns(elem.tag), "suppressed attribute value", elem.attrib[stripns(elem.tag)] 286 | #if c: print "attr", stripns(elem.tag) 287 | res.update({stripns(attr): elem.attrib[attr] for attr in elem.attrib}) 288 | kids=elem.xpath('./*') 289 | if len(kids)>0: 290 | if len(set((stripns(kid.tag) for kid in kids)))==len(kids): 291 | #if c: print "dict", stripns(elem.tag) 292 | tmp={} 293 | for kid in kids: 294 | kido=_xml2obj(kid, c=True) 295 | name=stripns(kid.tag) 296 | if not kido or not stripns(kid.tag) in kido: 297 | continue 298 | tmp[name]=kido[name] 299 | res[stripns(elem.tag)]=tmp 300 | else: 301 | #if c: print "list", stripns(elem.tag) 302 | res[stripns(elem.tag)]=[_xml2obj(kid) for kid in kids if _xml2obj(kid)] 303 | if c: pprint.pprint(res), stripns(elem.tag) 304 | return res 305 | 306 | def xml2obj(root): 307 | for elem in root.xpath('//t:resultList/*', 308 | namespaces={'t': 'http://intragate.ec.europa.eu/transparencyregister/intws/20110715'}): 309 | yield _xml2obj(elem) 310 | 311 | def test_dump(fname, html_only=False): 312 | from lxml.etree import parse 313 | root=None 314 | with open(fname, 'r') as fd: 315 | root=parse(fd) 316 | elements=xml2obj(root) 317 | if html_only: 318 | dump_schema(elements,title='lobbyregister',format='full-html') 319 | else: 320 | dump_schema(elements,title='lobbyregister',format='text') 321 | 322 | 323 | def argparser(): 324 | import argparse 325 | argp = argparse.ArgumentParser(description='Scraptils data schema analyzer') 326 | argp.add_argument('-i', '--input' 327 | ,help = 'Input file - default is STDIN' 328 | ,metavar = 'FILE' 329 | ,default = sys.stdin 330 | ,type = argparse.FileType('r') 331 | ) 332 | argp.add_argument('-l', '--limit' 333 | ,help = 'Limit the number of input lines' 334 | ,default = 0 335 | ,type = int 336 | ) 337 | argp.add_argument('-f', '--format' 338 | ,help = 'Output type' 339 | ,choices = ('text', 'html', 'full-html') 340 | ,default = 'text' 341 | ) 342 | return vars(argp.parse_args()) 343 | 344 | 345 | if __name__ == "__main__": 346 | #test_diff() 347 | from json import loads 348 | args = argparser() 349 | d = [] 350 | lineno = 0 351 | while True: 352 | if args['limit'] > 0 and args['limit'] <= lineno: 353 | break 354 | 355 | line = args['input'].readline() 356 | 357 | if not line: 358 | break 359 | 360 | d.append(loads(line.strip())) 361 | lineno += 1 362 | 363 | dump_schema(d, format=args['format']) 364 | -------------------------------------------------------------------------------- /scraptils/io.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # -*- coding: utf-8 -*-
4 | # This file is part of liberit.
5 | 
6 | # liberit is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | 
11 | # liberit is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | 
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with liberit. If not, see <http://www.gnu.org/licenses/>.
18 | 
19 | # (C) 2012 Adam Tauber
20 | 
21 | 
22 | import json
23 | from sys import stderr
24 | 
25 | def readlines(infile, outfile=None):
26 | l = infile.readline()
27 | while l:
28 | if outfile: outfile.write(l)
29 | yield l
30 | try:
31 | l = infile.readline()
32 | except Exception, e:
33 | print >> stderr, '[E] Error reading %s: %r' % (infile.name, e.message)
34 | l = None
35 | 
36 | def read_json(data):
37 | try:
38 | chunk = json.loads(data)
39 | except Exception, e:
40 | print >> stderr, '[E] Cannot parse %r\n %r' % (data, e.message)
41 | return {}
42 | return chunk
43 | 
44 | def parse_json(data, default_name='data'):
45 | chunk = read_json(data)
46 | name = chunk.get('_name')
47 | if name:
48 | chunk.pop('_name')
49 | else:
50 | name = default_name
51 | 
52 | return (name, chunk)
53 | 
54 | # TODO!!
55 | def parse_csv(infile, default_name='data'):
56 | field_names = map(unicode.strip, infile.readline().decode('utf-8').split(','))
57 | field_values = map(unicode.strip, infile.readline().decode('utf-8').split(','))
58 | ret = dict(zip(field_names, field_values))
59 | name = ret.get('_name')
60 | if name:
61 | ret.pop('_name')
62 | else:
63 | name = default_name
64 | 
65 | return (name, ret)
66 | 
67 | 
-------------------------------------------------------------------------------- /scraptils/sqlize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # -*- coding: utf-8 -*-
4 | # This file is part of liberit.
5 | 
6 | # liberit is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | 
11 | # liberit is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | 
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with liberit. If not, see <http://www.gnu.org/licenses/>.
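# (overview: sqlize.py reads one json object per input line, infers a
# relational schema with discover(), materializes it with createschema(),
# and then bulk-inserts the flattened records with insert_flat())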
18 | 19 | # (C) 2012 Adam Tauber 20 | 21 | 22 | import json 23 | from sqlalchemy import create_engine, Boolean, Integer, Float, String, Table, MetaData, Column, ForeignKey 24 | import sys 25 | from tempfile import mkstemp 26 | from os import unlink 27 | 28 | pytypes = (bool, int, float, unicode) 29 | sqltypes = (Boolean, Integer, Float, String) 30 | 31 | def discover(table, data, schema=None): 32 | def mkstruct(): return {'_m2m': [], '_m2o': []} 33 | if not schema: 34 | schema = {table: mkstruct()} 35 | elif not schema.has_key(table): 36 | schema[table] = mkstruct() 37 | 38 | for key, value in data.items(): 39 | if isinstance(value, dict): 40 | sub = discover(key, value) 41 | if sub: 42 | schema.update(sub) 43 | schema[table]['_m2o'].append(sub.keys()[0]) 44 | elif isinstance(value, list): 45 | for i in value: 46 | # TODO 47 | pass 48 | elif type(value) in pytypes: 49 | if schema[table].has_key(key): 50 | if sqltypes.index(schema[table][key]) < pytypes.index(type(value)): 51 | schema[table][key] = sqltypes[pytypes.index(type(value))] 52 | else: 53 | schema[table][key] = sqltypes[pytypes.index(type(value))] 54 | else: 55 | print '[!] Error - cannot handle value "%r" - %r' % (value, type(value)) 56 | return schema 57 | 58 | def read_json(data): 59 | try: 60 | chunk = json.loads(data) 61 | except Exception, e: 62 | print '[E] Cannot parse %r\n %r' % (data, e.message) 63 | return {} 64 | return chunk 65 | 66 | def parse_json(data, default_name='data'): 67 | chunk = read_json(data) 68 | name = chunk.get('_name') 69 | if name: 70 | chunk.pop('_name') 71 | else: 72 | name = default_name 73 | 74 | return (name, chunk) 75 | 76 | def readlines(infile, outfile=None): 77 | l = infile.readline() 78 | while l: 79 | if outfile: 80 | outfile.write(l) 81 | yield l 82 | try: 83 | l = infile.readline() 84 | except Exception, e: 85 | print '[E] Error reading %s: %r' % (infile.name, e.message) 86 | l = None 87 | 88 | def createschema(struct, metadata): 89 | for table_name, fields in struct.items(): 90 | columns = [Column(x, y) for x,y in fields.items() if not x.startswith('_')] 91 | if fields['_m2o']: 92 | columns.extend(Column(x+'id', Integer, ForeignKey('%s.id' % x)) for x in fields['_m2o']) 93 | t = Table(table_name 94 | ,metadata 95 | ,Column('id', Integer, primary_key=True) 96 | ,*columns 97 | ) 98 | 99 | 100 | 101 | def insert_flat(table, data, meta, db): 102 | #connections = [] 103 | for key, value in data.items(): 104 | if isinstance(value, dict): 105 | #connections.append((key, value)) 106 | data.pop(key) 107 | elif isinstance(value, list): 108 | data.pop(key) 109 | if not len(data): return 110 | d = meta.tables[table].insert(data) 111 | db.execute(d) 112 | # session.add(d) 113 | return d 114 | 115 | def argparser(): 116 | import argparse 117 | argp = argparse.ArgumentParser(description='Scraptils JSON SQLizer') 118 | argp.add_argument('-i', '--input' 119 | ,help = 'Input file - default is STDIN' 120 | ,metavar = 'FILE' 121 | ,default = sys.stdin 122 | ,type = argparse.FileType('r') 123 | ) 124 | argp.add_argument('-d', '--db' 125 | ,help = 'Database connection string - default is sqlite:///data.sqlite' 126 | ,default = 'sqlite:///data.sqlite' 127 | ) 128 | argp.add_argument('-t', '--type' 129 | ,help = 'Insertion type' 130 | ,choices = ('flat', 'recursive') 131 | ,default = 'flat' 132 | ) 133 | argp.add_argument('-v', '--verbose' 134 | ,action = 'count' 135 | ,help = 'Verbosity level - default is 3' 136 | ,default = 3 137 | ) 138 | return vars(argp.parse_args()) 139 | 140 | 141 | if __name__ == 
'__main__': 142 | #r = parse_json(json.dumps({'_name': 'data_table', 'test_int': 6, 'test_float': 4.4, 'test_str': 'asdf', 'conn_table': {'test_bool': True}})) 143 | #print parse_json(json.dumps({'_name': 'data_table', 'test_int': 6, 'test_float': 4.4, 'test_str': 'asdf', 'conn_table': {'test_bool': True}}), r) 144 | args = argparser() 145 | schema = {} 146 | tmp_file_name = mkstemp(prefix='sqlize_')[1] 147 | tmp_file = open(tmp_file_name, 'w') 148 | for line in readlines(args['input'], tmp_file): 149 | schema = discover(*parse_json(line), schema=schema) 150 | tmp_file.close() 151 | print schema 152 | engine = create_engine(args['db'], echo=True) 153 | meta = MetaData() 154 | meta.bind = engine 155 | createschema(schema, meta) 156 | print meta.sorted_tables 157 | meta.create_all() 158 | tmp_file = open(tmp_file_name) 159 | if args['type'] == 'flat': 160 | with engine.begin() as trans: 161 | for line in readlines(tmp_file): 162 | insert_flat(*parse_json(line), meta=meta, db=trans) 163 | else: 164 | #TODO 165 | pass 166 | unlink(tmp_file_name) 167 | -------------------------------------------------------------------------------- /scraptils/tools/bbox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # finds bounding boxes, used by pdfmask 4 | 5 | import scipy.ndimage as ndimage 6 | import scipy.spatial as spatial 7 | import scipy.misc as misc 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import sys 11 | 12 | class BBox(object): 13 | def __init__(self, x1, y1, x2, y2): 14 | ''' 15 | (x1, y1) is the upper left corner, 16 | (x2, y2) is the lower right corner, 17 | with (0, 0) being in the upper left corner. 18 | ''' 19 | if x1 > x2: x1, x2 = x2, x1 20 | if y1 > y2: y1, y2 = y2, y1 21 | self.x1 = x1 22 | self.y1 = y1 23 | self.x2 = x2 24 | self.y2 = y2 25 | def taxicab_diagonal(self): 26 | ''' 27 | Return the taxicab distance from (x1,y1) to (x2,y2) 28 | ''' 29 | return self.x2 - self.x1 + self.y2 - self.y1 30 | def overlaps(self, other): 31 | ''' 32 | Return True iff self and other overlap. 33 | ''' 34 | return not ((self.x1 > other.x2) 35 | or (self.x2 < other.x1) 36 | or (self.y1 > other.y2) 37 | or (self.y2 < other.y1)) 38 | def __eq__(self, other): 39 | return (self.x1 == other.x1 40 | and self.y1 == other.y1 41 | and self.x2 == other.x2 42 | and self.y2 == other.y2) 43 | def __repr__(self): 44 | return "(%s,%s) -> (%s, %s)" % (self.x1, self.y1, self.x2, self.y2) 45 | 46 | def find_paws(data, smooth_radius = 5, threshold = 0.0001): 47 | # http://stackoverflow.com/questions/4087919/how-can-i-improve-my-paw-detection 48 | """Detects and isolates contiguous regions in the input array""" 49 | # Blur the input data a bit so the paws have a continous footprint 50 | data = ndimage.uniform_filter(data, smooth_radius) 51 | # Threshold the blurred data (this needs to be a bit > 0 due to the blur) 52 | thresh = data > threshold 53 | # Fill any interior holes in the paws to get cleaner regions... 
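# (binary_fill_holes turns any zero region completely enclosed by the
# thresholded mask into ones, so hollow outlines become solid blobs
# that the labelling step below can count as single regions)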
54 | filled = ndimage.morphology.binary_fill_holes(thresh) 55 | # Label each contiguous paw 56 | coded_paws, num_paws = ndimage.label(filled) 57 | # Isolate the extent of each paw 58 | # find_objects returns a list of 2-tuples: (slice(...), slice(...)) 59 | # which represents a rectangular box around the object 60 | data_slices = ndimage.find_objects(coded_paws) 61 | return data_slices 62 | 63 | def slice_to_bbox(slices): 64 | for s in slices: 65 | dy, dx = s[:2] 66 | yield BBox(dx.start, dy.start, dx.stop+1, dy.stop+1) 67 | 68 | def remove_overlaps(bboxes): 69 | ''' 70 | Return a set of BBoxes which contain the given BBoxes. 71 | When two BBoxes overlap, replace both with the minimal BBox that contains both. 72 | ''' 73 | # list upper left and lower right corners of the Bboxes 74 | corners = [] 75 | 76 | # list upper left corners of the Bboxes 77 | ulcorners = [] 78 | 79 | # dict mapping corners to Bboxes. 80 | bbox_map = {} 81 | 82 | for bbox in bboxes: 83 | ul = (bbox.x1, bbox.y1) 84 | lr = (bbox.x2, bbox.y2) 85 | bbox_map[ul] = bbox 86 | bbox_map[lr] = bbox 87 | ulcorners.append(ul) 88 | corners.append(ul) 89 | corners.append(lr) 90 | 91 | # Use a KDTree so we can find corners that are nearby efficiently. 92 | tree = spatial.KDTree(corners) 93 | new_corners = [] 94 | for corner in ulcorners: 95 | bbox = bbox_map[corner] 96 | # Find all points which are within a taxicab distance of corner 97 | indices = tree.query_ball_point( 98 | corner, bbox_map[corner].taxicab_diagonal(), p = 1) 99 | for near_corner in tree.data[indices]: 100 | near_bbox = bbox_map[tuple(near_corner)] 101 | if bbox != near_bbox and bbox.overlaps(near_bbox): 102 | # Expand both bboxes. 103 | # Since we mutate the bbox, all references to this bbox in 104 | # bbox_map are updated simultaneously. 
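# (min over the upper-left and max over the lower-right corners gives
# the smallest bbox covering both of the overlapping boxes)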
105 | bbox.x1 = near_bbox.x1 = min(bbox.x1, near_bbox.x1) 106 | bbox.y1 = near_bbox.y1 = min(bbox.y1, near_bbox.y1) 107 | bbox.x2 = near_bbox.x2 = max(bbox.x2, near_bbox.x2) 108 | bbox.y2 = near_bbox.y2 = max(bbox.y2, near_bbox.y2) 109 | return set(bbox_map.values()) 110 | 111 | if __name__ == '__main__': 112 | fig = plt.figure() 113 | ax = fig.add_subplot(111) 114 | 115 | data = misc.imread(sys.argv[1], flatten=True) 116 | im = ax.imshow(data) 117 | data_slices = find_paws(255-data, smooth_radius = 5, threshold = 5) 118 | 119 | bboxes = remove_overlaps(slice_to_bbox(data_slices)) 120 | #print bboxes 121 | for bbox in bboxes: 122 | xwidth = bbox.x2 - bbox.x1 123 | ywidth = bbox.y2 - bbox.y1 124 | p = patches.Rectangle((bbox.x1, bbox.y1), xwidth, ywidth, 125 | fc = 'none', ec = 'red') 126 | ax.add_patch(p) 127 | fig.savefig('out.png') 128 | m=max(bboxes,key=lambda x: (x.x2 - x.x1)*(x.y2 - x.y1)) 129 | print "-x %s -y %s -H %s -W %s" % (m.x1,m.y1,m.y2-m.y1,m.x2-m.x1) 130 | -------------------------------------------------------------------------------- /scraptils/tools/csvtojson.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv, json, sys, chardet 4 | 5 | def UnicodeDictReader(utf8_data, **kwargs): 6 | csv_reader = csv.DictReader(utf8_data, **kwargs) 7 | for row in csv_reader: 8 | try: 9 | yield dict([(key, unicode(value, chardet.detect(value)['encoding'] or "ascii")) for key, value in row.iteritems()]) 10 | except UnicodeDecodeError: 11 | yield dict([(key, unicode(value, "latin2")) for key, value in row.iteritems()]) 12 | 13 | if len(sys.argv)>1: 14 | delim=sys.argv[1] 15 | else: 16 | delim=';' 17 | headers = csv.reader(sys.stdin, delimiter=delim).next() 18 | reader = UnicodeDictReader(sys.stdin,fieldnames=headers, delimiter=delim) 19 | for row in reader: 20 | print json.dumps(row).replace('\n','').encode('utf8') 21 | -------------------------------------------------------------------------------- /scraptils/tools/pdf2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # converts a pdf into a csv file 4 | 5 | from pdfminer.pdfparser import PDFParser, PDFDocument 6 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 7 | from pdfminer.layout import LAParams, LTRect 8 | from pdfminer.converter import PDFPageAggregator 9 | from itertools import islice 10 | import sys, csv, cStringIO, codecs 11 | from pbs import pdftotext 12 | 13 | class UnicodeWriter: 14 | """ 15 | A CSV writer which will write rows to CSV file "f", 16 | which is encoded in the given encoding. 17 | src: http://docs.python.org/library/csv.html#writer-objects 18 | """ 19 | 20 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): 21 | # Redirect output to a queue 22 | self.queue = cStringIO.StringIO() 23 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds) 24 | self.stream = f 25 | self.encoder = codecs.getincrementalencoder(encoding)() 26 | 27 | def writerow(self, row): 28 | self.writer.writerow([s.encode("utf-8") if isinstance(s, basestring) else s 29 | for s in row]) 30 | # Fetch UTF-8 output from the queue ... 31 | data = self.queue.getvalue() 32 | data = data.decode("utf-8") 33 | # ... 
and reencode it into the target encoding
34 | data = self.encoder.encode(data)
35 | # write to the target stream
36 | self.stream.write(data)
37 | # empty queue
38 | self.queue.truncate(0)
39 | 
40 | def writerows(self, rows):
41 | for row in rows:
42 | self.writerow(row)
43 | 
44 | 
45 | def pdf2csv(pdf):
46 | fp = open(pdf, 'rb')
47 | parser = PDFParser(fp)
48 | doc = PDFDocument()
49 | parser.set_document(doc)
50 | doc.set_parser(parser)
51 | # Supply the password for initialization.
52 | # (If no password is set, give an empty string.)
53 | doc.initialize('')
54 | rsrcmgr = PDFResourceManager()
55 | # Set parameters for analysis.
56 | laparams = LAParams()
57 | device = PDFPageAggregator(rsrcmgr, laparams=laparams)
58 | interpreter = PDFPageInterpreter(rsrcmgr, device)
59 | 
60 | writer = UnicodeWriter(sys.stdout)
61 | for pageno, page in enumerate(doc.get_pages()):
62 | interpreter.process_page(page)
63 | layout = device.get_result()
64 | hlines=[]
65 | vlines=[]
66 | for i in layout:
67 | if not type(i) == LTRect: continue
68 | hlines.append(int(i.x0))
69 | hlines.append(int(i.x1))
70 | vlines.append(int(layout.height - i.y0))
71 | vlines.append(int(layout.height - i.y1))
72 | hlines=filterclose(sorted(set(hlines)))
73 | vlines=filterclose(sorted(set(vlines)))
74 | i=0
75 | while(i<len(vlines)-1):
76 | if vlines[i+1]-vlines[i]<10:
77 | i=i+1
78 | continue
79 | j=0
80 | row=[]
81 | while(j<len(hlines)-1):
82 | if hlines[j+1]-hlines[j]<10:
83 | j=j+1
84 | continue
85 | row.append(' '.join(get_region(pdf,
86 | pageno+1,
87 | hlines[j]+1,
88 | vlines[i],
89 | hlines[j+1]-1,
90 | vlines[i+1]).split()))
91 | j=j+1
92 | writer.writerow(row)
93 | i=i+1
94 | fp.close()
95 | 
96 | def filterclose(lst):
97 | tmp=[lst[0]]
98 | for elem in islice(lst, 1, None):
99 | if elem - 2 > tmp[-1]:
100 | tmp.append(elem)
101 | return tmp
102 | 
103 | def get_region(pdf, page, x1,y1,x2,y2):
104 | # this is an extremely ugly hack. should be reimplemented with
105 | # some poppler-like lib; poppler itself only supports getting
106 | # "selected" text, which uses different logic than the
107 | # simple approach of pdftotext
108 | return pdftotext('-nopgbrk',
109 | '-f', page,
110 | '-l', page,
111 | '-x', x1,
112 | '-y', y1,
113 | '-H', abs(y2-y1),
114 | '-W', abs(x2-x1),
115 | pdf,
116 | '-'
117 | )
118 | 
119 | if __name__=='__main__':
120 | pdf2csv(sys.argv[1])
121 | 
-------------------------------------------------------------------------------- /scraptils/tools/pdfmask.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ksh
2 | # calculates bounding box for pdftotext, omitting any headers/footers.
3 | 
4 | bindir=$(realpath "${0%/*}")
5 | pdf=$(realpath "$1")
6 | dir=$(basename "${1%%.pdf}")
7 | mkdir -p "$dir"
8 | cd "$dir"
9 | i=1
10 | while true; do
11 | python ${bindir}/split.py "$pdf" $i $i.pdf || break
12 | convert -monochrome -transparent white $i.pdf $i.png
13 | print "processing page $i"
14 | [[ ! -f $i.png ]] && break
15 | dim=$(file $i.png 2>/dev/null | cut -d, -f2 | tr -d ' ')
16 | [[ -r ${dim}.png ]] &&
17 | convert ${dim}.png $i.png -flatten -define png:bit-depth=8 ${dim%%.files}.png ||
18 | #else
19 | convert $i.png -flatten -define png:bit-depth=8 ${dim%%.files}.png
20 | rm $i.pdf
21 | i=$((i+1))
22 | done
23 | for dim in *x*.png; do
24 | python ${bindir}/bbox.py ${dim} && mv out.png ${dim%%.png}_bbox.png
25 | done
26 | feh *x*_bbox.png
27 | cd -
28 | 
-------------------------------------------------------------------------------- /scraptils/tools/showcells.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # shows the cells as calculated by pdf2csv in pngs
4 | 
5 | from pdfminer.pdfparser import PDFParser, PDFDocument, PDFTextExtractionNotAllowed
6 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
7 | from pdfminer.pdfdevice import PDFDevice
8 | from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTRect, LTLine
9 | from pdfminer.converter import PDFPageAggregator
10 | from operator import itemgetter
11 | import sys, os
12 | import Image, ImageDraw
13 | 
14 | def pdf2csv(fp):
15 | # Create a PDF parser object associated with the file object.
16 | parser = PDFParser(fp)
17 | # Create a PDF document object that stores the document structure.
18 | doc = PDFDocument()
19 | # Connect the parser and document objects.
20 | parser.set_document(doc)
21 | doc.set_parser(parser)
22 | # Supply the password for initialization.
23 | # (If no password is set, give an empty string.)
24 | doc.initialize('')
25 | # Check if the document allows text extraction. If not, abort.
26 | if not doc.is_extractable:
27 | raise PDFTextExtractionNotAllowed
28 | # Create a PDF resource manager object that stores shared resources.
29 | rsrcmgr = PDFResourceManager()
30 | # Set parameters for analysis.
31 | laparams = LAParams()
32 | # Create a PDF page aggregator object.
33 | device = PDFPageAggregator(rsrcmgr, laparams=laparams)
34 | interpreter = PDFPageInterpreter(rsrcmgr, device)
35 | 
36 | for pageno, page in enumerate(doc.get_pages()):
37 | interpreter.process_page(page)
38 | # receive the LTPage object for the page.
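# Every LTRect/LTLine edge below contributes an x coordinate to hlines and
# a y coordinate to vlines; deduplicated and with near-duplicates dropped by
# filterclose(), these become the column/row separators of the drawn grid.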
39 | layout = device.get_result()
40 | #import code; code.interact(local=locals());
41 | hlines=[]
42 | vlines=[]
43 | for i in layout:
44 | if not type(i) in (LTRect, LTLine): continue
45 | hlines.append(int(i.x0))
46 | hlines.append(int(i.x1))
47 | vlines.append(int(layout.height - i.y0))
48 | vlines.append(int(layout.height - i.y1))
49 | hlines=filterclose(sorted(set(hlines)))
50 | vlines=filterclose(sorted(set(vlines)))
51 | print hlines
52 | print vlines
53 | print (layout.width, layout.height)
54 | i=0
55 | im = Image.new('1', (int(layout.width), int(layout.height)))
56 | draw = ImageDraw.Draw(im)
57 | while(i<len(vlines)-1):
58 | if vlines[i+1]-vlines[i]<5:
59 | i=i+1
60 | continue
61 | j=0
62 | while(j<len(hlines)-1):
63 | if hlines[j+1]-hlines[j]<5:
64 | j=j+1
65 | continue
66 | draw.rectangle([(int(hlines[j]),int(vlines[i])),(int(hlines[j+1]),int(vlines[i+1]))], outline=1)
67 | j=j+1
68 | i=i+1
69 | del draw
70 | fp=open("out%s.png" % pageno,'wb')
71 | im.save(fp,"PNG")
72 | fp.close()
73 | 
74 | def filterclose(lst):
75 | if not lst: return lst
76 | i=1
77 | tmp=[lst[0]]
78 | while i<len(lst):
79 | if lst[i]-2>tmp[-1]:
80 | tmp.append(lst[i])
81 | i=i+1
82 | return tmp
83 | 
84 | if __name__=='__main__':
85 | fp = open(sys.argv[1], 'rb')
86 | pdf2csv(fp)
87 | fp.close()
88 | 
-------------------------------------------------------------------------------- /scraptils/tools/split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # extracts single pages from pdfs.
4 | 
5 | from pyPdf import PdfFileWriter, PdfFileReader
6 | import sys
7 | 
8 | output = PdfFileWriter()
9 | input1 = PdfFileReader(file(sys.argv[1], "rb"))
10 | 
11 | # add the page selected by the second argument from input1 to the output, unchanged
12 | output.addPage(input1.getPage(int(sys.argv[2])))
13 | 
14 | # print how many pages input1 has:
15 | #print "document1.pdf has %s pages." % (input1.getNumPages())
16 | 
17 | # finally, write "output" to the file named by the third argument
18 | outputStream = file(sys.argv[3], "wb")
19 | output.write(outputStream)
20 | 
-------------------------------------------------------------------------------- /scraptils/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # -*- coding: utf-8 -*-
4 | # This file is part of liberit.
5 | 
6 | # liberit is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | 
11 | # liberit is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | 
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with liberit. If not, see <http://www.gnu.org/licenses/>.
18 | 
19 | # (C) 2012 Stefan Marsiske
20 | 
21 | import urllib2, cookielib, time, sys, json
22 | from lxml.html.soupparser import parse
23 | from lxml.etree import tostring
24 | #opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
25 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()),
26 | urllib2.ProxyHandler({'http': 'http://localhost:8123/'}))
27 | opener.addheaders = [('User-agent', 'liberit/0.1')]
28 | 
29 | def fetch(url, retries=5, ignore=[], params=None):
30 | # url to etree
31 | try:
32 | f=opener.open(url, params)
33 | except (urllib2.HTTPError, urllib2.URLError), e:
34 | if hasattr(e, 'code') and e.code>=400 and e.code not in [504, 502]+ignore:
35 | print >>sys.stderr, "[!] %d %s" % (e.code, url)
36 | raise
37 | if retries>0:
38 | timeout=4*(6-retries)
39 | print >>sys.stderr, "[!] failed: %s %s, sleeping %ss" % (getattr(e, 'code', '?'), url, timeout)
40 | time.sleep(timeout)
41 | return fetch(url, retries-1, ignore=ignore)
42 | else:
43 | raise
44 | return parse(f)
45 | 
46 | def dateJSONhandler(obj):
47 | if hasattr(obj, 'isoformat'):
48 | return unicode(obj.isoformat())
49 | else:
50 | raise TypeError, 'Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj))
51 | 
52 | def jdump(d):
53 | return json.dumps(d, indent=1, default=dateJSONhandler, ensure_ascii=False)
54 | 
55 | def unws(txt):
56 | return u' '.join(txt.split())
57 | 
58 | def getFrag(url, path):
59 | return fetch(url).xpath(path)
60 | 
61 | 
-------------------------------------------------------------------------------- /setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 | 
4 | # Utility function to read the README file.
5 | # Used for the long_description. It's nice, because now 1) we have a top level
6 | # README file and 2) it's easier to type in the README file than to put a raw
7 | # string in below ...
8 | def read(fname):
9 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
10 | 
11 | setup(
12 | name = "scraptils",
13 | version = "0.2.2",
14 | author = "Stefan Marsiske",
15 | author_email = "s@ctrlc.hu",
16 | description = ("Scraping and data-mangling utilities"),
17 | license = "AGPLv3+",
18 | keywords = "scraping data",
19 | packages = find_packages(),
20 | url = "http://packages.python.org/scraptils",
21 | py_modules=['scraptils' ],
22 | entry_points={
23 | "console_scripts": ["dump_schema=scraptils.dump_schema:__main__"]
24 | },
25 | long_description=read('README.markdown'),
26 | classifiers = ["Development Status :: 4 - Beta",
27 | "License :: OSI Approved :: GNU Affero General Public License v3",
28 | "Environment :: Web Environment",
29 | ],
30 | )
31 | 
--------------------------------------------------------------------------------
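
The tools compose into small pipelines: csvtojson.py turns delimited text into one JSON object per line, sqlize.py infers and loads a schema from such objects, and dump_schema.py analyzes them. Two minimal sketches, assuming hypothetical input files samples.json (newline-delimited JSON) and input.csv (semicolon-delimited):

    # schema analysis: newline-delimited json -> html report
    #   python scraptils/dump_schema.py -i samples.json -l 1000 -f html > schema.html

    # loading: csv -> json lines -> sqlite
    #   python scraptils/tools/csvtojson.py ';' < input.csv | python scraptils/sqlize.py -d sqlite:///data.sqlite

    # the same building blocks are importable; e.g. diffing two structures
    from scraptils.dump_schema import diff
    old = {'name': 'x', 'tags': ['a', 'b']}
    new = {'name': 'x', 'tags': ['a', 'c']}
    print diff(old, new)  # one entry: type 'changed', path ['tags', 1], data (u'b', u'c')

    # ...or inferring a schema without touching a database
    from scraptils.sqlize import discover
    schema = discover('person', {u'name': u'Ada', u'age': 36, u'employer': {u'name': u'ACME'}})
    # maps table -> {column: sqlalchemy type, '_m2m': [...], '_m2o': [...]}; after
    # createschema() the 'person' table also gets an 'employerid' foreign key
    print schema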