├── .coveragerc
├── .gitignore
├── .travis.yml
├── CHANGES.md
├── LICENSE
├── README.md
├── Vagrantfile
├── conda_recipes
├── cherrypy
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── ddt-word2vec
│ ├── build.sh
│ └── meta.yaml
├── elasticnutch
│ ├── NUTCH-2152.git.patch
│ ├── PubSub_routingkey.patch
│ ├── build.sh
│ ├── meta.yaml
│ ├── nutch-site.xml
│ └── regex-urlfilter.txt
├── elasticsearch-py
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── elasticsearch
│ ├── bld.bat
│ ├── build.sh
│ ├── elasticsearch.yml
│ └── meta.yaml
├── kibana
│ ├── linux32
│ │ ├── bld.bat
│ │ ├── build.sh
│ │ └── meta.yaml
│ ├── linux64
│ │ ├── bld.bat
│ │ ├── build.sh
│ │ └── meta.yaml
│ └── osx
│ │ ├── bld.bat
│ │ ├── build.sh
│ │ └── meta.yaml
├── maven
│ ├── build.sh
│ └── meta.yaml
├── nosexcover
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── pyelasticsearch
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── simplejson
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── tika
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
└── urllib3
│ ├── bld.bat
│ ├── build.sh
│ └── meta.yaml
├── conftest.py
├── deploy
├── deploy_ec2.sh
├── deploy_environment.yml
├── docker.conf
├── dot-htpasswd
├── ec2-fabfile.py
├── gunicorn_start.sh
├── keys
│ └── empty.pem
├── nginx.conf
├── supervisor.conf
└── tad.cfg
├── docker
├── Dockerfile
├── docker_settings.py
├── nginx.conf
├── populate_elasticsearch.sh
├── supervisord.conf
├── tad.cfg
└── tad_run
├── docs
├── Makefile
├── make.bat
├── requirements.txt
└── source
│ ├── _static
│ └── img
│ │ ├── DbVisualizer.png
│ │ ├── ache-buttons.png
│ │ ├── ache-dashboard.png
│ │ ├── ache_stats.png
│ │ ├── add-crawl-model.png
│ │ ├── add-crawl.png
│ │ ├── crawl_settings.png
│ │ ├── crawler_control.png
│ │ ├── create-seeds.png
│ │ ├── edit-seeds.png
│ │ ├── homepage-view.png
│ │ ├── nutch-buttons.png
│ │ ├── nutch-dashboard.png
│ │ ├── nutch_stats.png
│ │ ├── plots.png
│ │ ├── project-form.png
│ │ ├── project-page.png
│ │ ├── seeds-from-trail.png
│ │ ├── seeds-page.png
│ │ ├── seeds_head.png
│ │ ├── testing_guide
│ │ ├── add_index_kibana.png
│ │ ├── crawl_added_success.png
│ │ ├── crawl_buttons_success.png
│ │ ├── crawler_nutch.png
│ │ ├── edit_index_link.png
│ │ ├── index_creation_success.png
│ │ ├── kibana_data.png
│ │ ├── logio_running.png
│ │ ├── nutch_dashboard_initial.png
│ │ ├── project_edit_success.png
│ │ ├── project_success_message.png
│ │ └── settings.png
│ │ ├── upload-files.png
│ │ └── upload-success.png
│ ├── conf.py
│ ├── crawler_guide.rst
│ ├── dev_guide.rst
│ ├── glossary.rst
│ ├── index.rst
│ ├── manual_testing_guide.rst
│ └── user_guide.rst
├── environment.yml
├── pytest.ini
└── source
├── app_setup.sh
├── apps
├── __init__.py
└── crawl_space
│ ├── __init__.py
│ ├── admin.py
│ ├── forms.py
│ ├── migrations
│ ├── 0001_initial.py
│ └── __init__.py
│ ├── models.py
│ ├── settings.py
│ ├── templates
│ └── crawl_space
│ │ ├── add_crawl.html
│ │ ├── add_crawl_model.html
│ │ ├── add_crawl_model_form.html
│ │ ├── crawl.html
│ │ ├── crawl_update_form.html
│ │ └── crawls.html
│ ├── tests
│ ├── functional_crawl_test.py
│ ├── test_crawl.py
│ ├── test_crawl_model.py
│ ├── test_plots.py
│ ├── test_rest_crawl.py
│ └── test_rest_crawl_model.py
│ ├── urls.py
│ ├── utils.py
│ ├── views.py
│ └── viz
│ ├── __init__.py
│ ├── domain.py
│ ├── harvest.py
│ ├── plot.py
│ └── stream.py
├── base
├── __init__.py
├── admin.py
├── deploy_templates
│ ├── docker-compose.yml.jinja2
│ └── nginx-reverse-proxy.conf.jinja2
├── forms.py
├── management
│ ├── __init__.py
│ └── commands
│ │ ├── __init__.py
│ │ └── testbase.py
├── migrations
│ ├── 0001_initial.py
│ └── __init__.py
├── models.py
├── static
│ └── base
│ │ ├── css
│ │ ├── bokeh.min.css
│ │ ├── bootstrap-table.min.css
│ │ ├── bootstrap-theme.css.map
│ │ ├── bootstrap-theme.min.css
│ │ ├── bootstrap.css.map
│ │ ├── bootstrap.min.css
│ │ ├── codemirror-theme.css
│ │ ├── codemirror.css
│ │ ├── custom.css
│ │ ├── dashboard.css
│ │ ├── dashboard.css.map
│ │ ├── dataTables.css
│ │ ├── dropzone.css
│ │ ├── font-awesome.min.css
│ │ ├── forms.css
│ │ ├── jquery-ui.css
│ │ ├── mixins.css
│ │ ├── sweet-alert.css
│ │ └── update.css
│ │ ├── favicon.ico
│ │ ├── fonts
│ │ ├── FontAwesome.otf
│ │ ├── Lato-Regular.ttf
│ │ ├── fontawesome-webfont.eot
│ │ ├── fontawesome-webfont.svg
│ │ ├── fontawesome-webfont.ttf
│ │ ├── fontawesome-webfont.woff
│ │ ├── glyphicons-halflings-regular.eot
│ │ ├── glyphicons-halflings-regular.svg
│ │ ├── glyphicons-halflings-regular.ttf
│ │ └── glyphicons-halflings-regular.woff
│ │ ├── images
│ │ ├── continuum-logo.png
│ │ ├── continuum_analytics_logo.png
│ │ ├── favicon.ico
│ │ ├── kitware.png
│ │ ├── nasa_jpl_logo.png
│ │ └── nyu-logo.png
│ │ ├── imgs
│ │ ├── spritemap.png
│ │ └── spritemap@2x.png
│ │ └── js
│ │ ├── ajax.js
│ │ ├── backbone-min.js
│ │ ├── backbone-min.map
│ │ ├── backbone
│ │ ├── base_views.js
│ │ ├── crawl_models.js
│ │ ├── crawls.js
│ │ ├── edit_seeds_backbone.js
│ │ ├── project_router.js
│ │ ├── project_seeds.js
│ │ ├── projects.js
│ │ ├── seeds.js
│ │ ├── seeds_router.js
│ │ └── trails_backbone.js
│ │ ├── bokeh.min.js
│ │ ├── bootstrap-table.min.js
│ │ ├── bootstrap.min.js
│ │ ├── codemirror-compressed.js
│ │ ├── crawl.js
│ │ ├── crawl_statuses.js
│ │ ├── dataTables.bootstrap.js
│ │ ├── delete_index.js
│ │ ├── doc.min.js
│ │ ├── holder.js
│ │ ├── ie-emulation-modes-warning.js
│ │ ├── ie10-viewport-bug-workaround.js
│ │ ├── jquery-1.10.2.js
│ │ ├── jquery-ui.min.js
│ │ ├── jquery.dataTables.min.js
│ │ ├── jquery.min.js
│ │ ├── project.js
│ │ ├── source_form_check.js
│ │ ├── sweet-alert.min.js
│ │ ├── tad.js
│ │ ├── underscore-min.js
│ │ ├── underscore-min.map
│ │ └── upload_ajax.js
├── templates
│ └── base
│ │ ├── about.html
│ │ ├── add_index.html
│ │ ├── add_project.html
│ │ ├── base.html
│ │ ├── edit_seeds.html
│ │ ├── index.html
│ │ ├── index_update_form.html
│ │ ├── indices.html
│ │ ├── project.html
│ │ ├── project_update_form.html
│ │ ├── seeds_list.html
│ │ ├── sidebar.html
│ │ ├── tad.html
│ │ └── underscore_templates
│ │ ├── crawls_underscore.html
│ │ ├── seeds_underscore.html
│ │ └── underscore_templates.html
├── tests
│ ├── functional_project_test.py
│ ├── test_base.py
│ ├── test_index.py
│ ├── test_rest_project.py
│ └── test_rest_seedslist.py
├── urls.py
└── views.py
├── container_volumes
├── elasticsearch
│ └── data
│ │ └── elasticsearch.yml
└── solr
│ ├── Dockerfile
│ ├── output.json
│ └── solr_entry.sh
├── manage.py
├── memex
├── __init__.py
├── celery.py
├── common_settings.py
├── local_settings.py
├── logs
│ └── __init__.py
├── rest.py
├── settings_files
│ ├── deploy_settings.py
│ └── dev_settings.py
├── supervisor_services.py
├── test_settings.py
├── test_utils
│ ├── __init__.py
│ └── unit_test_utils.py
├── urls.py
└── wsgi.py
├── resources
├── configs
│ └── config_default
│ │ ├── crawler
│ │ └── crawler.cfg
│ │ ├── form_storage
│ │ └── form_storage.cfg
│ │ ├── link_storage
│ │ ├── backlink.cfg
│ │ └── link_storage.cfg
│ │ ├── stoplist.txt
│ │ └── target_storage
│ │ └── target_storage.cfg
├── logs
│ └── README.md
└── profiles
│ ├── af
│ ├── ar
│ ├── bg
│ ├── bn
│ ├── cs
│ ├── da
│ ├── de
│ ├── el
│ ├── en
│ ├── es
│ ├── et
│ ├── fa
│ ├── fi
│ ├── fr
│ ├── gu
│ ├── he
│ ├── hi
│ ├── hr
│ ├── hu
│ ├── id
│ ├── it
│ ├── ja
│ ├── kn
│ ├── ko
│ ├── lt
│ ├── lv
│ ├── mk
│ ├── ml
│ ├── mr
│ ├── ne
│ ├── nl
│ ├── no
│ ├── pa
│ ├── pl
│ ├── pt
│ ├── ro
│ ├── ru
│ ├── sk
│ ├── sl
│ ├── so
│ ├── sq
│ ├── sv
│ ├── sw
│ ├── ta
│ ├── te
│ ├── th
│ ├── tl
│ ├── tr
│ ├── uk
│ ├── ur
│ ├── vi
│ ├── zh-cn
│ └── zh-tw
├── supervisord.conf
├── task_manager
├── __init__.py
├── crawl_tasks.py
├── file_tasks.py
├── migrations
│ ├── 0001_initial.py
│ └── __init__.py
├── models.py
├── tasks.py
├── tests.py
└── tika_tasks.py
├── templates
├── 403.html
├── 404.html
└── 500.html
└── test_resources
├── configs
└── config_default
│ ├── crawler
│ └── crawler.cfg
│ ├── form_storage
│ └── form_storage.cfg
│ ├── link_storage
│ ├── backlink.cfg
│ └── link_storage.cfg
│ ├── stoplist.txt
│ └── target_storage
│ └── target_storage.cfg
├── sample.zip
├── sample2.zip
├── test_crawl_data
├── .DS_Store
├── cats.seeds
├── crawledpages.csv
├── harvestinfo.csv
├── nonrelevantpages.csv
└── relevantpages.csv
└── test_model
├── pageclassifier.features
├── pageclassifier.model
└── sample.seeds
/.coveragerc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/.coveragerc
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Memex Explorer project
2 | resources/
3 | test_resources/
4 | source/memex/settings.py
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | .sass-cache/
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff
54 | *.log
55 | db.sqlite3
56 | 
57 | # Sphinx documentation
58 | docs/_build/
59 | 
60 | # PyBuilder
61 | target/
62 | 
63 | # OSX
64 | .DS_Store
65 | 
66 | # Node.js
67 | node_modules
68 | 
69 | # EC2 keys
70 | deploy/keys/*.pem
71 | deploy/keys/latest.pem
72 | deploy/keys/production.pem
73 | 
74 | # Deployment script with AWS credentials
75 | deploy/nocommit.sh
76 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | 
3 | language: python
4 | 
5 | python:
6 |   - "2.7"
7 | 
8 | branches:
9 |   only:
10 |     - master
11 | 
12 | before_install:
13 |   - "export DISPLAY=:99.0"
14 |   - "sh -e /etc/init.d/xvfb start"
15 | 
16 | install:
17 |   - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
18 |   - bash miniconda.sh -b -p $HOME/miniconda
19 |   - export PATH="$HOME/miniconda/bin:$PATH"
20 |   - conda config --set always_yes yes
21 |   - conda env create -n memex -f environment.yml
22 |   - source activate memex
23 |   - cp source/memex/settings_files/dev_settings.py source/memex/settings.py
24 |   - redis-server &
25 |   - cd source
26 |   - celery -A memex worker -l info &
27 |   - cd ../
28 | 
29 | script:
30 |   - coverage run $(which py.test)
31 |   - coverage report --show-missing
32 | 
33 | after_success:
34 |   - coveralls
35 | 
36 | notifications:
37 |   email: false
38 |   flowdock: "8736cc7601ed0761f6c9339ec747ded6"
39 |   on_success: "change"
40 |   on_failure: "always"
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | # 0.4
2 | 
3 | Released November 2015
4 | 
5 | * Nutch streaming visualizations
6 | * Nutch REST API support
7 | * Removed log.io
8 | * DataWake trail support
9 | * Bug fixes
10 | 
11 | # 0.3
12 | 
13 | Released September 2015
14 | 
15 | * Backbone.js support for crawls, projects, and crawl models
16 | * Django REST API
17 | * conda Vagrantfile support
18 | * Bug fixes
19 | 
20 | # 0.2.1
21 | 
22 | * Fix environment.yml naming
23 | 
24 | # 0.2
25 | 
26 | Released August 2015
27 | 
28 | * Ache and Nutch crawlers now both dump into an Elasticsearch index.
29 | * Upload files directly into a Tika parser / Elasticsearch index
30 | * Celery task management for crawls/parsing
31 | * Integration for: Kibana, Tika, log.io, Domain Discovery Tool, Temporal Anomaly Detection
32 | * conda packages for all core packages on OS X and Linux
33 | * Modal (AJAX) user interface for more responsive task management
34 | * Other general user interface improvements
35 | 
36 | # 0.1.1
37 | 
38 | Released January 2015
39 | 
40 | Minor UI fixes
41 | 
42 | # 0.1
43 | 
44 | Released January 2015
45 | 
46 | Initial release
47 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014-2015 Continuum Analytics, Inc.
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | * Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/memex-explorer/memex-explorer.svg?branch=master)](https://travis-ci.org/memex-explorer/memex-explorer)
2 | [![Coverage Status](https://coveralls.io/repos/ContinuumIO/memex-explorer/badge.svg?branch=master)](https://coveralls.io/r/ContinuumIO/memex-explorer?branch=master)
3 | 
4 | # *DISCLAIMER*
5 | 
6 | *Memex Explorer has been put on hold; support and development on this project have ceased for the immediate future.*
7 | 
8 | # memex-explorer
9 | 
10 | Memex Explorer is a web application that provides easy-to-use interfaces for gathering, analyzing, and graphing web crawl data.
11 | 
12 | # Local Development
13 | To set up your machine, you will need Anaconda or Miniconda installed. Miniconda is a minimal Anaconda installation that bootstraps conda and Python on any operating system. Install Anaconda from http://continuum.io/downloads or Miniconda from http://conda.pydata.org/miniconda.html
14 | 
15 | Clone the repository, then:
16 | 
17 | ```bash
18 | cd memex-explorer/source
19 | ```
20 | 
21 | Run the following commands:
22 | 
23 | ```bash
24 | $ ./app_setup.sh
25 | $ source activate memex
26 | $ supervisord
27 | ```
28 | 
29 | These commands set up a conda environment named `memex`, prepare the application by creating an empty database, and then launch all of the necessary services for the application. If there are any problems with any of these commands, please report them as a [GitHub issue](https://github.com/memex-explorer/memex-explorer/issues).
30 | 
31 | If you have already run the install script, simply run `supervisord` from the `memex-explorer/source` directory to restart all of the services.
32 | 
33 | Running `supervisord` starts supervisor in the foreground, which in turn ensures that all services associated with the core Memex Explorer environment are running. To stop supervisord and the associated services, send an interrupt to the process with `Ctrl-c`.
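
While supervisord is running, individual services can be inspected or restarted with `supervisorctl`. A minimal sketch, assuming the program names used in the bundled supervisor configurations (for example `celery` and `django`, as defined in `docker/supervisord.conf`; the names in your local `source/supervisord.conf` may differ):

```bash
$ supervisorctl status           # list every managed service and its state
$ supervisorctl restart celery   # restart a single service by name
```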
34 | 
35 | **Memex Explorer will now be running locally at http://localhost:8000**
36 | 
37 | # Testing
38 | 
39 | To run memex-explorer tests, use the following command from within an active environment:
40 | 
41 | ```
42 | $ py.test
43 | ```
44 | 
45 | # Building the Documentation
46 | The project documentation is written in [reStructuredText](http://docutils.sf.net/rst.html) and can be built using [Sphinx](http://sphinx-doc.org/).
47 | 
48 | ```
49 | $ cd docs
50 | $ make html
51 | ```
52 | 
53 | The documentation is then available at `build/html/index.html`
54 | 
55 | # Administration
56 | 
57 | To access the administration panel, navigate to http://localhost:8000/admin (or the equivalent deployed URL) after starting Memex Explorer. Here you will be able to view and make manual changes to the database.
58 | 
--------------------------------------------------------------------------------
/conda_recipes/cherrypy/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 
--------------------------------------------------------------------------------
/conda_recipes/cherrypy/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | $PYTHON setup.py install
4 | 
5 | # Add more build steps here, if they are necessary.
6 | 
7 | # See
8 | # http://docs.continuum.io/conda/build.html
9 | # for a list of environment variables that are set during the build process.
10 | 
--------------------------------------------------------------------------------
/conda_recipes/cherrypy/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: cherrypy
3 |   version: "3.8.0"
4 | 
5 | source:
6 |   fn: CherryPy-3.8.0.tar.gz
7 |   url: https://pypi.python.org/packages/source/C/CherryPy/CherryPy-3.8.0.tar.gz
8 |   md5: 542b96b2cd825e8120e8cd822bc18f4b
9 | # patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | # build:
14 |   # noarch_python: True
15 |   # preserve_egg_dir: True
16 |   # entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function. For example
19 |     #
20 |     # - cherrypy = cherrypy:main
21 |     #
22 |     # Would create an entry point called cherrypy that calls cherrypy.main()
23 | 
24 | 
25 |   # If this is a new build for the same version, increment the build
26 |   # number. If you do not include this key, it defaults to 0.
27 |   # number: 1
28 | 
29 | requirements:
30 |   build:
31 |     - python
32 |     - setuptools
33 | 
34 |   run:
35 |     - python
36 | 
37 | test:
38 |   # Python imports
39 |   imports:
40 |     - cherrypy
41 |     - cherrypy.lib
42 |     - cherrypy.process
43 |     - cherrypy.scaffold
44 |     - cherrypy.test
45 |     - cherrypy.tutorial
46 |     - cherrypy.wsgiserver
47 | 
48 |   # commands:
49 |     # You can put test commands to be run here. Use this to test that the
50 |     # entry points work.
51 | 
52 | 
53 |   # You can also put a file called run_test.py in the recipe that will be run
54 |   # at test time.
55 | 
56 |   # requires:
57 |     # Put any additional test requirements here. For example
58 |     # - nose
59 | 
60 | about:
61 |   home: http://www.cherrypy.org
62 |   license: BSD License
63 |   summary: 'Object-Oriented HTTP framework'
64 | 
65 | # See
66 | # http://docs.continuum.io/conda/build.html for
67 | # more information about meta.yaml
68 | 
--------------------------------------------------------------------------------
/conda_recipes/ddt-word2vec/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -vp ${PREFIX}/data;
4 | 
5 | curl https://s3.amazonaws.com/vida-nyu/DDT/D_cbow_pdw_8B.pkl -o ${PREFIX}/data/D_cbow_pdw_8B.pkl
6 | 
--------------------------------------------------------------------------------
/conda_recipes/ddt-word2vec/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: ddt-word2vec
3 |   version: 0.0.1
4 | 
5 | build:
6 |   number: 1
7 | 
8 | requirements:
9 |   build:
10 |     - curl
11 | 
12 | # Source
13 | # from: https://s3.amazonaws.com/vida-nyu/DDT/D_cbow_pdw_8B.pkl
14 | 
15 | # Contact the authors of the below paper for more information about this dataset
16 | about:
17 |   home: http://arxiv.org/abs/1504.00548
18 |   license: CC BY
19 |   summary: 'word2vec model compiled using https://code.google.com/p/word2vec/source/browse/trunk/demo-train-big-model-v1.sh'
20 | 
--------------------------------------------------------------------------------
/conda_recipes/elasticnutch/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "$(uname)" == "Darwin" ]; then
4 |   export JAVA_HOME=$(/usr/libexec/java_home)
5 |   export JRE_HOME=${JAVA_HOME}/jre
6 | else
7 |   export JAVA_HOME="/usr/lib/jvm/java-7-openjdk-amd64/"
8 |   export JRE_HOME="/usr/lib/jvm/java-7-openjdk-amd64/jre"
9 | fi
10 | 
11 | mkdir -vp ${PREFIX}/bin;
12 | mkdir -vp ${PREFIX}/lib/nutch/bin;
13 | mkdir -vp ${PREFIX}/lib/nutch/lib;
14 | mkdir -vp ${PREFIX}/lib/nutch/plugins;
15 | mkdir -vp ${PREFIX}/lib/nutch/conf;
16 | 
17 | # apply the pub/sub enabling patch
18 | patch -p0 < ${RECIPE_DIR}/PubSub_routingkey.patch
19 | # apply the common crawl dump service patch
20 | patch -p1 < ${RECIPE_DIR}/NUTCH-2152.git.patch
21 | 
22 | # build nutch
23 | ant
24 | 
25 | pushd runtime/local/
26 | cp -r bin/* ${PREFIX}/lib/nutch/bin/
27 | cp -r lib/* ${PREFIX}/lib/nutch/lib/
28 | cp -r plugins/* ${PREFIX}/lib/nutch/plugins/
29 | cp -r conf/* ${PREFIX}/lib/nutch/conf/
30 | popd
31 | 
32 | cp ${RECIPE_DIR}/nutch-site.xml ${PREFIX}/lib/nutch/conf/
33 | 
34 | pushd "${PREFIX}/bin"
35 | 
36 | cat > ${PREFIX}/bin/nutch < ${PREFIX}/bin/crawl<=1.8,<2.0
--------------------------------------------------------------------------------
/conda_recipes/elasticsearch-py/meta.yaml:
--------------------------------------------------------------------------------
34 | 
35 |   run:
36 |     - python
37 |     - urllib3 >=1.8,<2.0
38 | 
39 | test:
40 |   # Python imports
41 |   imports:
42 |     - elasticsearch
43 |     - elasticsearch.client
44 |     - elasticsearch.connection
45 |     - elasticsearch.connection.esthrift
46 |     - elasticsearch.helpers
47 | 
48 |   # commands:
49 |     # You can put test commands to be run here. Use this to test that the
50 |     # entry points work.
51 | 
52 | 
53 |   # You can also put a file called run_test.py in the recipe that will be run
54 |   # at test time.
55 | 
56 |   requires:
57 |     - coverage
58 |     - mock
59 |     - nose
60 |     - nosexcover
61 |     - pyaml
62 |     - pylibmc ==1.4.1
63 |     - requests >=1.0.0,<3.0.0
64 |   # Put any additional test requirements here. For example
65 |   # - nose
66 | 
67 | about:
68 |   home: https://github.com/elastic/elasticsearch-py
69 |   license: Apache Software License
70 |   summary: 'Python client for Elasticsearch'
71 | 
72 | # See
73 | # http://docs.continuum.io/conda/build.html for
74 | # more information about meta.yaml
75 | 
--------------------------------------------------------------------------------
/conda_recipes/elasticsearch/bld.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/conda_recipes/elasticsearch/bld.bat
--------------------------------------------------------------------------------
/conda_recipes/elasticsearch/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -vp ${PREFIX}/bin;
2 | mkdir -vp ${PREFIX}/lib/elasticsearch/bin;
3 | mkdir -vp ${PREFIX}/lib/elasticsearch/lib;
4 | mkdir -vp ${PREFIX}/lib/elasticsearch/config;
5 | mkdir ${PREFIX}/lib/elasticsearch/lib/sigar;
6 | 
7 | mvn clean package -DskipTests;
8 | 
9 | cp -r target/lib/* ${PREFIX}/lib/elasticsearch/lib/;
10 | cp target/elasticsearch-1.6.0.jar ${PREFIX}/lib/elasticsearch/lib/elasticsearch-1.6.0.jar;
11 | cp -r target/bin/* ${PREFIX}/lib/elasticsearch/bin/;
12 | cp -r config/* ${PREFIX}/lib/elasticsearch/config/;
13 | cp ${RECIPE_DIR}/elasticsearch.yml ${PREFIX}/lib/elasticsearch/config/elasticsearch.yml;
14 | 
15 | rm ${PREFIX}/lib/elasticsearch/bin/*.exe;
16 | 
17 | chmod +x ${PREFIX}/lib/elasticsearch/bin/elasticsearch;
18 | 
19 | pushd "${PREFIX}/bin"
20 | ln -vs "../lib/elasticsearch/bin/elasticsearch" elasticsearch
21 | 
--------------------------------------------------------------------------------
/conda_recipes/elasticsearch/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: elasticsearch
3 |   version: "1.6.0"
4 | 
5 | source:
6 |   git_rev: v1.6.0
7 |   git_url: https://github.com/elastic/elasticsearch.git
8 | 
9 | build:
10 |   number: 0
11 | 
12 | requirements:
13 |   build:
14 |     - apache-maven
15 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux32/bld.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/conda_recipes/kibana/linux32/bld.bat
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux32/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -vp ${PREFIX}/bin;
2 | mkdir -vp ${PREFIX}/lib/kibana/bin;
3 | mkdir -vp ${PREFIX}/lib/kibana/config;
4 | mkdir -vp ${PREFIX}/lib/kibana/plugins;
5 | mkdir -vp ${PREFIX}/lib/kibana/node;
6 | mkdir -vp ${PREFIX}/lib/kibana/src;
7 | 
8 | cp -r src/* ${PREFIX}/lib/kibana/src/;
9 | cp -r plugins/* ${PREFIX}/lib/kibana/plugins/;
10 | cp -r node/* ${PREFIX}/lib/kibana/node/;
11 | cp -r config/* ${PREFIX}/lib/kibana/config/;
12 | 
13 | cp bin/kibana ${PREFIX}/lib/kibana/bin/kibana;
14 | cp README.txt ${PREFIX}/lib/kibana/;
15 | cp LICENSE.txt ${PREFIX}/lib/kibana/;
16 | 
17 | chmod +x ${PREFIX}/lib/kibana/bin/kibana;
18 | 
19 | pushd "${PREFIX}/bin";
20 | ln -vs "../lib/kibana/bin/kibana" kibana;
21 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux32/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: kibana
3 |   version: "4.1.1"
4 | 
5 | source:
6 |   fn: kibana-4.1.1-linux-x86.tar.gz
7 |   url: https://download.elastic.co/kibana/kibana/kibana-4.1.1-linux-x86.tar.gz
8 |   sha1: 8f756e680189f991ebdb081178613ec39381a0c9
9 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux64/bld.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/conda_recipes/kibana/linux64/bld.bat
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux64/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -vp ${PREFIX}/bin;
2 | mkdir -vp ${PREFIX}/lib/kibana/bin;
3 | mkdir -vp ${PREFIX}/lib/kibana/config;
4 | mkdir -vp ${PREFIX}/lib/kibana/plugins;
5 | mkdir -vp ${PREFIX}/lib/kibana/node;
6 | mkdir -vp ${PREFIX}/lib/kibana/src;
7 | 
8 | cp -r src/* ${PREFIX}/lib/kibana/src/;
9 | cp -r plugins/* ${PREFIX}/lib/kibana/plugins/;
10 | cp -r node/* ${PREFIX}/lib/kibana/node/;
11 | cp -r config/* ${PREFIX}/lib/kibana/config/;
12 | 
13 | cp bin/kibana ${PREFIX}/lib/kibana/bin/kibana;
14 | cp README.txt ${PREFIX}/lib/kibana/;
15 | cp LICENSE.txt ${PREFIX}/lib/kibana/;
16 | 
17 | chmod +x ${PREFIX}/lib/kibana/bin/kibana;
18 | 
19 | pushd "${PREFIX}/bin";
20 | ln -vs "../lib/kibana/bin/kibana" kibana;
21 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/linux64/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: kibana
3 |   version: "4.1.1"
4 | 
5 | source:
6 |   fn: kibana-4.1.1-linux-x64.tar.gz
7 |   url: https://download.elastic.co/kibana/kibana/kibana-4.1.1-linux-x64.tar.gz
8 |   sha1: d43e039adcea43e1808229b9d55f3eaee6a5edb9
9 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/osx/bld.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/conda_recipes/kibana/osx/bld.bat
--------------------------------------------------------------------------------
/conda_recipes/kibana/osx/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -vp ${PREFIX}/bin;
2 | mkdir -vp ${PREFIX}/lib/kibana/bin;
3 | mkdir -vp ${PREFIX}/lib/kibana/config;
4 | mkdir -vp ${PREFIX}/lib/kibana/plugins;
5 | mkdir -vp ${PREFIX}/lib/kibana/node;
6 | mkdir -vp ${PREFIX}/lib/kibana/src;
7 | 
8 | cp -r src/* ${PREFIX}/lib/kibana/src/;
9 | cp -r plugins/* ${PREFIX}/lib/kibana/plugins/;
10 | cp -r node/* ${PREFIX}/lib/kibana/node/;
11 | cp -r config/* ${PREFIX}/lib/kibana/config/;
12 | 
13 | cp bin/kibana ${PREFIX}/lib/kibana/bin/kibana;
14 | cp README.txt ${PREFIX}/lib/kibana/;
15 | cp LICENSE.txt ${PREFIX}/lib/kibana/;
16 | 
17 | chmod +x ${PREFIX}/lib/kibana/bin/kibana;
18 | 
19 | pushd "${PREFIX}/bin";
20 | ln -vs "../lib/kibana/bin/kibana" kibana;
21 | 
--------------------------------------------------------------------------------
/conda_recipes/kibana/osx/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: kibana
3 |   version: "4.1.1"
4 | 
5 | source:
6 |   fn: kibana-4.1.1-darwin-x64.tar.gz
7 |   url: https://download.elastic.co/kibana/kibana/kibana-4.1.1-darwin-x64.tar.gz
8 |   sha1: 8b9dbf67ea9c56029ee211a6ef4ff3396be5cf7c
9 | 
--------------------------------------------------------------------------------
/conda_recipes/maven/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Build dependencies:
4 | #   - > Java 1.7
5 | 
6 | 
7 | mkdir -vp ${PREFIX}/bin;
8 | mkdir -vp ${PREFIX}/lib;
9 | mkdir -vp ${PREFIX}/lib/maven/bin;
10 | mkdir -vp ${PREFIX}/lib/maven/boot;
11 | mkdir -vp ${PREFIX}/lib/maven/conf;
12 | mkdir -vp ${PREFIX}/lib/maven/lib;
13 | 
14 | cp -va bin/* ${PREFIX}/lib/maven/bin/;
15 | cp -va boot/* ${PREFIX}/lib/maven/boot/;
16 | cp -va conf/* ${PREFIX}/lib/maven/conf/;
17 | cp -va lib/* ${PREFIX}/lib/maven/lib/;
18 | 
19 | chmod +x ${PREFIX}/lib/maven/bin/*;
20 | 
21 | pushd "${PREFIX}/bin"
22 | ln -vs "../lib/maven/bin/mvn" mvn
23 | ln -vs "../lib/maven/bin/mvnDebug" mvnDebug
24 | ln -vs "../lib/maven/bin/mvnyjp" mvnyjp
25 | popd
26 | 
--------------------------------------------------------------------------------
/conda_recipes/maven/meta.yaml:
--------------------------------------------------------------------------------
1 | 
2 | package:
3 |   name: maven
4 |   version: 3.3.3
5 | 
6 | source:
7 |   fn: apache-maven-3.3.3-bin.zip
8 |   url: http://www.webhostingjams.com/mirror/apache/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.zip
9 |   md5: 6e5da03a3324f616493a0fd09d6383fc
10 | 
11 | build:
12 |   number: 0
13 | 
14 | requirements:
15 |   build:
16 | 
17 |   run:
18 | 
19 | test:
20 |   commands:
21 |     - mvn --version
22 | 
23 | # install maven to /prefix/lib/maven instead of /prefix/ to isolate it from other installers
24 | # these redundancies can be removed by capturing maven dependencies
25 | 
26 | about:
27 |   home: http://maven.apache.org/
28 |   license: Apache License v2.0
29 | 
30 | 
--------------------------------------------------------------------------------
/conda_recipes/nosexcover/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 
--------------------------------------------------------------------------------
/conda_recipes/nosexcover/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | $PYTHON setup.py install
4 | 
5 | # Add more build steps here, if they are necessary.
6 | 
7 | # See
8 | # http://docs.continuum.io/conda/build.html
9 | # for a list of environment variables that are set during the build process.
10 | 
--------------------------------------------------------------------------------
/conda_recipes/nosexcover/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: nosexcover
3 |   version: "1.0.10"
4 | 
5 | source:
6 |   fn: nosexcover-1.0.10.tar.gz
7 |   url: https://pypi.python.org/packages/source/n/nosexcover/nosexcover-1.0.10.tar.gz
8 |   md5: 12bf494a801b376debeb6a167c247391
9 | # patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | build:
14 |   # noarch_python: True
15 |   preserve_egg_dir: True
16 |   # entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function. For example
19 |     #
20 |     # - nosexcover = nosexcover:main
21 |     #
22 |     # Would create an entry point called nosexcover that calls nosexcover.main()
23 | 
24 | 
25 |   # If this is a new build for the same version, increment the build
26 |   # number. If you do not include this key, it defaults to 0.
27 |   # number: 1
28 | 
29 | requirements:
30 |   build:
31 |     - python
32 |     - setuptools
33 |     - nose
34 |     - coverage >=3.4
35 | 
36 |   run:
37 |     - python
38 |     - setuptools
39 |     - nose
40 |     - coverage >=3.4
41 | 
42 | test:
43 |   # Python imports
44 |   imports:
45 |     - nosexcover
46 | 
47 |   # commands:
48 |     # You can put test commands to be run here. Use this to test that the
49 |     # entry points work.
50 | 
51 | 
52 |   # You can also put a file called run_test.py in the recipe that will be run
53 |   # at test time.
54 | 
55 |   # requires:
56 |     # Put any additional test requirements here. For example
57 |     # - nose
58 | 
59 | about:
60 |   home: http://github.com/cmheisel/nose-xcover/
61 |   license: BSD License
62 |   summary: 'Extends nose.plugins.cover to add Cobertura-style XML reports'
63 | 
64 | # See
65 | # http://docs.continuum.io/conda/build.html for
66 | # more information about meta.yaml
67 | 
--------------------------------------------------------------------------------
/conda_recipes/pyelasticsearch/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 
--------------------------------------------------------------------------------
/conda_recipes/pyelasticsearch/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | $PYTHON setup.py install
4 | 
5 | # Add more build steps here, if they are necessary.
6 | 
7 | # See
8 | # http://docs.continuum.io/conda/build.html
9 | # for a list of environment variables that are set during the build process.
10 | 
--------------------------------------------------------------------------------
/conda_recipes/pyelasticsearch/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: pyelasticsearch
3 |   version: "1.4"
4 | 
5 | source:
6 |   fn: pyelasticsearch-1.4.tar.gz
7 |   url: https://pypi.python.org/packages/source/p/pyelasticsearch/pyelasticsearch-1.4.tar.gz
8 |   md5: ed61ebb7b253364e55b4923d11e17049
9 | # patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | build:
14 |   noarch_python: True
15 |   # preserve_egg_dir: True
16 |   # entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function. For example
19 |     #
20 |     # - pyelasticsearch = pyelasticsearch:main
21 |     #
22 |     # Would create an entry point called pyelasticsearch that calls pyelasticsearch.main()
23 | 
24 | 
25 |   # If this is a new build for the same version, increment the build
26 |   # number. If you do not include this key, it defaults to 0.
27 |   # number: 1
28 | 
29 | requirements:
30 |   build:
31 |     - python
32 |     - setuptools
33 |     - certifi
34 |     - elasticsearch-py >=1.3.0,<2.0.0
35 |     - urllib3 >=1.8,<2.0
36 |     - simplejson >=3.0
37 |     - six >=1.4.0,<2.0
38 | 
39 |   run:
40 |     - python
41 |     - certifi
42 |     - elasticsearch-py >=1.3.0,<2.0.0
43 |     - urllib3 >=1.8,<2.0
44 |     - simplejson >=3.0
45 |     - six >=1.4.0,<2.0
46 | 
47 | test:
48 |   # Python imports
49 |   imports:
50 |     - pyelasticsearch
51 |     - pyelasticsearch.tests
52 | 
53 |   # commands:
54 |     # You can put test commands to be run here. Use this to test that the
55 |     # entry points work.
56 | 
57 | 
58 |   # You can also put a file called run_test.py in the recipe that will be run
59 |   # at test time.
60 | 
61 |   requires:
62 |     - mock
63 |     - nose >=1.2.1
64 |   # Put any additional test requirements here. For example
65 |   # - nose
66 | 
67 | about:
68 |   home: https://github.com/pyelasticsearch/pyelasticsearch
69 |   license: BSD License
70 |   summary: 'Flexible, high-scale API to elasticsearch'
71 | 
72 | # See
73 | # http://docs.continuum.io/conda/build.html for
74 | # more information about meta.yaml
75 | 
--------------------------------------------------------------------------------
/conda_recipes/simplejson/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 
--------------------------------------------------------------------------------
/conda_recipes/simplejson/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | $PYTHON setup.py install
4 | 
5 | # Add more build steps here, if they are necessary.
6 | 
7 | # See
8 | # http://docs.continuum.io/conda/build.html
9 | # for a list of environment variables that are set during the build process.
10 | 
--------------------------------------------------------------------------------
/conda_recipes/simplejson/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: simplejson
3 |   version: "3.8.0"
4 | 
5 | source:
6 |   fn: simplejson-3.8.0.tar.gz
7 |   url: https://pypi.python.org/packages/source/s/simplejson/simplejson-3.8.0.tar.gz
8 |   md5: 72f3b93a6f9808df81535f79e79565a2
9 | # patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | # build:
14 |   # noarch_python: True
15 |   # preserve_egg_dir: True
16 |   # entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function. For example
19 |     #
20 |     # - simplejson = simplejson:main
21 |     #
22 |     # Would create an entry point called simplejson that calls simplejson.main()
23 | 
24 | 
25 |   # If this is a new build for the same version, increment the build
26 |   # number. If you do not include this key, it defaults to 0.
27 |   # number: 1
28 | 
29 | requirements:
30 |   build:
31 |     - python
32 |     - setuptools
33 | 
34 |   run:
35 |     - python
36 | 
37 | test:
38 |   # Python imports
39 |   imports:
40 |     - simplejson
41 |     - simplejson.tests
42 | 
43 |   # commands:
44 |     # You can put test commands to be run here. Use this to test that the
45 |     # entry points work.
46 | 
47 | 
48 |   # You can also put a file called run_test.py in the recipe that will be run
49 |   # at test time.
50 | 
51 |   # requires:
52 |     # Put any additional test requirements here. For example
53 |     # - nose
54 | 
55 | about:
56 |   home: http://github.com/simplejson/simplejson
57 |   license: MIT License or Academic Free License (AFL)
58 |   summary: 'Simple, fast, extensible JSON encoder/decoder for Python'
59 | 
60 | # See
61 | # http://docs.continuum.io/conda/build.html for
62 | # more information about meta.yaml
63 | 
--------------------------------------------------------------------------------
/conda_recipes/tika/bld.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/conda_recipes/tika/bld.bat
--------------------------------------------------------------------------------
/conda_recipes/tika/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -vp ${PREFIX}/bin;
2 | mkdir -vp ${PREFIX}/lib/tika-rest-server/bin;
3 | mkdir -vp ${PREFIX}/lib/tika-rest-server/lib;
4 | 
5 | cp tika-server-1.9.jar ${PREFIX}/lib/tika-rest-server/lib/;
6 | 
7 | echo -e '#!/bin/bash\nexec java -jar $(dirname $(dirname $(which tika-rest-server)))/lib/tika-server-1.9.jar "$@"' > ${PREFIX}/lib/tika-rest-server/bin/tika-rest-server;
8 | chmod +x "${PREFIX}/lib/tika-rest-server/bin/tika-rest-server";
9 | 
10 | echo -e '#!/bin/bash\nexec java -jar $(dirname $(dirname $(which tika-rest-server)))/lib/tika-rest-server/lib/tika-server-1.9.jar "$@"' > ${PREFIX}/bin/tika-rest-server;
11 | chmod +x "${PREFIX}/bin/tika-rest-server";
12 | 
--------------------------------------------------------------------------------
/conda_recipes/tika/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: tika-rest-server
3 |   version: "1.9.0"
4 | 
5 | source:
6 |   fn: tika-server-1.9.jar
7 |   url: http://repo1.maven.org/maven2/org/apache/tika/tika-server/1.9/tika-server-1.9.jar
8 |   sha1: 24a9d553ec95c55be8bd06090e9593f874ac5f10
9 | 
--------------------------------------------------------------------------------
/conda_recipes/urllib3/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 
--------------------------------------------------------------------------------
/conda_recipes/urllib3/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | $PYTHON setup.py install
4 | 
5 | # Add more build steps here, if they are necessary.
6 | 
7 | # See
8 | # http://docs.continuum.io/conda/build.html
9 | # for a list of environment variables that are set during the build process.
10 | 
--------------------------------------------------------------------------------
/conda_recipes/urllib3/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 |   name: urllib3
3 |   version: "1.11"
4 | 
5 | source:
6 |   fn: urllib3-1.11.tar.gz
7 |   url: https://pypi.python.org/packages/source/u/urllib3/urllib3-1.11.tar.gz
8 |   md5: 4cdfe9b51c181700f9c7beb84bb42770
9 | # patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | # build:
14 |   # noarch_python: True
15 |   # preserve_egg_dir: True
16 |   # entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function. For example
19 |     #
20 |     # - urllib3 = urllib3:main
21 |     #
22 |     # Would create an entry point called urllib3 that calls urllib3.main()
23 | 
24 | 
25 |   # If this is a new build for the same version, increment the build
26 |   # number. If you do not include this key, it defaults to 0.
27 |   # number: 1
28 | 
29 | requirements:
30 |   build:
31 |     - python
32 |     - setuptools
33 | 
34 |   run:
35 |     - python
36 | 
37 | test:
38 |   # Python imports
39 |   imports:
40 |     - urllib3
41 |     - urllib3.contrib
42 |     - urllib3.packages
43 |     - urllib3.packages.ssl_match_hostname
44 |     - urllib3.util
45 | 
46 |   # commands:
47 |     # You can put test commands to be run here. Use this to test that the
48 |     # entry points work.
49 | 
50 | 
51 |   # You can also put a file called run_test.py in the recipe that will be run
52 |   # at test time.
53 | 
54 |   requires:
55 |     - mock
56 |     - nose
57 |     - tornado
58 |   # Put any additional test requirements here. For example
59 |   # - nose
60 | 
61 | about:
62 |   home: http://urllib3.readthedocs.org/
63 |   license: MIT License
64 |   summary: 'HTTP library with thread-safe connection pooling, file post, and more.'
65 | 
66 | # See
67 | # http://docs.continuum.io/conda/build.html for
68 | # more information about meta.yaml
69 | 
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | def pytest_addoption(parser):
4 |     """Add `--runslow` option to py.test."""
5 |     parser.addoption("--runslow", action="store_true",
6 |                      help="run slow tests")
7 | 
8 | def pytest_runtest_setup(item):
9 |     """pytest items marked `slow` should not run by default."""
10 |     if 'slow' in item.keywords and not item.config.getoption("--runslow"):
11 |         pytest.skip("need --runslow option to run")
12 | 
--------------------------------------------------------------------------------
/deploy/deploy_ec2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source activate memex_deploy
3 | GIT_BRANCH='production' SECURITY_GROUP='memex-explorer-prod' HTPASSWD_PATH='XXX' AWS_ID='XXX' AWS_SECRET='XXX' python ec2-fabfile.py
4 | 
--------------------------------------------------------------------------------
/deploy/deploy_environment.yml:
--------------------------------------------------------------------------------
1 | name : memex_deploy
2 | dependencies:
3 |   - boto
4 |   - fabric
--------------------------------------------------------------------------------
/deploy/docker.conf:
--------------------------------------------------------------------------------
1 | DOCKER_OPTS="-s aufs"
2 | 
--------------------------------------------------------------------------------
/deploy/dot-htpasswd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/deploy/dot-htpasswd
--------------------------------------------------------------------------------
/deploy/gunicorn_start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | NAME="memex_explorer"                   # Name of the application
4 | DJANGODIR=/vagrant/source               # Django project directory
5 | SOCKFILE=/home/vagrant/gunicorn.sock    # we will communicate using this unix socket
6 | USER=vagrant
7 | GROUP=vagrant
8 | NUM_WORKERS=3                           # how many worker processes should Gunicorn spawn
9 | DJANGO_SETTINGS_MODULE=memex.settings   # which settings file should Django use
10 | DJANGO_WSGI_MODULE=memex.wsgi           # WSGI module name
11 | 
12 | echo "Starting $NAME as `whoami`"
13 | 
14 | # Activate the virtual environment
15 | cd $DJANGODIR
16 | export DJANGO_SETTINGS_MODULE=$DJANGO_SETTINGS_MODULE
17 | export PYTHONPATH=$DJANGODIR:$PYTHONPATH
18 | 
19 | # Create the run directory if it doesn't exist
20 | RUNDIR=$(dirname $SOCKFILE)
21 | test -d $RUNDIR || mkdir -p $RUNDIR
22 | 
23 | echo $DJANGODIR
24 | echo gunicorn ${DJANGO_WSGI_MODULE}:application
25 | echo `pwd`
26 | # Start your Django Unicorn
27 | # Programs meant to be run under supervisor should not daemonize themselves (do not use --daemon)
28 | gunicorn ${DJANGO_WSGI_MODULE}:application \
29 |   --name $NAME \
30 |   --workers $NUM_WORKERS \
31 |   --user $USER --group $GROUP \
32 |   --bind unix:$SOCKFILE \
33 | #  --log-level=debug \
34 | #  --log-file=-
35 | 
--------------------------------------------------------------------------------
/deploy/keys/empty.pem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/deploy/keys/empty.pem
--------------------------------------------------------------------------------
/deploy/nginx.conf:
--------------------------------------------------------------------------------
1 | server {
2 |     listen 80;
3 |     server_name explorer.continuum.io;
4 |     client_max_body_size 100M;
5 | 
6 |     auth_basic "Restricted";
7 |     auth_basic_user_file /etc/nginx/.htpasswd;
8 | 
9 |     location / {
10 |         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
11 |         proxy_set_header X-Real-IP $remote_addr;
12 |         proxy_set_header Host $host;
13 |         proxy_pass http://unix:/home/vagrant/gunicorn.sock;
14 |         proxy_set_header Authorization "Basic a2luZzppc25ha2Vk";
15 |     }
16 | 
17 |     location /kibana/ {
18 |         rewrite /kibana/(.*) /$1 break;
19 |         proxy_pass http://0.0.0.0:9999/;
20 |         proxy_redirect off;
21 |         proxy_set_header Host $host;
22 |     }
23 | 
24 |     location /logio/ {
25 |         rewrite /logio/(.*) /$1 break;
26 |         proxy_pass http://0.0.0.0:28778/;
27 |         proxy_redirect off;
28 |         proxy_set_header Host $host;
29 |     }
30 | 
31 |     location /static/ {
32 |         rewrite ^/static/(.*)$ /$1 break;
33 |         root /vagrant/source/base/static/;
34 |     }
35 | }
36 | server {
37 |     listen 8000;
38 |     server_name localhost 0.0.0.0 127.0.0.1;
39 |     client_max_body_size 100M;
40 | 
41 |     location / {
42 |         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
43 |         proxy_set_header X-Real-IP $remote_addr;
44 |         proxy_set_header Host $host;
45 |         proxy_pass http://unix:/home/vagrant/gunicorn.sock;
46 |         proxy_set_header Authorization "Basic a2luZzppc25ha2Vk";
47 |     }
48 | 
49 |     location /kibana/ {
50 |         rewrite /kibana/(.*) /$1 break;
51 |         proxy_pass http://0.0.0.0:9999/;
52 |         proxy_redirect off;
53 |         proxy_set_header Host $host;
54 |     }
55 | 
56 |     location /logio/ {
57 |         rewrite /logio/(.*) /$1 break;
58 |         proxy_pass http://0.0.0.0:28778/;
59 |         proxy_redirect off;
60 |         proxy_set_header Host $host;
61 |     }
62 | 
63 |     location /static/ {
64 |         rewrite ^/static/(.*)$ /$1 break;
65 |         root /vagrant/source/base/static/;
66 |     }
67 | }
68 | 
--------------------------------------------------------------------------------
/deploy/supervisor.conf:
--------------------------------------------------------------------------------
1 | [unix_http_server]
2 | file=/home/vagrant/supervisor.sock   ; (the path to the socket file)
3 | chmod=0700                           ; socket file mode (default 0700)
4 | 
5 | [supervisord]
6 | logfile=/home/vagrant/supervisord.log ; (main log file;default $CWD/supervisord.log)
7 | pidfile=/home/vagrant/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
8 | ; childlogdir=/home/vagrant;          ('AUTO' child log dir, default $TEMP)
9 | user=vagrant
10 | 
11 | ; the below section must remain in the config file for RPC
12 | ; (supervisorctl/web interface) to work, additional interfaces may be
13 | ; added by defining them in separate rpcinterface: sections
14 | [rpcinterface:supervisor]
15 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
16 | 
17 | [supervisorctl]
18 | serverurl=unix:///home/vagrant/supervisor.sock ; use a unix:// URL for a unix socket
19 | 
20 | ; The [include] section can just contain the "files" setting. This
21 | ; setting can list multiple files (separated by whitespace or
22 | ; newlines). It can also contain wildcards. The filenames are
23 | ; interpreted as relative to this file. Included files *cannot*
24 | ; include files themselves.
25 | 
26 | [program:memex_explorer]
27 | command = /vagrant/deploy/gunicorn_start.sh
28 | user = vagrant
29 | cwd = /vagrant/source
30 | stdout_logfile = /home/vagrant/memex_explorer.log
31 | redirect_stderr = true
32 | autostart = true
33 | killasgroup = true
34 | 
--------------------------------------------------------------------------------
/deploy/tad.cfg:
--------------------------------------------------------------------------------
1 | [ElasticSearch]
2 | protocol = http
3 | host = explorer.continuum.io
4 | port = 9200
5 | 
6 | default_index = yahoo-scrapes
7 | time_field = dt
8 | 
--------------------------------------------------------------------------------
/docker/docker_settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for memex project.
3 | 
4 | For more information on this file, see
5 | https://docs.djangoproject.com/en/1.7/topics/settings/
6 | 
7 | For the full list of settings and their values, see
8 | https://docs.djangoproject.com/en/1.7/ref/settings/
9 | """
10 | 
11 | from common_settings import *
12 | import os
13 | 
14 | # SECURITY WARNING: Not setting VIRTUAL_HOST prevents Django from being able to verify headers
15 | ALLOWED_HOSTS = [os.environ.get('VIRTUAL_HOST', '*')]
16 | 
17 | # SECURITY WARNING: keep the secret key used in production secret!
18 | SECRET_KEY = '0#t((zq66&3*87djaltu-pn34%0p!*v_332f2p!$2i)w5y17f8'
19 | 
20 | # SECURITY WARNING: don't run with debug turned on in production!
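# Note: os.environ.get returns the variable's value as a string whenever it is
# set, so any non-empty value (including the string "False") makes PRODUCTION
# truthy below; only an unset variable falls back to the boolean False default.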
21 | PRODUCTION = os.environ.get('PRODUCTION', False)
22 | 
23 | # when INLINE is true, point to local sources for changes/documentation instead of remote ones
24 | INLINE = os.environ.get('INLINE', False)
25 | 
26 | if PRODUCTION:
27 |     DEBUG = False
28 |     TEMPLATE_DEBUG = False
29 |     DEPLOYMENT = True
30 | else:
31 |     DEBUG = True
32 |     TEMPLATE_DEBUG = True
33 |     INSTALLED_APPS += ('debug_toolbar',)
34 |     DEPLOYMENT = False
35 | 
36 | 
37 | MEDIA_ROOT = os.path.join(BASE_DIR, 'resources')
38 | PROJECT_PATH = os.path.join(MEDIA_ROOT, "projects")
39 | 
40 | VIRTUAL_HOST = os.environ.get('VIRTUAL_HOST', 'localhost')
41 | PROTOCOL = os.environ.get('HTTP_PROTOCOL', 'http')
42 | 
43 | REST_FRAMEWORK = {
44 |     'DEFAULT_AUTHENTICATION_CLASSES': (
45 |         'rest_framework.authentication.SessionAuthentication',
46 |     ),
47 |     'DEFAULT_PERMISSION_CLASSES': (
48 |         'rest_framework.permissions.AllowAny',
49 |     ),
50 |     'DEFAULT_FILTER_BACKENDS': ('rest_framework.filters.DjangoFilterBackend',)
51 | }
52 | 
53 | # ddt is treated as an external service for now
54 | 
55 | EXTERNAL_APP_LOCATIONS = {
56 |     'bokeh-server': '/bokeh',
57 |     'ddt': PROTOCOL + '://' + VIRTUAL_HOST + ':8084',
58 |     'tad': '/tad',
59 |     'kibana': '/kibana',
60 | }
61 | 
--------------------------------------------------------------------------------
/docker/nginx.conf:
--------------------------------------------------------------------------------
1 | server {
2 |     listen 80;
3 |     server_name explorer.continuum.io;
4 | 
5 |     client_max_body_size 100M;
6 | 
7 |     location / {
8 |         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
9 |         proxy_set_header X-Real-IP $remote_addr;
10 |         proxy_set_header Host $host;
11 |         # todo: make this a variable below
12 |         proxy_pass http://127.0.0.1:8000;
13 |     }
14 | 
15 |     location /kibana/ {
16 |         rewrite /kibana/(.*) /$1 break;
17 |         proxy_pass http://127.0.0.1:5601/;
18 |         proxy_redirect off;
19 |         proxy_set_header Host $host;
20 |     }
21 | 
22 |     location /bokeh/ {
23 |         proxy_pass http://127.0.0.1:5006/;
24 |         proxy_cache off;
25 |         proxy_cookie_domain localhost explorer.continuum.io;
26 |         sub_filter 'http://localhost:5006' 'https://explorer.continuum.io';
27 |     }
28 | 
29 |     location /bokeh/sub {
30 |         proxy_pass http://localhost:5006;
31 |         proxy_http_version 1.1;
32 |         proxy_set_header Upgrade $http_upgrade;
33 |         proxy_set_header Connection "upgrade";
34 |     }
35 | 
36 |     location /static {
37 |         rewrite ^/static/(.*)$ /$1 break;
38 |         root /home/explorer/memex-explorer/source/base/static/;
39 |     }
40 | }
--------------------------------------------------------------------------------
/docker/populate_elasticsearch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | source activate memex
4 | echo "memex activated"
5 | elasticsearch &
6 | ES_PID=$!
7 | echo "elasticsearch launched with pid: $ES_PID"
8 | elasticdump --bulk=true --input=elasticdump.json --output=http://localhost:9200/
9 | echo "elasticdump to localhost:9200 complete!"
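# shut down the temporary Elasticsearch instance now that the dump has been loaded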
10 | kill $ES_PID
11 | echo "elasticsearch with pid: $ES_PID has been killed"
12 | 
--------------------------------------------------------------------------------
/docker/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisorctl]
2 | username=cloud-user
3 | 
4 | [supervisord]
5 | childlogdir=resources/logs
6 | logfile=supervisord.log        ; (main log file;default $CWD/supervisord.log)
7 | logfile_maxbytes=50MB          ; (max main logfile bytes b4 rotation;default 50MB)
8 | logfile_backups=10             ; (num of main logfile rotation backups;default 10)
9 | loglevel=info                  ; (log level;default info; others: debug,warn,trace)
10 | pidfile=supervisord.pid        ; (supervisord pidfile;default supervisord.pid)
11 | nodaemon=true                  ; (start in foreground if true;default false)
12 | minfds=1024                    ; (min. avail startup file descriptors;default 1024)
13 | minprocs=200                   ; (min. avail process descriptors;default 200)
14 | 
15 | [inet_http_server]
16 | port = 127.0.0.1:9001
17 | 
18 | [program:nginx]
19 | command=sudo nginx -g 'daemon off;'
20 | priority=1
21 | autostart=true
22 | 
23 | [program:redis]
24 | command=redis-server
25 | priority=1
26 | 
27 | [program:elasticsearch]
28 | command=elasticsearch
29 | priority=1
30 | 
31 | [program:nutch]
32 | command=nutch startserver
33 | priority=1
34 | 
35 | [program:rabbitmq]
36 | command=sudo rabbitmq-server ; need administrative access on Linux systems
37 | priority=1
38 | autostart=true
39 | 
40 | [program:bokeh-server]
41 | command=bokeh-server --backend memory --port 5006 --ws-conn-string %(ENV_WS_PROTOCOL)s://explorer.continuum.io/bokeh/sub
42 | priority=1
43 | autostart=true
44 | 
45 | [program:tika]
46 | command=tika-rest-server
47 | priority=2
48 | 
49 | [program:kibana]
50 | command=kibana
51 | priority=2
52 | 
53 | [program:ddt]
54 | command=ddt
55 | priority=5
56 | autostart=true
57 | 
58 | [program:tad]
59 | command=tad
60 | priority=5
61 | autostart=false
62 | 
63 | # have to bring up celery after other applications
64 | [program:celery]
65 | command=celery -A memex worker -l info --loglevel=debug --logfile=resources/logs/celery-worker.log
66 | priority=10
67 | killasgroup=true
68 | 
69 | [program:django]
70 | command=gunicorn memex.wsgi:application -b 0.0.0.0:8000
71 | priority=20
72 | stopasgroup=true
73 | 
74 | [rpcinterface:supervisor]
75 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
76 | 
--------------------------------------------------------------------------------
/docker/tad.cfg:
--------------------------------------------------------------------------------
1 | [ElasticSearch]
2 | protocol = http
3 | host = localhost
4 | username =
5 | password =
6 | port = 9200
7 | 
8 | default_index = yahoo-scrapes
9 | time_field = dt
10 | 
--------------------------------------------------------------------------------
/docker/tad_run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export C_FORCE_ROOT="true"
4 | sudo rabbitmq-server -detached
5 | sleep 2
6 | cd /service/tad
7 | ./start_worker &
8 | ./start_service
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx-rtd-theme
--------------------------------------------------------------------------------
/docs/source/_static/img/DbVisualizer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/DbVisualizer.png -------------------------------------------------------------------------------- /docs/source/_static/img/ache-buttons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/ache-buttons.png -------------------------------------------------------------------------------- /docs/source/_static/img/ache-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/ache-dashboard.png -------------------------------------------------------------------------------- /docs/source/_static/img/ache_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/ache_stats.png -------------------------------------------------------------------------------- /docs/source/_static/img/add-crawl-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/add-crawl-model.png -------------------------------------------------------------------------------- /docs/source/_static/img/add-crawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/add-crawl.png -------------------------------------------------------------------------------- /docs/source/_static/img/crawl_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/crawl_settings.png -------------------------------------------------------------------------------- /docs/source/_static/img/crawler_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/crawler_control.png -------------------------------------------------------------------------------- /docs/source/_static/img/create-seeds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/create-seeds.png -------------------------------------------------------------------------------- /docs/source/_static/img/edit-seeds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/edit-seeds.png -------------------------------------------------------------------------------- /docs/source/_static/img/homepage-view.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/homepage-view.png -------------------------------------------------------------------------------- /docs/source/_static/img/nutch-buttons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/nutch-buttons.png -------------------------------------------------------------------------------- /docs/source/_static/img/nutch-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/nutch-dashboard.png -------------------------------------------------------------------------------- /docs/source/_static/img/nutch_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/nutch_stats.png -------------------------------------------------------------------------------- /docs/source/_static/img/plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/plots.png -------------------------------------------------------------------------------- /docs/source/_static/img/project-form.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/project-form.png -------------------------------------------------------------------------------- /docs/source/_static/img/project-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/project-page.png -------------------------------------------------------------------------------- /docs/source/_static/img/seeds-from-trail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/seeds-from-trail.png -------------------------------------------------------------------------------- /docs/source/_static/img/seeds-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/seeds-page.png -------------------------------------------------------------------------------- /docs/source/_static/img/seeds_head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/seeds_head.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/add_index_kibana.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/add_index_kibana.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/crawl_added_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/crawl_added_success.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/crawl_buttons_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/crawl_buttons_success.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/crawler_nutch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/crawler_nutch.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/edit_index_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/edit_index_link.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/index_creation_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/index_creation_success.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/kibana_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/kibana_data.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/logio_running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/logio_running.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/nutch_dashboard_initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/nutch_dashboard_initial.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/project_edit_success.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/project_edit_success.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/project_success_message.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/project_success_message.png -------------------------------------------------------------------------------- /docs/source/_static/img/testing_guide/settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/testing_guide/settings.png -------------------------------------------------------------------------------- /docs/source/_static/img/upload-files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/upload-files.png -------------------------------------------------------------------------------- /docs/source/_static/img/upload-success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/docs/source/_static/img/upload-success.png -------------------------------------------------------------------------------- /docs/source/glossary.rst: -------------------------------------------------------------------------------- 1 | ######## 2 | Glossary 3 | ######## 4 | 5 | Service 6 | Anything that provides external functionality not included 7 | directly in Memex Explorer. Current examples include particular 8 | applications such as DDT, Tika, Kibana, and Elasticsearch. 9 | 10 | Stack 11 | A particular set of Services in a working configuration. This 12 | term is not used frequently in the documentation. 13 | 14 | Instance 15 | A version of Memex Explorer running on a given host, together with 16 | its associated stack and databases. An instance may have 17 | multiple projects. 18 | 19 | Project 20 | A data and application warehouse within Memex Explorer. Each 21 | project usually shares its application stack with other projects. 22 | 23 | Domain Challenge 24 | A problem set, such as human trafficking, MRS, or Ebola. 25 | 26 | Skin 27 | A particular UI (text, CSS, etc.) on a particular webpage for a domain challenge. 28 | 29 | Celery 30 | A task manager implemented in Python which manages several tasks in Memex Explorer, including the crawlers. 31 | 32 | Redis 33 | A key-value store database which is used by Celery to keep information about task history and task queues. 34 | 35 | Django 36 | A Python web application framework. Django is the core of the Memex Explorer application. 37 | 38 | Crawl Space 39 | Provides services for crawling the web using Nutch or ACHE. 40 | 41 | Task Manager 42 | Manages application tasks, such as running crawls. The task manager is not accessible from the Memex Explorer GUI.
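For illustration only, a minimal Celery application of the kind the *Celery*
and *Redis* entries above describe might look like the sketch below (the
module name, broker URL, and task body are illustrative assumptions, not
code from Memex Explorer)::

    from celery import Celery

    # Redis acts as the message broker that queues tasks for Celery workers.
    app = Celery('sketch', broker='redis://localhost:6379/0')

    @app.task
    def run_crawl(crawl_id):
        # A long-running job, such as launching a crawler, would execute
        # here in a worker process instead of blocking a web request.
        return crawl_id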
43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Memex Explorer 2 | ============== 3 | 4 | Memex Explorer is a web application that provides easy-to-use interfaces for gathering, analyzing, and graphing web crawl data. 5 | 6 | For usage instructions, please refer to the `User's Guide `_. 7 | 8 | For more information about the project architecture, please refer to our `Developer's Guide `_ and `API Guide `_. 9 | 10 | Memex Explorer is built by `Continuum Analytics `_, with grants and support from the `NASA Jet Propulsion Laboratory `_, `Kitware `_, and the `NYU Polytechnic School of Engineering `_. 11 | 12 | Contents: 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | user_guide 18 | crawler_guide 19 | dev_guide 20 | manual_testing_guide 21 | glossary 22 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: memex 2 | channels: 3 | - blaze 4 | - memex 5 | - quasiben 6 | - brittainhard 7 | - bokeh 8 | dependencies: 9 | - elasticsearch=1.6.0 10 | - tika-rest-server=1.9.0 11 | - kibana 12 | - bokeh=0.10.0 13 | # certifi needed by bokeh server! 14 | - certifi 15 | - blaze=0.8.2 16 | - python=2.7.8 17 | - nutch-python>=1.11 18 | - elasticnutch>=1.11 19 | - ache=0.3.0 20 | - django=1.7.3 21 | - gunicorn 22 | - pytest 23 | - pip 24 | - jinja2 25 | - redis-py 26 | - nodejs 27 | - pip: 28 | - tld 29 | - tika==1.8.7 30 | - selenium 31 | - coverage 32 | - pytest-cov 33 | - pytest-django 34 | - pytest-ordering 35 | - django-crispy-forms 36 | - django-debug-toolbar 37 | - coveralls 38 | - sphinx-rtd-theme 39 | - celery 40 | - elasticsearch 41 | - supervisor 42 | - djangorestframework 43 | - django-filter 44 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | DJANGO_SETTINGS_MODULE = memex.test_settings 3 | addopts = source 4 | -------------------------------------------------------------------------------- /source/app_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pushd ../; 3 | conda update conda -y; 4 | conda install conda-env -y; 5 | conda env update --file environment.yml; 6 | popd; 7 | 8 | pushd memex; 9 | cp settings_files/dev_settings.py settings.py; 10 | popd; 11 | 12 | source activate memex; 13 | python manage.py migrate; 14 | 15 | source deactivate; 16 | -------------------------------------------------------------------------------- /source/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/apps/__init__.py -------------------------------------------------------------------------------- /source/apps/crawl_space/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/apps/crawl_space/__init__.py -------------------------------------------------------------------------------- /source/apps/crawl_space/admin.py: -------------------------------------------------------------------------------- 1 | from 
django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /source/apps/crawl_space/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | import apps.crawl_space.models 6 | import django.db.models.deletion 7 | import django.core.validators 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | dependencies = [ 13 | ('base', '0001_initial'), 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name='Crawl', 19 | fields=[ 20 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 21 | ('name', models.CharField(unique=True, max_length=64, validators=[django.core.validators.RegexValidator(b'^[a-zA-Z0-9-_ ]+$', b'Only numbers, letters, underscores, dashes and spaces are allowed.')])), 22 | ('slug', models.SlugField(unique=True, max_length=64)), 23 | ('description', models.TextField(blank=True)), 24 | ('crawler', models.CharField(max_length=64, choices=[(b'nutch', b'Nutch'), (b'ache', b'ACHE')])), 25 | ('status', models.CharField(default=b'NOT STARTED', max_length=64)), 26 | ('config', models.CharField(default=b'config_default', max_length=64)), 27 | ('seeds_list', models.FileField(default=None, null=True, upload_to=apps.crawl_space.models.get_seeds_upload_path, blank=True)), 28 | ('pages_crawled', models.BigIntegerField(default=0)), 29 | ('harvest_rate', models.FloatField(default=0)), 30 | ('rounds_left', models.IntegerField(default=1, null=True, blank=True)), 31 | ], 32 | options={ 33 | }, 34 | bases=(models.Model,), 35 | ), 36 | migrations.CreateModel( 37 | name='CrawlModel', 38 | fields=[ 39 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 40 | ('name', models.CharField(unique=True, max_length=64, validators=[django.core.validators.RegexValidator(b'^[a-zA-Z0-9-_ ]+$', b'Only numbers, letters, underscores, dashes and spaces are allowed.')])), 41 | ('slug', models.SlugField(unique=True, max_length=64)), 42 | ('model', models.FileField(upload_to=apps.crawl_space.models.get_model_upload_path, validators=[apps.crawl_space.models.validate_model_file])), 43 | ('features', models.FileField(upload_to=apps.crawl_space.models.get_model_upload_path, validators=[apps.crawl_space.models.validate_features_file])), 44 | ('project', models.ForeignKey(to='base.Project')), 45 | ], 46 | options={ 47 | }, 48 | bases=(models.Model,), 49 | ), 50 | migrations.AddField( 51 | model_name='crawl', 52 | name='crawl_model', 53 | field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, default=None, blank=True, to='crawl_space.CrawlModel', null=True), 54 | preserve_default=True, 55 | ), 56 | migrations.AddField( 57 | model_name='crawl', 58 | name='project', 59 | field=models.ForeignKey(to='base.Project'), 60 | preserve_default=True, 61 | ), 62 | migrations.AddField( 63 | model_name='crawl', 64 | name='seeds_object', 65 | field=models.ForeignKey(to='base.SeedsList', on_delete=django.db.models.deletion.PROTECT), 66 | preserve_default=True, 67 | ), 68 | ] 69 | -------------------------------------------------------------------------------- /source/apps/crawl_space/migrations/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/apps/crawl_space/migrations/__init__.py -------------------------------------------------------------------------------- /source/apps/crawl_space/settings.py: -------------------------------------------------------------------------------- 1 | """Crawl settings.""" 2 | 3 | import os, sys 4 | 5 | """ 6 | Inserts path to project root into sys.path of crawl_supervisor. 7 | 8 | Splits the directory path to this settings file, and cuts off the path up 9 | to the root of the project directory, allowing crawl_supervisor to import 10 | modules from other apps. 11 | """ 12 | sys.path.insert(1, '/'.join(os.path.dirname(__file__).split('/')[:-2])) 13 | 14 | """ 15 | Ensures that the settings module used by crawl_supervisor is the one 16 | used by the rest of the apps in the project. 17 | """ 18 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "memex.settings") 19 | 20 | import django 21 | from django.conf import settings 22 | 23 | resources_dir = settings.MEDIA_ROOT 24 | 25 | crawl_resources_dir = os.path.join(settings.BASE_DIR, "resources") 26 | 27 | # ACHE language detection files. 28 | # TODO Investigate using conda-installed ACHE resources. 29 | LANG_DETECT_PATH = os.path.join(crawl_resources_dir, 'profiles') 30 | 31 | CCA_PATH = os.path.join(resources_dir, 'cca') 32 | CRAWL_PATH = os.path.join(resources_dir, 'crawls') 33 | MODEL_PATH = os.path.join(resources_dir, 'models') 34 | CONFIG_PATH = os.path.join(crawl_resources_dir, 'configs') 35 | IMAGES_PATH = os.path.join(resources_dir, 'images') 36 | 37 | # Directory to store seed files temporarily. See `Crawl.save()` in 38 | # `crawl_space.models`. 39 | SEEDS_TMP_DIR = os.path.join(resources_dir, 'seeds_tmp') 40 | MODELS_TMP_DIR = os.path.join(resources_dir, 'models_tmp') 41 | 42 | # Location of the SOLR instance. 43 | SOLR_URL = "http://localhost:8983/solr/" -------------------------------------------------------------------------------- /source/apps/crawl_space/templates/crawl_space/add_crawl.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% block title %}Add Crawl{% endblock %} 5 | 6 | {% load staticfiles %} 7 | 8 | {% block extrafooter %} 9 | 10 | {% endblock %} 11 | 12 | {% block content %} 13 | 14 | {% include "base/sidebar.html" %} 15 | 16 |
17 |
18 | 21 |
22 |
23 |
24 |

Add Crawl

25 | {% crispy form %} 26 |
27 |
28 |
29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /source/apps/crawl_space/templates/crawl_space/add_crawl_model.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% block title %}Add Crawl Model{% endblock %} 5 | 6 | {% block content %} 7 | 8 | {% include "base/sidebar.html" %} 9 | 10 |
11 |
12 | 15 |
16 |
17 |
18 |

Add Crawl Model

19 | {% crispy form %} 20 |
21 | {% if "ddt" in settings.READY_EXTERNAL_APPS %} 22 | Domain Discovery 24 | 25 | {% endif %} 26 |
27 |
28 | 29 | {% endblock %} 30 | 31 | -------------------------------------------------------------------------------- /source/apps/crawl_space/templates/crawl_space/add_crawl_model_form.html: -------------------------------------------------------------------------------- 1 | 9 |
10 | {% csrf_token %} 11 |
12 | 15 |
16 | 17 | 18 | 19 | 20 |
21 |
22 |
23 | 26 |
27 | 28 | 29 | 30 | 31 |
32 |
33 |
34 | 37 |
38 | 39 | 40 | 41 | 42 |
43 |
44 |
45 |
46 | 47 |
48 |
49 | 50 |
51 |
52 | 53 |
54 | -------------------------------------------------------------------------------- /source/apps/crawl_space/templates/crawl_space/crawl_update_form.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% block title %}Crawl Settings{% endblock %} 5 | 6 | {% block content %} 7 | 8 | {% include "base/sidebar.html" %} 9 | 10 |
11 |
12 |
13 |

Edit Crawl

14 |
15 |
16 |
25 | {% csrf_token %} 26 | 33 |
34 |
35 |
36 |
37 |
38 | {% crispy form %} 39 |
40 |
41 |
42 | {% endblock %} 43 | 44 | {% load staticfiles %} 45 | 46 | {% block extrahead %} 47 | 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /source/apps/crawl_space/templates/crawl_space/crawls.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load static %} 3 | 4 | {% block title %}{{project.name}}: Crawls{% endblock %} 5 | {% block content %} 6 | 7 | {% include "base/sidebar.html" %} 8 | 9 |
10 |
11 |

Crawls

12 |
13 | {% for crawl in project.crawl_set.all %} 14 |
15 | 16 |

{{ crawl.name }}

17 |
18 |

{{ crawl.description }}

19 |
20 | {% if crawl.crawler == 'nutch' %} 21 | 22 | 23 | 24 | 25 | 26 | {% if crawl.image_space %} 27 | 28 | {#TODO #} 29 | {% endif %} 30 | 31 | 32 |
Pages Crawled
{{ crawl.image_space.images.all() | length }}{{ crawl.pages_crawled }}
33 | {% else %} 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |
Pages CrawledHarvest Rate
{{ crawl.pages_crawled }}{{ crawl.harvest_rate }}
44 | {% endif %} 45 |
46 |
47 | 48 | 49 | 50 | {% empty %} 51 |
52 |

There are no crawls associated with this project.

53 |
54 | 55 | 56 | 57 | {% endfor %} 58 |
59 |
60 | Add Crawl 61 |
62 |
63 |
64 | 65 | {% endblock %} 66 | -------------------------------------------------------------------------------- /source/apps/crawl_space/tests/test_crawl.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import os 4 | import shutil 5 | import json 6 | 7 | # Test 8 | from memex.test_utils.unit_test_utils import UnitTestSkeleton, form_errors, get_object 9 | from django.test import TestCase 10 | from django.core.files.uploadedfile import SimpleUploadedFile 11 | 12 | import pytest 13 | 14 | # App 15 | from apps.crawl_space.forms import AddCrawlForm 16 | from apps.crawl_space.models import Crawl, CrawlModel 17 | from base.models import Project, SeedsList, alphanumeric_validator 18 | 19 | 20 | def assert_form_errors(response, *errors): 21 | """Given a response, assert that only the given `errors` 22 | are present in the form response.""" 23 | 24 | efe = expected_form_errors = set(errors) 25 | assert set(form_errors(response).keys()) - efe == set() 26 | 27 | 28 | class TestViews(UnitTestSkeleton): 29 | 30 | @classmethod 31 | def setUpClass(cls): 32 | """Initialize a test project and crawl model, 33 | and save them to the test database.""" 34 | super(TestViews, cls).setUpClass() 35 | 36 | cls.test_project = Project( 37 | name = "Test", 38 | description = "Test Project Description" 39 | ) 40 | cls.test_project.save() 41 | 42 | cls.test_seeds_list = SeedsList( 43 | name = "Test Seeds", 44 | seeds = json.dumps([ 45 | "http://www.reddit.com/r/aww", 46 | "http://gizmodo.com/of-course-japan-has-an-island-where-cats-outnumber-peop-1695365964", 47 | "http://en.wikipedia.org/wiki/Cat", 48 | "http://www.catchannel.com/", 49 | "http://mashable.com/category/cats/", 50 | "http://www.huffingtonpost.com/news/cats/", 51 | "http://www.lolcats.com/" 52 | ]), 53 | ) 54 | cls.test_seeds_list.save() 55 | 56 | cls.test_crawl = Crawl( 57 | name = "Test Crawl", 58 | description = "Test Crawl Description", 59 | crawler = "nutch", 60 | config = "config_default", 61 | project = cls.test_project, 62 | seeds_object = cls.test_seeds_list 63 | ) 64 | cls.test_crawl.save() 65 | 66 | @classmethod 67 | def get_seeds(self): 68 | """Return a new instance of SimpleUploadedFile. 
This file can only 69 | be used once.""" 70 | 71 | return SimpleUploadedFile('ht.seeds', bytes('This is some content.\n'), 'utf-8') 72 | 73 | @property 74 | def crawl_slugs(self): 75 | """Return a dictionary with a "test" project slug and 76 | a "test-crawl" crawl slug.""" 77 | 78 | return dict(slugs=dict( 79 | project_slug="test", 80 | crawl_slug="test-crawl")) 81 | 82 | def test_crawl_delete(self): 83 | response = self.post('base:crawl_space:delete_crawl', 84 | **self.crawl_slugs) 85 | assert 'base/project.html' in response.template_name 86 | -------------------------------------------------------------------------------- /source/apps/crawl_space/tests/test_crawl_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | # Test 3 | from memex.test_utils.unit_test_utils import UnitTestSkeleton, form_errors 4 | from django.test import TestCase 5 | from django.db import IntegrityError 6 | from django.core.files.uploadedfile import SimpleUploadedFile 7 | 8 | # App 9 | from base.forms import AddProjectForm 10 | from base.models import Project 11 | from test_crawl import assert_form_errors 12 | 13 | 14 | class TestAddDataModelView(UnitTestSkeleton): 15 | 16 | @classmethod 17 | def setUpClass(cls): 18 | super(TestAddDataModelView, cls).setUpClass() 19 | cls.test_project = Project( 20 | name = u"Model Test", 21 | description = "Test Project Description") 22 | cls.test_project.save() 23 | 24 | @property 25 | def slugs(self): 26 | return dict(slugs=dict( 27 | project_slug="model-test")) 28 | 29 | def get_model_file(self): 30 | return SimpleUploadedFile('pageclassifier.model', bytes('This is a model file.\n'), 'utf-8') 31 | 32 | def get_features_file(self): 33 | return SimpleUploadedFile('pageclassifier.features', bytes('This is a features file.\n'), 'utf-8') 34 | 35 | def test_add_model_page(self): 36 | response = self.get('base:crawl_space:add_crawl_model', **self.slugs) 37 | assert 'crawl_space/add_crawl_model.html' in response.template_name 38 | 39 | def test_add_model_no_data(self): 40 | response = self.post('base:crawl_space:add_crawl_model', **self.slugs) 41 | assert_form_errors(response, 'name', 'model', 'features') 42 | 43 | def test_add_model_no_name(self): 44 | response = self.post('base:crawl_space:add_crawl_model', 45 | { 46 | 'model': self.get_model_file(), 47 | 'features': self.get_features_file(), 48 | }, 49 | **self.slugs) 50 | assert_form_errors(response, 'name') 51 | 52 | def test_add_model_no_model(self): 53 | response = self.post('base:crawl_space:add_crawl_model', 54 | { 55 | 'name': 'Test Model', 56 | 'features': self.get_features_file(), 57 | }, 58 | **self.slugs) 59 | assert_form_errors(response, 'model') 60 | 61 | def test_add_model_no_features(self): 62 | response = self.post('base:crawl_space:add_crawl_model', 63 | { 64 | 'name': 'Test Model', 65 | 'model': self.get_model_file(), 66 | }, 67 | **self.slugs) 68 | assert_form_errors(response, 'features') 69 | 70 | def test_add_model_success(self): 71 | response = self.post('base:crawl_space:add_crawl_model', 72 | { 73 | 'name': 'Test Model', 74 | 'model': self.get_model_file(), 75 | 'features': self.get_features_file(), 76 | }, 77 | **self.slugs) 78 | -------------------------------------------------------------------------------- /source/apps/crawl_space/urls.py: -------------------------------------------------------------------------------- 1 | """Crawl space url routes. 
2 | 3 | Each of these url patterns is built off a project URL- 4 | 5 | /project/<project_slug>/ 6 | 7 | -and belongs to the "crawl_space" namespace. See `base.urls` for an 8 | explanation of the dynamic route generation. 9 | 10 | URLs 11 | ---- 12 | 13 | project/<project_slug>/ 14 | add_crawl/ 15 | add_crawl_model/ 16 | crawls/ 17 | crawls/<crawl_slug>/ 18 | crawls/<crawl_slug>/settings/ 19 | """ 20 | 21 | from django.conf.urls import patterns, url 22 | 23 | from apps.crawl_space import views 24 | 25 | urlpatterns = patterns('', 26 | url(r'^add_crawl/$', views.AddCrawlView.as_view(), 27 | name='add_crawl'), 28 | url(r'^add_crawl_model/$', views.AddCrawlModelView.as_view(), 29 | name='add_crawl_model'), 30 | url(r'^crawl_models/(?P<model_slug>[\w-]+)/delete/$', views.DeleteCrawlModelView.as_view(), 31 | name='delete_crawl_model'), 32 | url(r'^crawls/$', views.ListCrawlsView.as_view(), 33 | name='crawls'), 34 | url(r'^crawls/(?P<crawl_slug>[\w-]+)/$', views.CrawlView.as_view(), 35 | name='crawl'), 36 | url(r'^crawls/(?P<crawl_slug>[\w-]+)/settings/$', views.CrawlSettingsView.as_view(), 37 | name='crawl_settings'), 38 | url(r'^crawls/(?P<crawl_slug>[\w-]+)/settings/delete/$', views.DeleteCrawlView.as_view(), 39 | name='delete_crawl'), 40 | ) 41 | 42 | -------------------------------------------------------------------------------- /source/apps/crawl_space/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | 4 | 5 | def touch(fname, times=None): 6 | with open(fname, 'a'): 7 | os.utime(fname, times) 8 | 9 | def ensure_exists(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as e: 13 | if e.errno == errno.EEXIST: # (path exists) 14 | pass 15 | if not os.path.isdir(path): 16 | raise 17 | 18 | def rm_if_exists(filename): 19 | try: 20 | os.remove(filename) 21 | return True 22 | except OSError as e: 23 | if e.errno != errno.ENOENT: # (no such file or directory) 24 | raise 25 | return False 26 | -------------------------------------------------------------------------------- /source/apps/crawl_space/viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/apps/crawl_space/viz/__init__.py -------------------------------------------------------------------------------- /source/apps/crawl_space/viz/harvest.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import os 3 | from blaze import into 4 | import pandas as pd 5 | from bokeh.plotting import figure 6 | from bokeh.models import ColumnDataSource, HoverTool 7 | from collections import OrderedDict 8 | from bokeh.embed import components 9 | import subprocess 10 | import shlex 11 | from StringIO import StringIO 12 | 13 | 14 | GREEN = "#47a838" 15 | DARK_GRAY = "#2e2e2e" 16 | LIGHT_GRAY = "#6e6e6e" 17 | 18 | 19 | class Harvest(object): 20 | """Create a line plot to compare the growth of crawled and relevant pages in the crawl.""" 21 | 22 | def __init__(self, crawl): 23 | self.source = None 24 | self.harvest_data = os.path.join(crawl.get_crawl_path(), 'data_monitor/harvestinfo.csv') 25 | 26 | def update_source(self): 27 | proc = subprocess.Popen(shlex.split("tail -n 800 %s" % self.harvest_data), 28 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | 30 | stdout, stderr = proc.communicate() 31 | 32 | if stderr or not stdout: 33 | raise ValueError("harvestinfo.csv is empty") 34 | 35 | # Converts stdout to StringIO to allow pandas to read it as a file 36 | 37 | df = pd.read_csv(StringIO(stdout), delimiter='\t', 38 | names=['relevant_pages', 'downloaded_pages', 'timestamp']) 39 | df['harvest_rate'] = df['relevant_pages'] / df['downloaded_pages'] 40 | df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') 41 | 42 | source = into(ColumnDataSource, df) 43 | return source 44 | 45 | def create(self): 46 | self.source =
self.update_source() 47 | 48 | p = figure(plot_width=400, plot_height=400, 49 | title="Harvest Plot", x_axis_type='datetime', 50 | tools='pan, wheel_zoom, box_zoom, reset, resize, save, hover') 51 | 52 | p.line(x="timestamp", y="relevant_pages", color=GREEN, line_width=0.2, 53 | legend="relevant", source=self.source) 54 | p.scatter(x="timestamp", y="relevant_pages", fill_alpha=0.6, 55 | color=GREEN, source=self.source) 56 | 57 | p.line(x="timestamp", y="downloaded_pages", color=DARK_GRAY, line_width=0.2, 58 | legend="downloaded", source=self.source) 59 | p.scatter(x="timestamp", y="downloaded_pages", fill_alpha=0.6, 60 | color=DARK_GRAY, source=self.source) 61 | 62 | hover = p.select(dict(type=HoverTool)) 63 | hover.tooltips = OrderedDict([ 64 | ("harvest_rate", "@harvest_rate"), 65 | ]) 66 | 67 | p.legend.orientation = "top_left" 68 | 69 | script, div = components(p) 70 | return (script, div) 71 | -------------------------------------------------------------------------------- /source/apps/crawl_space/viz/plot.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | import pandas as pd 4 | 5 | from harvest import Harvest 6 | from domain import Domain 7 | 8 | from .stream import init_plot 9 | from django.conf import settings 10 | 11 | ENABLE_STREAM_VIZ = settings.ENABLE_STREAM_VIZ 12 | 13 | class PlotsNotReadyException(Exception): 14 | pass 15 | 16 | 17 | class AcheDashboard(object): 18 | 19 | def __init__(self, crawl): 20 | self.crawl = crawl 21 | if self.crawl.crawler != "ache": 22 | raise ValueError("Crawl must be using the Ache crawler.") 23 | self.harvest = Harvest(crawl) 24 | self.domain = Domain(crawl) 25 | 26 | def get_harvest_plot(self): 27 | # TODO: Remove Pokemon exception catching 28 | try: 29 | script, div = self.harvest.create() 30 | except: 31 | return [None, None] 32 | return [script, div] 33 | 34 | def get_domain_plot(self): 35 | # TODO: Remove Pokemon exception catching 36 | try: 37 | script, div = self.domain.create() 38 | except Exception: 39 | return [None, None] 40 | return [script, div] 41 | 42 | def get_relevant_seeds(self): 43 | # Converts string to StringIO to allow pandas to read it as a file 44 | seeds = pd.read_csv(StringIO(self.domain.get_relevant_data()), 45 | delimiter='\t', header=None, 46 | names=['url', 'timestamp']) 47 | return seeds['url'].to_dict().values() 48 | 49 | def get_plots(self): 50 | harvest_plot = self.get_harvest_plot() 51 | domain_plot = self.get_domain_plot() 52 | if harvest_plot != [None, None]: 53 | return { 54 | 'scripts': [domain_plot[0], harvest_plot[0]], 55 | 'divs': [domain_plot[1], harvest_plot[1]], 56 | } 57 | else: 58 | return { 59 | 'scripts': None, 60 | 'divs': None, 61 | } 62 | 63 | 64 | class NutchDashboard(object): 65 | 66 | def __init__(self, crawl): 67 | self.crawl = crawl 68 | if self.crawl.crawler != "nutch": 69 | raise ValueError("Crawl must be using the Nutch crawler.") 70 | 71 | def get_plots(self): 72 | # TODO: For simultaneous crawl monitoring need to use unique crawl ids 73 | if ENABLE_STREAM_VIZ: 74 | script = init_plot(self.crawl.name) 75 | else: 76 | script = None 77 | return { 78 | 'scripts': [script], 79 | 'divs': [], 80 | } 81 | -------------------------------------------------------------------------------- /source/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/__init__.py 
-------------------------------------------------------------------------------- /source/base/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | from base.models import Project 4 | 5 | @admin.register(Project) 6 | class ProjectAdmin(admin.ModelAdmin): 7 | prepopulated_fields = {"slug": ("name",)} 8 | -------------------------------------------------------------------------------- /source/base/deploy_templates/docker-compose.yml.jinja2: -------------------------------------------------------------------------------- 1 | {% for container in containers %} 2 | {{ container.slug }}: 3 | {% if container.image %} 4 | image: {{ container.image }} 5 | {% else %} 6 | build: {{ container.build }} 7 | {% endif %} 8 | {% if container.command %} 9 | command: {{ container.command }} 10 | {% endif %} 11 | {% if container.volumes %} 12 | volumes: 13 | {% for volume in container.volumes %} 14 | - {{ volume.located_at }}:{{ volume.mounted_at }} 15 | {% endfor %} 16 | {% endif %} 17 | {% if container.ports %} 18 | ports: 19 | {% for port in container.ports %} 20 | - "{{ port }}" 21 | {% endfor %} 22 | {% endif %} 23 | {% if container.links %} 24 | links: 25 | {% for link in container.links %} 26 | - {{ link.name }}:{{ link.alias }} 27 | {% endfor %} 28 | {% endif %} 29 | {% if container.environment_variables %} 30 | environment: 31 | {% for var in container.environment_variables %} 32 | - {{ var.name }}={{ var.value}} 33 | {% endfor %} 34 | {% endif %} 35 | {% endfor %} 36 | -------------------------------------------------------------------------------- /source/base/deploy_templates/nginx-reverse-proxy.conf.jinja2: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | server_name {{ hostname }} {{ ip_addr }}; 4 | client_max_body_size 100M; 5 | 6 | location / { 7 | proxy_pass http://0.0.0.0:{{ root_port }}/; 8 | } 9 | 10 | location /static/ { 11 | rewrite ^/static/(.*)$ /$1 break; 12 | root {{ static_root }}/; 13 | } 14 | 15 | {% for portmap in portmaps %} 16 | location {{ portmap.urlbase }}/ { 17 | rewrite {{ portmap.urlbase }}/(.*) /$1 break; 18 | proxy_pass http://0.0.0.0:{{ portmap.port }}/; 19 | proxy_redirect off; 20 | proxy_set_header Host $host; 21 | } 22 | {% endfor %} 23 | } 24 | -------------------------------------------------------------------------------- /source/base/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/management/__init__.py -------------------------------------------------------------------------------- /source/base/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/management/commands/__init__.py -------------------------------------------------------------------------------- /source/base/management/commands/testbase.py: -------------------------------------------------------------------------------- 1 | 2 | from django.core.management.base import BaseCommand, CommandError 3 | from base.models import * #this command is a throwaway. 
4 | 5 | class Command(BaseCommand): 6 | def handle(*args, **kwargs): 7 | Container.create_containers() 8 | print Container.generate_nginx_context() 9 | print Container.map_public_ports() 10 | print(open(Container.NGINX_CONFIG_DESTINATION_PATH, 'r').read()) 11 | 12 | -------------------------------------------------------------------------------- /source/base/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | import base.models 6 | import django.core.validators 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='Index', 17 | fields=[ 18 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 19 | ('name', models.CharField(unique=True, max_length=64, validators=[django.core.validators.RegexValidator(b'^[a-zA-Z0-9-_ ]+$', b'Only numbers, letters, underscores, dashes and spaces are allowed.')])), 20 | ('slug', models.SlugField(unique=True, max_length=64)), 21 | ('uploaded_data', models.FileField(upload_to=base.models.get_zipped_data_path, validators=[django.core.validators.RegexValidator(b'.*\\.(ZIP|zip)$', b'Only compressed archive (.zip) files are allowed.')])), 22 | ('data_folder', models.TextField(blank=True)), 23 | ('status', models.CharField(default=b'', max_length=64)), 24 | ('num_files', models.IntegerField(default=0)), 25 | ], 26 | options={ 27 | }, 28 | bases=(models.Model,), 29 | ), 30 | migrations.CreateModel( 31 | name='Project', 32 | fields=[ 33 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 34 | ('name', models.CharField(unique=True, max_length=64, validators=[django.core.validators.RegexValidator(b'^[a-zA-Z0-9-_ ]+$', b'Only numbers, letters, underscores, dashes and spaces are allowed.')])), 35 | ('slug', models.SlugField(unique=True, max_length=64)), 36 | ('description', models.TextField(blank=True)), 37 | ], 38 | options={ 39 | }, 40 | bases=(models.Model,), 41 | ), 42 | migrations.CreateModel( 43 | name='SeedsList', 44 | fields=[ 45 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 46 | ('name', models.CharField(unique=True, max_length=64, validators=[django.core.validators.RegexValidator(b'^[a-zA-Z0-9-_ ]+$', b'Only numbers, letters, underscores, dashes and spaces are allowed.')])), 47 | ('slug', models.SlugField(unique=True, max_length=64)), 48 | ('seeds', models.TextField()), 49 | ], 50 | options={ 51 | }, 52 | bases=(models.Model,), 53 | ), 54 | migrations.AddField( 55 | model_name='index', 56 | name='project', 57 | field=models.ForeignKey(to='base.Project'), 58 | preserve_default=True, 59 | ), 60 | ] 61 | -------------------------------------------------------------------------------- /source/base/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/migrations/__init__.py -------------------------------------------------------------------------------- /source/base/static/base/css/bootstrap-table.min.css: -------------------------------------------------------------------------------- 1 | /* 2 | * bootstrap-table - v1.5.0 - 2014-12-12 3 | * 
https://github.com/wenzhixin/bootstrap-table 4 | * Copyright (c) 2014 zhixin wen 5 | * Licensed MIT License 6 | */ 7 | 8 | .bootstrap-table .table{margin-bottom:0!important;border-bottom:1px solid #ddd;border-collapse:collapse!important;border-radius:1px}.fixed-table-container{position:relative;clear:both;border:1px solid #ddd;border-radius:4px;-webkit-border-radius:4px;-moz-border-radius:4px}.fixed-table-header{overflow:hidden;border-radius:4px 4px 0 0;-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0}.fixed-table-body{overflow-x:auto;overflow-y:auto;height:100%}.fixed-table-container table{width:100%}.fixed-table-container thead th{height:0;padding:0;margin:0;border-left:1px solid #ddd}.fixed-table-container thead th:first-child{border-left:none;border-top-left-radius:4px;-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px}.fixed-table-container thead th .th-inner{padding:8px;line-height:24px;vertical-align:top;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.fixed-table-container thead th .sortable{cursor:pointer}.fixed-table-container tbody td{border-left:1px solid #ddd}.fixed-table-container tbody tr:first-child td{border-top:none}.fixed-table-container tbody td:first-child{border-left:none}.fixed-table-container tbody .selected td{background-color:#f5f5f5}.fixed-table-container .bs-checkbox{text-align:center}.fixed-table-container .bs-checkbox .th-inner{padding:8px 0}.fixed-table-container input[type=checkbox],.fixed-table-container input[type=radio]{margin:0 auto!important}.fixed-table-container .no-records-found{text-align:center}.fixed-table-pagination .pagination,.fixed-table-pagination .pagination-detail{margin-top:10px;margin-bottom:10px}.fixed-table-pagination .pagination a{padding:6px 12px;line-height:1.428571429}.fixed-table-pagination .pagination-info{line-height:34px;margin-right:5px}.fixed-table-pagination .btn-group{position:relative;display:inline-block;vertical-align:middle}.fixed-table-pagination .dropup .dropdown-menu{margin-bottom:0}.fixed-table-pagination .page-list{display:inline-block}.fixed-table-toolbar .columns-left{margin-right:5px}.fixed-table-toolbar .columns-right{margin-left:5px}.fixed-table-toolbar .columns label{display:block;padding:3px 20px;clear:both;font-weight:400;line-height:1.428571429}.fixed-table-toolbar .bars,.fixed-table-toolbar .columns,.fixed-table-toolbar .search{position:relative;margin-top:10px;margin-bottom:10px;line-height:34px}.fixed-table-pagination li.disabled a{pointer-events:none;cursor:default}.fixed-table-loading{display:none;position:absolute;top:42px;right:0;bottom:0;left:0;z-index:99;background-color:#fff;text-align:center}.fixed-table-body .card-view .title{font-weight:700;display:inline-block;min-width:30%;text-align:left!important}.fixed-table-body thead th .th-inner{box-sizing:border-box}.table td,.table th{vertical-align:middle;box-sizing:border-box}.fixed-table-toolbar .dropdown-menu{text-align:left;max-height:300px;overflow:auto}.fixed-table-toolbar .btn-group>.btn-group{display:inline-block;margin-left:-1px!important}.fixed-table-toolbar .btn-group>.btn-group>.btn{border-radius:0}.fixed-table-toolbar .btn-group>.btn-group:first-child>.btn{border-top-left-radius:4px;border-bottom-left-radius:4px}.fixed-table-toolbar .btn-group>.btn-group:last-child>.btn{border-top-right-radius:4px;border-bottom-right-radius:4px}.bootstrap-table .table>thead>tr>th{vertical-align:bottom;border-bottom:2px solid #ddd}.bootstrap-table .table thead>tr>th{padding:0;margin:0}.pull-right 
.dropdown-menu{right:0;left:auto}p.fixed-table-scroll-inner{width:100%;height:200px}div.fixed-table-scroll-outer{top:0;left:0;visibility:hidden;width:200px;height:150px;overflow:hidden} -------------------------------------------------------------------------------- /source/base/static/base/css/forms.css: -------------------------------------------------------------------------------- 1 | /* line 1, ../scss/forms.scss */ 2 | * { 3 | margin: 0; 4 | padding: 0; 5 | } 6 | 7 | /* line 6, ../scss/forms.scss */ 8 | form .form-group { 9 | margin: 0px; 10 | padding: 12px 12px; 11 | } 12 | /* line 10, ../scss/forms.scss */ 13 | form .input-greyed-out { 14 | background-color: #f5f5f5; 15 | border-radius: 8px; 16 | } 17 | /* line 14, ../scss/forms.scss */ 18 | form .input-available { 19 | background-color: #ffffff; 20 | border-radius: 0px; 21 | } 22 | 23 | /* line 20, ../scss/forms.scss */ 24 | .list-group-item form { 25 | display: inline; 26 | } 27 | /* line 22, ../scss/forms.scss */ 28 | .list-group-item form button { 29 | display: block; 30 | border: 0; 31 | background-color: #ffffff; 32 | float: right; 33 | color: #428BCA; 34 | font-size: 24px; 35 | } 36 | /* line 29, ../scss/forms.scss */ 37 | .list-group-item form button:hover { 38 | color: #2A6496; 39 | } 40 | -------------------------------------------------------------------------------- /source/base/static/base/css/mixins.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/css/mixins.css -------------------------------------------------------------------------------- /source/base/static/base/css/update.css: -------------------------------------------------------------------------------- 1 | /* line 1, ../scss/update.scss */ 2 | .asteriskField { 3 | display: none; 4 | } 5 | -------------------------------------------------------------------------------- /source/base/static/base/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/favicon.ico -------------------------------------------------------------------------------- /source/base/static/base/fonts/FontAwesome.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/FontAwesome.otf -------------------------------------------------------------------------------- /source/base/static/base/fonts/Lato-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/Lato-Regular.ttf -------------------------------------------------------------------------------- /source/base/static/base/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /source/base/static/base/fonts/fontawesome-webfont.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /source/base/static/base/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /source/base/static/base/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /source/base/static/base/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /source/base/static/base/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /source/base/static/base/images/continuum-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/continuum-logo.png -------------------------------------------------------------------------------- /source/base/static/base/images/continuum_analytics_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/continuum_analytics_logo.png -------------------------------------------------------------------------------- /source/base/static/base/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/favicon.ico -------------------------------------------------------------------------------- /source/base/static/base/images/kitware.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/kitware.png -------------------------------------------------------------------------------- /source/base/static/base/images/nasa_jpl_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/nasa_jpl_logo.png -------------------------------------------------------------------------------- /source/base/static/base/images/nyu-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/images/nyu-logo.png -------------------------------------------------------------------------------- /source/base/static/base/imgs/spritemap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/imgs/spritemap.png -------------------------------------------------------------------------------- /source/base/static/base/imgs/spritemap@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/base/static/base/imgs/spritemap@2x.png -------------------------------------------------------------------------------- /source/base/static/base/js/ajax.js: -------------------------------------------------------------------------------- 1 | // This code is copied from "https://docs.djangoproject.com/en/1.7/ref/contrib/csrf/" 2 | 3 | $( document ).ready(function() { 4 | 5 | function getCookie(name) { 6 | var cookieValue = null; 7 | if (document.cookie && document.cookie != '') { 8 | var cookies = document.cookie.split(';'); 9 | for (var i = 0; i < cookies.length; i++) { 10 | var cookie = jQuery.trim(cookies[i]); 11 | // Does this cookie string begin with the name we want? 12 | if (cookie.substring(0, name.length + 1) == (name + '=')) { 13 | cookieValue = decodeURIComponent(cookie.substring(name.length + 1)); 14 | break; 15 | } 16 | } 17 | } 18 | return cookieValue; 19 | } 20 | 21 | var csrftoken = getCookie('csrftoken'); 22 | 23 | 24 | function csrfSafeMethod(method) { 25 | // these HTTP methods do not require CSRF protection 26 | return (/^(GET|HEAD|OPTIONS|TRACE)$/.test(method)); 27 | } 28 | 29 | function sameOrigin(url) { 30 | // test that a given url is a same-origin URL 31 | // url could be relative or scheme relative or absolute 32 | var host = document.location.host; // host + port 33 | var protocol = document.location.protocol; 34 | var sr_origin = '//' + host; 35 | var origin = protocol + sr_origin; 36 | // Allow absolute or scheme relative URLs to same origin 37 | return (url == origin || url.slice(0, origin.length + 1) == origin + '/') || 38 | (url == sr_origin || url.slice(0, sr_origin.length + 1) == sr_origin + '/') || 39 | // or any other URL that isn't scheme relative or absolute i.e relative. 40 | !(/^(\/\/|http:|https:).*/.test(url)); 41 | } 42 | 43 | $.ajaxSetup({ 44 | beforeSend: function(xhr, settings) { 45 | if (!csrfSafeMethod(settings.type) && sameOrigin(settings.url)) { 46 | // Send the token to same-origin, relative URLs only. 
47 | // Send the token only if the method warrants CSRF protection 48 | // Using the CSRFToken value acquired earlier 49 | xhr.setRequestHeader("X-CSRFToken", csrftoken); 50 | } 51 | } 52 | }); 53 | 54 | }); 55 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/base_views.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | exports.FormView = Backbone.View.extend({ 4 | modal: "", 5 | form: "", 6 | formFields: [], 7 | clearErrors: function(fields, form){ 8 | // Clear errors from the form after it has been successfully submitted. 9 | var that = this; 10 | _.each(fields, function(field){ 11 | $(form).find("#div_id_" + field).removeClass("has-error"); 12 | $(form).find("#error_id_" + field).attr("hidden", true).html("") 13 | }); 14 | }, 15 | showFormErrors: function(errors, form){ 16 | // Take the JSON response from the server containing the errors, parse them, 17 | // and show them on the form. 18 | var that = this; 19 | var errorsArray = Object.keys(errors); 20 | _.each(errorsArray, function(field){ 21 | $(form).find("#div_id_" + field).addClass("has-error"); 22 | $(form).find("#error_id_" + field).attr("hidden", false).html(errors[field][0]); 23 | }); 24 | }, 25 | formSuccess: function(modal, form){ 26 | // If the submit was successful, hide the modal and reset the form. 27 | $(modal).modal('hide'); 28 | $(form)[0].reset(); 29 | }, 30 | toFormData: function(form){ 31 | // Convert the contents of the form to FormData, to allow for file uploads. 32 | var objects = new FormData(); 33 | var formData = $(form).serializeArray(); 34 | _.each(formData, function(formObject){ 35 | objects[formObject.name] = formObject.value; 36 | objects.append(formObject.name, formObject.value) 37 | }); 38 | return objects; 39 | }, 40 | toJson: function(form){ 41 | // Convert the contents of the form to JSON with key:value pairs, where the 42 | // key is the name of the field and the value is the value of the field. 43 | var objects = {}; 44 | var formData = $(form).serializeArray(); 45 | _.each(formData, function(formObject){ 46 | objects[formObject.name] = formObject.value; 47 | }); 48 | return objects; 49 | }, 50 | }); 51 | 52 | 53 | // View for rendering each item in a collection to a new view. Must define 54 | // which modelView to use to render. 55 | exports.CollectionView = Backbone.View.extend({ 56 | modelView: "", 57 | initialize: function(collection){ 58 | this.collection = collection; 59 | var that = this; 60 | this.collection.fetch({ 61 | success: function(){ 62 | that.render(); 63 | }, 64 | }); 65 | }, 66 | render: function(){ 67 | // Render each model in collection into a separate backbone view, 68 | // with one model per view. 
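// (Each modelView is expected to attach itself to its own `el` and call
// render() from its initialize, so constructing it is enough; the view
// instances are intentionally not retained.)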
69 | var that = this; 70 | this.collection.each(function(model){ 71 | var singleView = new that.modelView(model); 72 | }); 73 | }, 74 | }); 75 | 76 | })(this.BaseViews = {}); 77 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/edit_seeds_backbone.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | var EditSeedsView = Backbone.View.extend({ 4 | el: "#seeds", 5 | form: "#editSeedsForm", 6 | invalidLines: [], 7 | template: _.template($("#editSeedsTemplate").html()), 8 | initialize: function(model){ 9 | this.model = model; 10 | var that = this; 11 | this.model.set({id: $("#seeds_pk").val()}).fetch({ 12 | success: function(){ 13 | that.render(); 14 | that.setEditor(); 15 | } 16 | }); 17 | }, 18 | render: function(){ 19 | this.$el.append(this.template(this.model.toJSON())); 20 | }, 21 | setEditor: function(){ 22 | this.editor = CodeMirror.fromTextArea(document.getElementById("id_seeds"), { 23 | lineNumbers: true 24 | }); 25 | this.editor.setSize("100%", 1000); 26 | this.loadSeeds(); 27 | }, 28 | editSeeds: function(event){ 29 | var that = this; 30 | event.preventDefault(); 31 | var save = confirm("Save this seeds list?"); 32 | if(save == true){ 33 | var newSeeds = JSON.stringify($("#id_seeds").val().replace("\r", "").split("\n")) 34 | this.model.set("seeds", newSeeds); 35 | this.model.save({}, { 36 | beforeSend: function(){ 37 | that.clearErrors(); 38 | $("#seedsSuccess").hide(); 39 | $("#seedsFailure").hide(); 40 | }, 41 | success: function(response){ 42 | $("#seedsSuccess").show(); 43 | }, 44 | error: function(model, xhr, thrownError){ 45 | that.showLineErrors(xhr.responseJSON); 46 | }, 47 | }); 48 | } 49 | }, 50 | showLineErrors: function(errors){ 51 | this.errors = errors["seeds"]; 52 | var that = this; 53 | _.each(this.errors, function(seed){ 54 | // Skip the initial error message. 
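// (Assumed error payload, inferred from the handling below:
//   {"seeds": ["<summary>", {"<line number>": "<message>"}, ..., "<summary>"]}
// i.e. the first and last entries are summary strings, and each middle
// entry maps an editor line number to its validation error.)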
55 | if((that.errors.indexOf(seed) == 0) || (that.errors.indexOf(seed) == that.errors.length - 1)){ 56 | return; 57 | } 58 | line = that.editor.getLineHandle(Object.keys(seed)); 59 | that.invalidLines.push(line); 60 | that.editor.doc.addLineClass(line, 'background', 'line-error'); 61 | }); 62 | $("#seedsFailure").show() 63 | }, 64 | clearErrors: function(){ 65 | var that = this; 66 | _.each(this.invalidLines, function(line){ 67 | that.editor.doc.removeLineClass(line, 'background', 'line-error'); 68 | }); 69 | this.invalidLines = [] 70 | }, 71 | loadSeeds: function(){ 72 | this.editor.setValue(this.model.toJSON().file_string); 73 | }, 74 | events: { 75 | "submit #editSeedsForm": "editSeeds", 76 | "click #reset": "loadSeeds", 77 | }, 78 | }); 79 | 80 | 81 | var EditSeedsRouter = Backbone.Router.extend({ 82 | routes: { 83 | "": "index", 84 | }, 85 | index: function(){ 86 | var seedsList = new Seeds.Seeds(); 87 | var seedsView = new EditSeedsView(seedsList); 88 | }, 89 | }); 90 | 91 | $(document).ready(function(){ 92 | var appRouter = new EditSeedsRouter(); 93 | Backbone.history.start(); 94 | }); 95 | 96 | })(this.EditSeeds = {}); 97 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/project_router.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | var ProjectRouter = Backbone.Router.extend({ 4 | routes: { 5 | "": "index", 6 | }, 7 | index: function(){ 8 | var project = $("#project_id").val(); 9 | var modelCollection = new CrawlModels.CrawlModelCollection(); 10 | var crawlCollection = new Crawls.CrawlCollection(); 11 | var seedsCollection = new Seeds.SeedsCollection(); 12 | crawlCollection.fetch({ 13 | url: crawlCollection.url += "?project=" + project, 14 | success: function(){ 15 | var crawlCollectionView = new Crawls.CrawlCollectionView(crawlCollection); 16 | var crawlFormView = new Crawls.AddCrawlView(crawlCollection, crawlCollectionView); 17 | }, 18 | complete: function(){ 19 | modelCollection.fetch({ 20 | url: modelCollection.url += "?project=" + project, 21 | success: function(){ 22 | var modelCollectionView = new CrawlModels.CrawlModelCollectionView(modelCollection); 23 | var addModelView = new CrawlModels.AddCrawlModelView(modelCollection, modelCollectionView); 24 | }, 25 | }); 26 | seedsCollection.fetch({ 27 | url: seedsCollection.url, 28 | success: function(){ 29 | var addModelView = new AddSeeds.AddSeedsProjectPage(seedsCollection); 30 | }, 31 | }); 32 | }, 33 | }); 34 | }, 35 | }); 36 | 37 | 38 | $(document).ready(function(){ 39 | var appRouter = new ProjectRouter(); 40 | Backbone.history.start(); 41 | }); 42 | 43 | })(this.projectRouter = {}); 44 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/project_seeds.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | exports.SeedsFormItem = Backbone.View.extend({ 4 | el: "#id_seeds_object", 5 | template: _.template($("#crawlFormSeeds").html()), 6 | initialize: function(model){ 7 | this.model = model; 8 | _.bindAll(this, 'render'); 9 | this.render(); 10 | }, 11 | render: function(){ 12 | this.$el.append(this.template(this.model.toJSON())); 13 | }, 14 | }); 15 | 16 | 17 | exports.AddSeedsProjectPage = Seeds.AddSeedsView.extend({ 18 | render: function(){ 19 | var that = this; 20 | this.$el.html(this.template()); 21 | if (this.collection.models.length){ 22 | 
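// Render an entry for each existing seeds list into the crawl form's
// seeds selector (#id_seeds_object), via the #crawlFormSeeds template.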
this.collection.each(function(model){
23 | var singleView = new exports.SeedsFormItem(model);
24 | });
25 | }
26 | $(this.modal).on('hidden.bs.modal', function(event){
27 | if ($(that.modal).attr("aria-hidden") == "true"){
28 | $("body").addClass("modal-open");
29 | }
30 | });
31 | },
32 | addSeeds: function(event){
33 | var that = this;
34 | event.preventDefault();
35 | var formObjects = this.toFormData(this.form);
36 | // Attach the contents of the file to the FormData object.
37 | var file = $(this.filesField)[0].files[0];
38 | if (typeof file != 'undefined'){
39 | formObjects.append("seeds", file, file.name);
40 | }
41 | var newSeeds = new Seeds.Seeds(formObjects);
42 | this.collection.add(newSeeds);
43 | // If model.save() is successful, clear the errors and the form, and hide
44 | // the modal. If model.save() had errors, show each error on form field,
45 | // along with the content of the error.
46 | newSeeds.save({}, {
47 | data: formObjects,
48 | contentType: false,
49 | beforeSend: function(){
50 | that.clearLineErrors();
51 | },
52 | success: function(response){
53 | var newSeeds = new exports.SeedsFormItem(
54 | that.collection.models[that.collection.models.length - 1]
55 | );
56 | that.formSuccess(that.modal, that.form);
57 | that.clearErrors(that.formFields, that.form);
58 | that.clearLineErrors();
59 | },
60 | error: function(model, xhr, thrownError){
61 | that.showFormErrors(xhr.responseJSON, that.form);
62 | that.showLineErrors(xhr.responseJSON, that.form);
63 | },
64 | });
65 | },
66 | });
67 | 
68 | })(this.AddSeeds = {});
69 | 
--------------------------------------------------------------------------------
/source/base/static/base/js/backbone/projects.js:
--------------------------------------------------------------------------------
1 | (function(exports){
2 | 
3 | var Project = Backbone.Model.extend({
4 | urlRoot: "/api/projects/",
5 | defaults: {
6 | name: "",
7 | description: "",
8 | url: "",
9 | slug: "",
10 | },
11 | });
12 | 
13 | 
14 | var ProjectCollection = Backbone.Collection.extend({
15 | url: "/api/projects/",
16 | model: Project,
17 | });
18 | 
19 | 
20 | var ProjectView = Backbone.View.extend({
21 | el: "#projects",
22 | template: _.template($("#indexProjectItem").html()),
23 | initialize: function(model){
24 | this.model = model;
25 | _.bindAll(this, 'render');
26 | var that = this;
27 | this.render();
28 | },
29 | render: function(){
30 | this.$el.append(this.template(this.model.toJSON()));
31 | },
32 | });
33 | 
34 | 
35 | var AddProjectView = BaseViews.FormView.extend({
36 | el: "#addProjectContainer",
37 | modal: "#newProjectModal",
38 | form: "#addProjectForm",
39 | formFields: ["name"],
40 | template: _.template($("#addProjectTemplate").html()),
41 | initialize: function(collection){
42 | this.collection = collection;
43 | this.render();
44 | },
45 | render: function(){
46 | this.$el.html(this.template());
47 | },
48 | addProject: function(event){
49 | var that = this;
50 | event.preventDefault();
51 | var formObjects = this.toJson(this.form);
52 | var newProject = new Project(formObjects);
53 | this.collection.add(newProject);
54 | // If model.save() is successful, clear the errors and the form, and hide
55 | // the modal. If model.save() had errors, show each error on form field,
56 | // along with the content of the error.
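// (The request body is plain JSON built by toJson(), e.g.
// {"name": "My Project", "description": "..."}; the slug is generated
// server-side and is read-only, as exercised by test_change_slug_fails
// in base/tests/test_rest_project.py.)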
57 | newProject.save({}, { 58 | success: function(response){ 59 | var newProject = new ProjectView( 60 | that.collection.models[that.collection.models.length - 1] 61 | ); 62 | that.formSuccess(that.modal, that.form); 63 | that.clearErrors(that.formFields, that.form); 64 | }, 65 | error: function(model, xhr, thrownError){ 66 | that.showFormErrors(xhr.responseJSON, that.form); 67 | }, 68 | }); 69 | }, 70 | events: { 71 | "submit #addProjectForm": "addProject", 72 | }, 73 | }); 74 | 75 | 76 | var ProjectCollectionView = BaseViews.CollectionView.extend({ 77 | modelView: ProjectView, 78 | }); 79 | 80 | 81 | var IndexRouter = Backbone.Router.extend({ 82 | routes: { 83 | "": "index", 84 | }, 85 | index: function(){ 86 | var projectCollection = new ProjectCollection(); 87 | var collectionView = new ProjectCollectionView(projectCollection); 88 | var addProjectView = new AddProjectView(projectCollection); 89 | }, 90 | }); 91 | 92 | 93 | $(document).ready(function(){ 94 | var appRouter = new IndexRouter(); 95 | Backbone.history.start(); 96 | }); 97 | 98 | })(this.Projects = {}); 99 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/seeds_router.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | var SeedsRouter = Backbone.Router.extend({ 4 | routes: { 5 | "": "index", 6 | }, 7 | index: function(){ 8 | seedsCollection = new Seeds.SeedsCollection(); 9 | seedsCollectionView = new Seeds.SeedsCollectionView(seedsCollection); 10 | addSeedsView = new Seeds.AddSeedsView(seedsCollection); 11 | trailsCollection = new Trails.TrailsCollection(); 12 | trailsCollectionView = new Trails.TrailsCollectionView(trailsCollection); 13 | trailsFormView = new Trails.TrailFormView(trailsCollection); 14 | }, 15 | }); 16 | 17 | 18 | $(document).ready(function(){ 19 | var appRouter = new SeedsRouter(); 20 | Backbone.history.start(); 21 | }); 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /source/base/static/base/js/backbone/trails_backbone.js: -------------------------------------------------------------------------------- 1 | (function(exports){ 2 | 3 | // Module-wide variables 4 | var module = {}; 5 | 6 | 7 | exports.Trail = Backbone.Model.extend({ 8 | urlRoot: "/api/datawake", 9 | defaults: { 10 | trail_id: 0, 11 | domain_name: "", 12 | urls: [], 13 | urls_string: "", 14 | }, 15 | }); 16 | 17 | 18 | exports.TrailsCollection = Backbone.Collection.extend({ 19 | url: "/api/datawake", 20 | model: exports.Trail, 21 | }); 22 | 23 | 24 | exports.TrailView = Backbone.View.extend({ 25 | el: "#trails", 26 | template: _.template($("#trailItem").html()), 27 | initialize: function(model){ 28 | var that = this; 29 | this.model = model; 30 | this.trailId = this.model.toJSON()["trail_id"]; 31 | this.render(); 32 | this.events = {}; 33 | this.events["click #trail_" + this.trailId] = "setForm"; 34 | }, 35 | render: function(){ 36 | this.$el.append(this.template(this.model.toJSON())); 37 | }, 38 | setForm: function(){ 39 | $("#id_trailseeds").val(this.model.toJSON()["urls_string"]); 40 | } 41 | }); 42 | 43 | 44 | exports.TrailFormView = BaseViews.FormView.extend({ 45 | el: "#trailSeedsCreation", 46 | modal: "#trailModal", 47 | form: "#trailForm", 48 | formFields: [ 49 | "seedsname", 50 | ], 51 | template: _.template($("#trailFormBody").html()), 52 | initialize: function(collection){ 53 | this.render(); 54 | }, 55 | render: function(){ 56 | 
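// (The collection handed in by the seeds router is unused here; this view
// only injects the #trailFormBody underscore template.)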
this.$el.html(this.template());
57 | },
58 | createSeeds: function(event){
59 | var that = this;
60 | event.preventDefault();
61 | // Convert formdata to fields necessary for creating a seeds list.
62 | var formObjects = this.toFormData(this.form);
63 | formObjects["name"] = formObjects["seedsname"];
64 | formObjects["textseeds"] = $("#id_trailseeds").val();
65 | delete formObjects["seedsname"];
66 | var newSeeds = new Seeds.Seeds(formObjects);
67 | newSeeds.save({}, {
68 | success: function(response){
69 | that.formSuccess(that.modal, that.form);
70 | that.clearErrors(that.formFields, that.form);
71 | var newSeedsView = new Seeds.SeedsView(response);
72 | },
73 | error: function(model, xhr, thrownError){
74 | // Convert the name of the error to seedsname, to avoid conflict with
75 | // the add seeds form.
76 | var response = xhr.responseJSON;
77 | response["seedsname"] = response["name"];
78 | delete response["name"];
79 | that.showFormErrors(response, that.form);
80 | },
81 | });
82 | },
83 | events: {
84 | "submit #trailForm": "createSeeds",
85 | },
86 | });
87 | 
88 | 
89 | exports.TrailsCollectionView = BaseViews.CollectionView.extend({
90 | el: "#trailHeader",
91 | template: _.template($("#trailHead").html()),
92 | modelView: exports.TrailView,
93 | initialize: function(collection){
94 | this.collection = collection;
95 | var that = this;
96 | this.collection.fetch({
97 | success: function(){
98 | that.render();
99 | },
100 | });
101 | },
102 | render: function(){
103 | // Render each model in collection into a separate backbone view,
104 | // with one model per view.
105 | var that = this;
106 | if(this.collection.length){
107 | this.$el.append(this.template());
108 | this.collection.each(function(model){
109 | var singleView = new that.modelView(model);
110 | });
111 | }
112 | },
113 | });
114 | 
115 | })(this.Trails = {});
116 | 
--------------------------------------------------------------------------------
/source/base/static/base/js/delete_index.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function(){
2 | (function(){
3 | 
4 | var uploadAjax = document.getElementById("upload_data");
5 | var csrf_token = uploadAjax.csrfmiddlewaretoken.value;
6 | 
7 | function deleteIndex(){
8 | return $.ajax({
9 | type: "POST",
10 | url: window.location.href + "delete/",
11 | data: {csrfmiddlewaretoken: csrf_token},
12 | success: function(){
13 | window.close();
14 | },
15 | error: function(){
16 | return "error";
17 | }
18 | });
19 | }
20 | 
21 | $('#deleteIndexButton').click(function(){
22 | swal({
23 | title: "Are you sure?",
24 | text: "This will delete the index and all files associated with the index.",
25 | type: "warning",
26 | showCancelButton: true,
27 | confirmButtonColor: '#DD6B55',
28 | confirmButtonText: 'Yes, delete it!',
29 | cancelButtonText: "No, cancel!",
30 | closeOnConfirm: false,
31 | closeOnCancel: false
32 | },
33 | function(isConfirm){
34 | if (isConfirm){
35 | deleteIndex();
36 | } else {
37 | swal("Cancelled", "You cancelled the delete process", "error");
38 | }
39 | })
40 | });
41 | 
42 | 
43 | })();
44 | });
45 | 
--------------------------------------------------------------------------------
/source/base/static/base/js/ie-emulation-modes-warning.js:
--------------------------------------------------------------------------------
1 | // NOTICE!! DO NOT USE ANY OF THIS JAVASCRIPT
2 | // IT'S JUST JUNK FOR OUR DOCS!
3 | // ++++++++++++++++++++++++++++++++++++++++++
4 | /*!
5 | * Copyright 2014 Twitter, Inc.
6 | * 7 | * Licensed under the Creative Commons Attribution 3.0 Unported License. For 8 | * details, see http://creativecommons.org/licenses/by/3.0/. 9 | */ 10 | // Intended to prevent false-positive bug reports about Bootstrap not working properly in old versions of IE due to folks testing using IE's unreliable emulation modes. 11 | (function () { 12 | 'use strict'; 13 | 14 | function emulatedIEMajorVersion() { 15 | var groups = /MSIE ([0-9.]+)/.exec(window.navigator.userAgent) 16 | if (groups === null) { 17 | return null 18 | } 19 | var ieVersionNum = parseInt(groups[1], 10) 20 | var ieMajorVersion = Math.floor(ieVersionNum) 21 | return ieMajorVersion 22 | } 23 | 24 | function actualNonEmulatedIEMajorVersion() { 25 | // Detects the actual version of IE in use, even if it's in an older-IE emulation mode. 26 | // IE JavaScript conditional compilation docs: http://msdn.microsoft.com/en-us/library/ie/121hztk3(v=vs.94).aspx 27 | // @cc_on docs: http://msdn.microsoft.com/en-us/library/ie/8ka90k2e(v=vs.94).aspx 28 | var jscriptVersion = new Function('/*@cc_on return @_jscript_version; @*/')() // jshint ignore:line 29 | if (jscriptVersion === undefined) { 30 | return 11 // IE11+ not in emulation mode 31 | } 32 | if (jscriptVersion < 9) { 33 | return 8 // IE8 (or lower; haven't tested on IE<8) 34 | } 35 | return jscriptVersion // IE9 or IE10 in any mode, or IE11 in non-IE11 mode 36 | } 37 | 38 | var ua = window.navigator.userAgent 39 | if (ua.indexOf('Opera') > -1 || ua.indexOf('Presto') > -1) { 40 | return // Opera, which might pretend to be IE 41 | } 42 | var emulated = emulatedIEMajorVersion() 43 | if (emulated === null) { 44 | return // Not IE 45 | } 46 | var nonEmulated = actualNonEmulatedIEMajorVersion() 47 | 48 | if (emulated !== nonEmulated) { 49 | window.alert('WARNING: You appear to be using IE' + nonEmulated + ' in IE' + emulated + ' emulation mode.\nIE emulation modes can behave significantly differently from ACTUAL older versions of IE.\nPLEASE DON\'T FILE BOOTSTRAP BUGS based on testing in IE emulation modes!') 50 | } 51 | })(); 52 | -------------------------------------------------------------------------------- /source/base/static/base/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014 Twitter, Inc. 4 | * Licensed under the Creative Commons Attribution 3.0 Unported License. For 5 | * details, see http://creativecommons.org/licenses/by/3.0/. 
6 | */
7 | 
8 | // See the Getting Started docs for more information:
9 | // http://getbootstrap.com/getting-started/#support-ie10-width
10 | 
11 | (function () {
12 | 'use strict';
13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) {
14 | var msViewportStyle = document.createElement('style')
15 | msViewportStyle.appendChild(
16 | document.createTextNode(
17 | '@-ms-viewport{width:auto!important}'
18 | )
19 | )
20 | document.querySelector('head').appendChild(msViewportStyle)
21 | }
22 | })();
23 | 
--------------------------------------------------------------------------------
/source/base/static/base/js/project.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function(){
2 | (function(){
3 | 
4 | function updateStatus(itemsObject){
5 | statuses = Object.keys(itemsObject);
6 | for(var i=0; i" +
27 | "" + task_id + "");
28 | $("#task-id").text(task_id);
29 | $("#raw-response").text(JSON.stringify(response, null, 2));
30 | setTimeout(get_results, 1000);
31 | }
32 | else
33 | {
34 | $("#response").html("Something failed! Hopefully something in the response is useful." + response['result']['error']);
35 | $("#raw-response").html('' + JSON.stringify(response, null, 2) + '');
36 | }
37 | },
38 | error: function(response){
39 | $("#response").html("Something failed! Hopefully something in the response is useful.");
40 | $("#raw-response").html('' + JSON.stringify(response, null, 2) + '');
41 | }
42 | });
43 | }
44 | 
45 | function get_results(){
46 | return $.ajax({
47 | type: "POST",
48 | data: {
49 | "action": "progress",
50 | "task-id": $("#task-id").text(),
51 | "csrfmiddlewaretoken" : $("input[name=csrfmiddlewaretoken]").val()
52 | },
53 | success: function(response) {
54 | $("#raw-response").text(JSON.stringify(response['result'], null, 2));
55 | if ((response['result']['status'] != 'Finished') && (!response['result']['error']))
56 | setTimeout(get_results, 1000);
57 | else
58 | $("#response").html(
59 | response['pvalue_plot']['script'] + response['pvalue_plot']['div'] +
60 | response['count_plot']['script'] + response['count_plot']['div']);
61 | },
62 | error: function(response){
63 | $("#response").html("Something failed! Hopefully something in the response is useful.")
64 | $("#raw-response").html('' + JSON.stringify(response, null, 2) + '');
65 | }
66 | });
67 | }
68 | 
69 | $(document).ready(function(){
70 | $("#run-detector").click(function(){
71 | submit_query();
72 | });
73 | });
74 | 
75 | })();
76 | 
77 | $(function() {
78 | $.datepicker.setDefaults({
79 | dateFormat: 'yy-mm-dd'
80 | });
81 | });
82 | 
83 | $(function() {
84 | $("#analysis-start-date").datepicker();
85 | });
86 | 
87 | $(function() {
88 | $("#analysis-end-date").datepicker();
89 | });
90 | 
--------------------------------------------------------------------------------
/source/base/templates/base/add_project.html:
--------------------------------------------------------------------------------
1 | {% extends "base/base.html" %}
2 | {% load crispy_forms_tags %}
3 | 
4 | {% block title %}Add Project{% endblock %}
5 | 
6 | {% block content %}
7 | 
8 | 
9 |
10 | 13 |
14 |
15 |
16 |

Add Project

17 | {% crispy form %} 18 |
19 |
20 |
21 | 22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /source/base/templates/base/edit_seeds.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | 3 | {% block title %}{{seedslist.name}}{% endblock %} 4 | 5 | {% load static %} 6 | 7 | {% block extrahead %} 8 | 9 | {% endblock %} 10 | 11 | {% block extrafooter %} 12 | 13 | 14 | 15 | 16 | 17 | {% endblock %} 18 | 19 | {% block content %} 20 |
21 | 22 |
23 | 26 |
27 |
28 | 32 |
33 |
34 | 38 |
39 |
40 | 41 |
42 |
43 | 46 | 47 |
48 | {% include "base/underscore_templates/underscore_templates.html" %} 49 | {% include "base/underscore_templates/seeds_underscore.html" %} 50 | {% include "base/underscore_templates/crawls_underscore.html" %} 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /source/base/templates/base/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% load static %} 5 | 6 | {% block extrafooter %} 7 | 8 | 9 | {% endblock %} 10 | 11 | {% block content %} 12 |
13 |
14 |
15 |

MY PROJECTS

16 |
17 | 23 |
24 | 25 | 44 | 45 | 56 |
57 | 76 | {% include "base/underscore_templates/underscore_templates.html" %} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /source/base/templates/base/indices.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load static %} 3 | 4 | {% block title %}{{project.name}}: Datasets{% endblock %} 5 | {% block content %} 6 | 7 | {% include "base/sidebar.html" %} 8 | 9 |
10 | 11 |
12 |

Datasets 13 | 17 | Kibana 18 | 19 |

20 |
21 | {% for index in project.index_set.all %} 22 |
23 |

{{ index.name }}

24 | 25 | 30 |
31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
Number of Files
{{ index.num_files }}
39 |
40 |
41 | 42 | 43 | 44 | {% empty %} 45 |
46 |

There are no indices associated with this project.

47 |
48 | 49 | 50 | 51 | {% endfor %} 52 |
53 |
54 | Add Dataset 55 |
56 |
57 |
58 | {% endblock %} 59 | -------------------------------------------------------------------------------- /source/base/templates/base/project_update_form.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% block title %}Edit Project{% endblock %} 5 | 6 | {% block content %} 7 | 8 |
9 |
10 |
11 |

Edit Project

12 |
13 |
14 |
23 | {% csrf_token %} 24 | 31 |
32 |
33 |
34 |
35 |
36 | {% crispy form %} 37 |
38 |
39 |
40 | {% endblock %} 41 | 42 | {% load staticfiles %} 43 | 44 | {% block extrahead %} 45 | 46 | {% endblock %} 47 | -------------------------------------------------------------------------------- /source/base/templates/base/seeds_list.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | {% load crispy_forms_tags %} 3 | 4 | {% load static %} 5 | 6 | {% block extrahead %} 7 | 8 | {% endblock %} 9 | 10 | {% block extrafooter %} 11 | 12 | 13 | 14 | 15 | 16 | {% endblock %} 17 | 18 | {% block content %} 19 |
20 |
21 |
22 |

Seeds

23 |
24 | 30 |
31 | 32 | 51 |
52 |
53 | 54 | 65 |
66 | 85 | {% include "base/underscore_templates/seeds_underscore.html" %} 86 | {% endblock %} 87 | -------------------------------------------------------------------------------- /source/base/templates/base/sidebar.html: -------------------------------------------------------------------------------- 1 | 41 | 42 | -------------------------------------------------------------------------------- /source/base/templates/base/tad.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | 3 | {% block title %}TAD{% endblock %} 4 | 5 | {% load static %} 6 | 7 | {% block extrafooter %} 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {% endblock %} 16 | 17 | {% block content %} 18 | {% include "base/sidebar.html" %} 19 |
20 |
21 |

Temporal Anomaly Detector

22 | {% csrf_token %} 23 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
Analysis start date:
Analysis end date:
Target filters:
Baseline filters:
Constant baseline:
Scale baseline:
Index:
Time Field:
35 |
36 |
No query run.
37 |
38 | 39 |
40 |
41 | {% endblock %}
42 | 
--------------------------------------------------------------------------------
/source/base/tests/functional_project_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pytest
3 | 
4 | from django.test import LiveServerTestCase
5 | from selenium.webdriver.firefox.webdriver import WebDriver
6 | 
7 | @pytest.mark.slow
8 | class TestProject(LiveServerTestCase):
9 | 
10 | @classmethod
11 | def setUpClass(cls):
12 | cls.browser = WebDriver()
13 | cls.browser.implicitly_wait(1)
14 | super(TestProject, cls).setUpClass()
15 | 
16 | @classmethod
17 | def tearDownClass(cls):
18 | cls.browser.quit()
19 | super(TestProject, cls).tearDownClass()
20 | 
21 | def test_add_project(self):
22 | 
23 | # Short alias for the browser driver.
24 | ff = self.browser
25 | 
26 | ff.get(self.live_server_url)
27 | assert ff.title == 'Memex Explorer'
28 | 
29 | # Click on "New Project".
30 | new_project = ff.find_element_by_link_text("New Project")
31 | new_project.click()
32 | 
33 | # Click "Submit" on an empty form.
34 | submit = ff.find_element_by_id('submit-id-submit')
35 | submit.click()
36 | 
37 | # Verify that errors appear as expected
38 | name_error = ff.find_element_by_id("error_1_id_name")
39 | assert name_error.text == 'This field is required.'
40 | 
41 | # Fill out the form and submit
42 | name = ff.find_element_by_id("id_name")
43 | name.send_keys("Test name")
44 | description = ff.find_element_by_id("id_description")
45 | description.send_keys("Test description")
46 | submit = ff.find_element_by_id('submit-id-submit')
47 | submit.click()
48 | 
49 | # Verify that we are redirected to the project page
50 | assert ff.current_url == self.live_server_url + '/projects/test-name/'
51 | 
52 | 
53 | 
--------------------------------------------------------------------------------
/source/base/tests/test_rest_project.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | 
3 | import os
4 | import pytest
5 | import requests
6 | import json
7 | 
8 | from django.conf import settings
9 | from django.core.urlresolvers import reverse
10 | from django.test import TestCase
11 | from django.db import IntegrityError
12 | 
13 | from rest_framework.test import APITestCase
14 | from rest_framework import status
15 | 
16 | from base.models import Project
17 | 
18 | 
19 | class TestProjectREST(APITestCase):
20 | """
21 | Testing for adding Projects through the REST framework.
22 | """
23 | @classmethod
24 | def setUpClass(cls):
25 | cls.test_project = Project(
26 | name = "RestTest",
27 | description = "Testing Rest API"
28 | )
29 | cls.test_project.save()
30 | cls.url = "/api/projects/"
31 | 
32 | def parse_response(self, response):
33 | return json.loads(response.content)[0]
34 | 
35 | def test_add_project(self):
36 | response = self.client.post(self.url, {"name":"Postresttest"}, format="json")
37 | assert json.loads(response.content)["slug"] == "postresttest"
38 | assert json.loads(response.content)["name"] == "Postresttest"
39 | 
40 | def test_add_project_no_name(self):
41 | response = self.client.post(self.url, {}, format="json")
42 | assert json.loads(response.content)["name"][0] == "This field is required."
43 | 
44 | def test_add_project_invalid_name(self):
45 | response = self.client.post(self.url, {"name":"postrest!"}, format="json")
46 | assert json.loads(response.content)["name"][0] == "Only numbers, letters, underscores, dashes and spaces are allowed."
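    # A sketch of hitting the same endpoint outside the test client,
    # assuming a dev server is running locally on port 8000 (values
    # are hypothetical):
    #
    #   import requests
    #   r = requests.post("http://localhost:8000/api/projects/",
    #                     data={"name": "My Project"})
    #   r.json()  # {"id": 1, "name": "My Project", "slug": "my-project", ...}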
47 | 48 | def test_get_all_projects(self): 49 | response = self.client.get(self.url) 50 | assert self.parse_response(response) 51 | 52 | def test_get_project_by_id(self): 53 | response = self.client.get(self.url + "?id=%s" % self.test_project.id) 54 | assert self.parse_response(response)["id"] == self.test_project.id 55 | 56 | def test_get_project_by_name(self): 57 | response = self.client.get(self.url + "?name=%s" % self.test_project.name) 58 | assert self.parse_response(response)["name"] == self.test_project.name 59 | 60 | def test_get_project_by_slug(self): 61 | response = self.client.get(self.url + "?slug=%s" % self.test_project.slug) 62 | assert self.parse_response(response)["slug"] == self.test_project.slug 63 | 64 | def test_no_project_exists(self): 65 | response = self.client.get(self.url + "?id=115") 66 | assert not response.data 67 | 68 | def test_change_name(self): 69 | response = self.client.patch(self.url + "%d/" % self.test_project.id, 70 | {'name':'newname'}, format="json") 71 | assert json.loads(response.content)["name"] == "newname" 72 | 73 | def test_change_description(self): 74 | response = self.client.patch(self.url + "%d/" % self.test_project.id, 75 | {'description':'New Description'}, format="json") 76 | assert json.loads(response.content)["description"] == "New Description" 77 | 78 | def test_change_slug_fails(self): 79 | """ 80 | Slug is read-only and cannot be changed. Assert the slug is unchanged. 81 | """ 82 | response = self.client.patch(self.url + "%d/" % self.test_project.id, 83 | {'slug':'Bad Slug'}, format="json") 84 | assert json.loads(response.content)["slug"] == "resttest" 85 | -------------------------------------------------------------------------------- /source/base/urls.py: -------------------------------------------------------------------------------- 1 | """Base url routes. 2 | 3 | This module also dynamically adds url routes to applications present in 4 | INSTALLED_APPS; see the comments below. 
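For example, if settings.EXPLORER_APPS contains 'crawl_space', the loop at
the bottom of this module mounts apps.crawl_space.urls under
/projects/<project_slug>/ in the "crawl_space" namespace.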
5 | 6 | URLs 7 | ---- 8 | 9 | / 10 | about/ 11 | add_project/ 12 | project/ 13 | 14 | """ 15 | 16 | from django.conf.urls import patterns, include, url 17 | from base import views 18 | 19 | 20 | project_slug = r'^projects/(?P[\w-]+)/' 21 | 22 | urlpatterns = patterns('', 23 | url(r'^$', views.IndexView.as_view(), name='index'), 24 | url(r'^seeds/$', views.SeedsListView.as_view(), name='seeds_list'), 25 | url(r'^seeds/(?P[\w-]+)/$', views.EditSeedsView.as_view(), name='edit_seeds'), 26 | url(r'^about/$', views.AboutView.as_view(), name='about'), 27 | url(r'^add_project/$', views.AddProjectView.as_view(), 28 | name='add_project'), 29 | url(project_slug + r'$', views.ProjectView.as_view(), 30 | name='project'), 31 | url(project_slug + r'settings/$', views.ProjectSettingsView.as_view(), 32 | name='project_settings'), 33 | url(project_slug + r'tad/$', views.TadView.as_view(), 34 | name='tad'), 35 | url(project_slug + r'settings/delete/$', views.DeleteProjectView.as_view(), 36 | name='delete_project'), 37 | url(project_slug + r'add_dataset/$', views.AddIndexView.as_view(), 38 | name='add_index'), 39 | url(project_slug + r'datasets/$', views.ListIndicesView.as_view(), 40 | name='indices'), 41 | url(project_slug + r'datasets/(?P[\w-]+)/settings/$', 42 | views.IndexSettingsView.as_view(), name='index_settings'), 43 | url(project_slug + r'datasets/(?P[\w-]+)/settings/delete/$', 44 | views.DeleteIndexView.as_view(), name='delete_index'), 45 | ) 46 | 47 | 48 | # The following lines build a url route to each application listed in 49 | # `settings.EXPLORER_APPS`, namespaced appropriately. 50 | # See 51 | 52 | from django.conf import settings 53 | 54 | inject_urls = [ 55 | url(project_slug, include('apps.%s.urls' % app, namespace=app)) 56 | for app in settings.EXPLORER_APPS 57 | ] 58 | 59 | urlpatterns += patterns('', *inject_urls) 60 | -------------------------------------------------------------------------------- /source/container_volumes/elasticsearch/data/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | path: 2 | logs: /data/log 3 | data: /data/data 4 | -------------------------------------------------------------------------------- /source/container_volumes/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda 2 | 3 | MAINTAINER Benjamin Zaitlen 4 | 5 | RUN apt-get update && \ 6 | apt-get -y upgrade && \ 7 | apt-get -y install openjdk-7-jdk curl 8 | 9 | 10 | RUN export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 | tee -a /etc/bashrc 11 | RUN echo JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 | tee -a /etc/environment 12 | 13 | RUN conda update conda conda-env 14 | 15 | RUN conda install -c memex solr --yes 16 | EXPOSE 8983 17 | 18 | CMD ["solr", "-f", "-V", "-d", "/opt/conda/solr_pkg"] 19 | -------------------------------------------------------------------------------- /source/container_volumes/solr/solr_entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | runsolr.sh 3 | sleep 100 4 | -------------------------------------------------------------------------------- /source/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "memex.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | 
execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /source/memex/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .celery import app as celery_app 4 | 5 | -------------------------------------------------------------------------------- /source/memex/celery.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import subprocess 4 | import os 5 | 6 | from celery import Celery 7 | 8 | 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'memex.settings') 10 | from django.conf import settings 11 | 12 | app = Celery('memex') 13 | app.config_from_object("django.conf:settings") 14 | app.autodiscover_tasks(lambda: settings.INSTALLED_APPS) 15 | 16 | 17 | @app.task(bind=True) 18 | def debug_task(self): 19 | print('Request: {0!r}'.format(self.request)) 20 | 21 | -------------------------------------------------------------------------------- /source/memex/local_settings.py: -------------------------------------------------------------------------------- 1 | #should store variables specific to a server. This is * imported into settings.py 2 | HOSTNAME='example.com' 3 | IP_ADDR='0.0.0.0' 4 | ROOT_MEMEX_PORT='8000' 5 | -------------------------------------------------------------------------------- /source/memex/logs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/memex/logs/__init__.py -------------------------------------------------------------------------------- /source/memex/settings_files/deploy_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for deploying memex project. 3 | 4 | For more information on this file, see 5 | https://docs.djangoproject.com/en/1.7/topics/settings/ 6 | 7 | For the full list of settings and their values, see 8 | https://docs.djangoproject.com/en/1.7/ref/settings/ 9 | """ 10 | 11 | import os 12 | import sys 13 | 14 | from common_settings import * 15 | HOSTNAME='explorer.continuum.io' 16 | 17 | # SECURITY WARNING: keep the secret key used in production secret! 
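# (This key is committed to the repository and therefore effectively public;
# a real deployment should override it, e.g.
# SECRET_KEY = os.environ['DJANGO_SECRET_KEY'].)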
18 | SECRET_KEY = '0#t((zq66&3*87djaltu-pn34%0p!*v_332f2p!$2i)w5y17f8' 19 | 20 | MEDIA_ROOT = '/home/vagrant/resources' 21 | PROJECT_PATH = os.path.join(MEDIA_ROOT, "projects") 22 | 23 | CELERYD_USER="vagrant" 24 | CELERYD_GROUP="vagrant" 25 | 26 | DEPLOYMENT = True 27 | # SECURITY - This should eventually be turned off in deployment 28 | DEBUG = True 29 | 30 | #Must match the urls given in deploy/nginx.conf 31 | EXTERNAL_APP_LOCATIONS = { 32 | 'ddt': 'http://explorer.continuum.io:8084', 33 | 'kibana': '/kibana/', 34 | } 35 | 36 | # A few more checks in deployment that services are running correctly 37 | 38 | REQUIRED_EXTERNAL_APPS = {'celery', 39 | 'elasticsearch', 40 | 'kibana', 41 | 'redis', 42 | 'tika'} 43 | 44 | # but celery imports settings.py too, so don't check if we're celery 45 | 46 | my_process = os.path.basename(sys.argv[0]) 47 | if my_process != 'celery': 48 | assert REQUIRED_EXTERNAL_APPS <= READY_EXTERNAL_APPS -------------------------------------------------------------------------------- /source/memex/settings_files/dev_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for memex project. 3 | 4 | For more information on this file, see 5 | https://docs.djangoproject.com/en/1.7/topics/settings/ 6 | 7 | For the full list of settings and their values, see 8 | https://docs.djangoproject.com/en/1.7/ref/settings/ 9 | """ 10 | 11 | from common_settings import * 12 | 13 | # SECURITY WARNING: keep the secret key used in production secret! 14 | SECRET_KEY = '0#t((zq66&3*87djaltu-pn34%0p!*v_332f2p!$2i)w5y17f8' 15 | 16 | # SECURITY WARNING: don't run with debug turned on in production! 17 | DEBUG = True 18 | TEMPLATE_DEBUG = True 19 | 20 | INSTALLED_APPS += ('debug_toolbar',) 21 | 22 | MEDIA_ROOT = os.path.join(BASE_DIR, 'resources') 23 | PROJECT_PATH = os.path.join(MEDIA_ROOT, "projects") 24 | DEPLOYMENT = False 25 | 26 | EXTERNAL_APP_LOCATIONS = { 27 | 'ddt': 'http://localhost:8084', 28 | 'tad': 'http://localhost:5000', 29 | 'kibana': 'http://localhost:5601', 30 | } 31 | -------------------------------------------------------------------------------- /source/memex/supervisor_services.py: -------------------------------------------------------------------------------- 1 | """ Module for querying supervisor for running/available services 2 | """ 3 | 4 | import xmlrpclib 5 | import socket 6 | import time 7 | 8 | # supervisor doesn't know how to "wait to bring up a process until other processes are ready" 9 | # but I can wait for 3 seconds before querying supervisor on other process status to give them a chance to get ready 10 | wait_3 = True 11 | 12 | 13 | def check_process_state(process_name, state='RUNNING'): 14 | global wait_3 15 | if wait_3: 16 | time.sleep(3) 17 | wait_3 = False 18 | server = xmlrpclib.Server('http://localhost:9001/RPC2') 19 | try: 20 | response = server.supervisor.getProcessInfo(process_name)['statename'] == state 21 | except socket.error: 22 | response = None 23 | return response -------------------------------------------------------------------------------- /source/memex/test_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test settings for memex project. 
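These override the default settings with throwaway paths under
test_resources, so the test suite never touches real crawl or media data.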
3 | 4 | """ 5 | import warnings 6 | import exceptions 7 | import os 8 | 9 | # Use default settings, overriding only those deemed necessary 10 | from .settings import * 11 | 12 | MEDIA_ROOT = os.path.join(BASE_DIR, 'test_resources') 13 | 14 | MEDIA_URL = '/test_resources/' 15 | 16 | DEPLOYMENT = False 17 | 18 | TEST_CRAWL_DATA = os.path.join(MEDIA_ROOT, "test_crawl_data") 19 | 20 | LANG_DETECT_PATH = os.path.join(MEDIA_ROOT, 'profiles') 21 | 22 | CONFIG_PATH = os.path.join(MEDIA_ROOT, 'configs') 23 | 24 | CCA_PATH = os.path.join(MEDIA_ROOT, 'cca') 25 | 26 | CRAWL_PATH = os.path.join(MEDIA_ROOT, 'crawls') 27 | 28 | MODEL_PATH = os.path.join(MEDIA_ROOT, 'models') 29 | 30 | IMAGES_PATH = os.path.join(MEDIA_ROOT, 'images') 31 | 32 | SEEDS_TMP_DIR = os.path.join(MEDIA_ROOT, 'seeds_tmp') 33 | 34 | MODELS_TMP_DIR = os.path.join(MEDIA_ROOT, 'models_tmp') 35 | 36 | TESTING = True 37 | 38 | # Ignore (particular) warnings 39 | # ============================ 40 | 41 | # RuntimeWarning: 42 | # SQLite received a naive datetime (2012-11-02 11:20:15.156506) 43 | # while time zone support is active. 44 | # http://stackoverflow.com/questions/11708821/ 45 | # django-ipython-sqlite-complains-about-naive-datetime 46 | warnings.filterwarnings("ignore", category=exceptions.RuntimeWarning, 47 | module='django.db.backends.sqlite3.base', lineno=63) 48 | -------------------------------------------------------------------------------- /source/memex/test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/memex/test_utils/__init__.py -------------------------------------------------------------------------------- /source/memex/test_utils/unit_test_utils.py: -------------------------------------------------------------------------------- 1 | # Test 2 | from django.test import TestCase, Client 3 | 4 | # Utility 5 | from django.core.urlresolvers import reverse 6 | 7 | def form_errors(response): 8 | return response.context['form'].errors 9 | 10 | def get_object(response): 11 | return response.context['object'] 12 | 13 | class UnitTestSkeleton(TestCase): 14 | 15 | @classmethod 16 | def setUpClass(cls): 17 | cls.client = Client() 18 | 19 | 20 | @classmethod 21 | def get(cls, view_name, *args, **kwargs): 22 | if 'slugs' in kwargs: 23 | slugs = kwargs.pop('slugs') 24 | return cls.client.get( 25 | reverse(view_name, kwargs=slugs), 26 | *args, follow=True, **kwargs) 27 | else: 28 | return cls.client.get(reverse(view_name), 29 | *args, follow=True, **kwargs) 30 | 31 | @classmethod 32 | def post(cls, view_name, *args, **kwargs): 33 | if 'slugs' in kwargs: 34 | slugs = kwargs.pop('slugs') 35 | return cls.client.post( 36 | reverse(view_name, kwargs=slugs), 37 | *args, follow=True, **kwargs) 38 | else: 39 | return cls.client.post( 40 | reverse(view_name), 41 | *args, follow=True, **kwargs) 42 | -------------------------------------------------------------------------------- /source/memex/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import patterns, include, url 2 | from django.contrib import admin 3 | 4 | from rest import router, DataWakeView 5 | 6 | 7 | urlpatterns = patterns('', 8 | url(r'', include('base.urls', namespace="base")), 9 | url(r'^admin/', include(admin.site.urls)), 10 | url(r'^api/', include(router.urls)), 11 | url(r'^api/datawake/$', DataWakeView.as_view(), name="datawake"), 12 | 
url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
13 | )
14 | 
--------------------------------------------------------------------------------
/source/memex/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for memex project.
3 | 
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 | 
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.7/howto/deployment/wsgi/
8 | """
9 | 
10 | import os
11 | import sys
12 | 
13 | sys.path.insert(1, '/'.join(os.path.abspath(__file__).split('/')[:-2]))
14 | 
15 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "memex.settings")
16 | 
17 | from django.core.wsgi import get_wsgi_application
18 | application = get_wsgi_application()
19 | 
20 | 
--------------------------------------------------------------------------------
/source/resources/configs/config_default/crawler/crawler.cfg:
--------------------------------------------------------------------------------
1 | #
2 | #
3 | # Crawler's configuration file
4 | #
5 | #
6 | 
7 | ROBOT_THREAD_GROUP crawler_group
8 | 
9 | ROBOT_MANAGER_RESTINGTIME 10
10 | 
11 | ROBOT_MANAGER_CHECKTIME 30000
12 | 
13 | ROBOT_MANAGER_MAXTIME 30000
14 | 
15 | ROBOT_MANAGER_ERROR_SLEEPTIME 5000
16 | 
17 | ROBOT_MANAGER_ROBOT_ERROR_SLEEP_TIME 5000
18 | 
19 | ROBOT_MANAGER_ROBOT_THREAD_FACTOR 10
20 | 
21 | ROBOT_QUANTITY 5
22 | 
23 | ########DOWNLOADER_CONFIGURATION########
24 | # Whether normal log messages should be written to standard output.
25 | DOWNLOADER_SHOW_NORMAL_LOG true
26 | 
27 | # Whether error messages should be written to standard output.
28 | DOWNLOADER_SHOW_ERROR_LOG true
29 | 
30 | # Identifier of the downloader.
31 | DOWNLOADER_ID simpleGet
32 | 
33 | # Protocol used for the downloader's requests.
34 | DOWNLOADER_PROTOCOL HTTP/1.0
35 | 
36 | # Method used for the request.
37 | DOWNLOADER_METHOD GET
38 | 
39 | # For each URL a downloader tries to fetch, it creates a thread responsible for
40 | # creating the socket. For some URLs socket creation hangs; when that happens the
41 | # downloader reports status=FAIL_TIMEOUT and the thread stays blocked. The variable
42 | # below sets the maximum number of blocked download threads allowed; if the number of
43 | # blocked threads exceeds DOWNLOADER_MAX_BLOCKED_THREADS, the manager shuts down automatically.
44 | DOWNLOADER_MAX_BLOCKED_THREADS 20000000
45 | 
46 | # Timeout for establishing the connection; if exceeded, the downloader reports status = FAIL_TIMEOUT.
47 | DOWNLOADER_TIMEOUT 5000
48 | 
49 | # Whether to follow a possible redirect of the requested URL.
50 | DOWNLOADER_FOLLOW_REDIRECT true
51 | 
52 | # How many redirects the downloader is allowed to follow.
53 | DOWNLOADER_FOLLOW_REDIRECT_TOLERANCE 2
54 | 
55 | #
56 | # To add a specific header to the request, define a property
57 | # prefixed with DOWNLOADER_REQUEST.
58 | #
59 | 
60 | # Token that separates the name and the value of a property used in the request.
61 | DOWNLOADER_TOKEN |
62 | 
63 | # Parameter that sets the User-Agent field.
64 | DOWNLOADER_REQUEST_AGENT User-Agent | Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0)
65 | #Mozilla/4.0 (compatible; MSIE 6.0; Windows 98)
66 | #Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0)
67 | #Mozilla/5.0 (Windows; U; Win98; en-US; rv:0.9.2) Gecko/20010726 Netscape6/6.1
68 | #Mozilla/4.76 [en] (X11; U; Linux 2.4.2-2 i586)
69 | 
70 | # Parameter that sets the Accept field.
71 | DOWNLOADER_REQUEST_ACCEPT Accept | */*
72 | 
--------------------------------------------------------------------------------
/source/resources/configs/config_default/form_storage/form_storage.cfg:
--------------------------------------------------------------------------------
1 | #
2 | # Configuration file of formStorage
3 | #
4 | 
5 | 
6 | # Parameters for making the formStorage available remotely
7 | . ../home.cfg
8 | 
9 | FILE_CLASSIFIER ${CONFIG_DIR}/form_storage/gfc1.model
10 | 
11 | ATTRIBUTES hidden checkbox radio file_input submit select image_input button reset password textbox elem_select total_size_text method search_string
12 | 
13 | CLASS_VALUES yes no
14 | 
15 | FORM_STORAGE_DIRECTORY ${DATA_DIR}/data_form/
16 | 
17 | FORM_HASH_DIRECTORY ${DATA_DIR}/data_host/
18 | 
19 | NON_SEARCHABLE_DIRECTORY ${DATA_DIR}/data_non_search_form/
20 | 
21 | STOPLIST ${CONFIG_DIR}/stoplist.txt
22 | 
23 | PAGE_SAVED_DIRECTORY ${DATA_DIR}/saved_page/
24 | 
25 | BAD_LIST_DIRECTORY ${DATA_DIR}/data_bad_list
26 | 
27 | CONT_BAD_URL_LIMIT 5
28 | 
29 | VISIT_PAGE_LIMIT 99999
30 | 
31 | MAX_CACHE_HOST_SIZE 1000
32 | 
33 | MAX_REPEATED_FORMS_PER_SITE 10
34 | 
35 | MAX_FORMS_PER_SITE 3
36 | 
37 | LINK_STORAGE_FILE ${CONFIG_DIR}/link_storage/link_storage.cfg
38 | 
39 | STATISTIC_FILE ${DATA_DIR}/statistics.txt
40 | 
41 | TSFC_CONFIG ${CONFIG_DIR}/form_storage/tsfc/car.cfg
42 | 
43 | BOW_HOST localhost
44 | 
45 | BOW_PORT 5000
46 | 
47 | USE_BOW TRUE
48 | 
49 | # Storage server configuration #############################################################
50 | 
51 | RMI_STORAGE_SERVER_HOST localhost
52 | 
53 | RMI_STORAGE_SERVER_PORT 3191
54 | 
55 | RMI_STORAGE_SERVER_NAME formStorage
56 | 
57 | STORAGE_TRY_NUMBER 1
58 | 
59 | STORAGE_DELAY_AFTER_EXCEPTION 5
60 | 
61 | STORAGE_BINDER_CLASSNAME focusedCrawler.util.storage.socket.StorageBinder
62 | 
63 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory
64 | 
65 | 
--------------------------------------------------------------------------------
/source/resources/configs/config_default/link_storage/backlink.cfg:
--------------------------------------------------------------------------------
1 | ######
2 | # This configuration file has the parameters to run the LinkClassifier
3 | #
4 | ####
5 | . ../home.cfg
6 | 
7 | PATTERN_INI ,"uu":"
8 | 
9 | #PATTERN_INI title="
10 | #
11 | #q=related: 12 | 13 | PATTERN_END "} 14 | #PATTERN_END "> 15 | # 16 | #>similar  17 | 18 | PATTERN_INI_TITLE ,"ut":" 19 | 20 | PATTERN_END_TITLE ","uu": 21 | 22 | DEEP_BACKLINK 3 23 | 24 | NUM_BACKLINK 10 25 | 26 | #BACKLINK http://siteexplorer.search.yahoo.com/search?bwm=i&bwmo=d&bwmf=u&p= 27 | #BACKLINK http://www.altavista.com/web/results?itag=ody&q=link: 28 | BACKLINK http://www.opensiteexplorer.org/links.html?site= 29 | 30 | 31 | CONNECT_TIMEOUT 5000 32 | 33 | READ_TIMEOUT 5000 34 | 35 | 36 | -------------------------------------------------------------------------------- /source/resources/configs/config_default/link_storage/link_storage.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file of linkStorage 3 | # 4 | 5 | 6 | #Export all urls in the frontier each time when a certain number of new urls are inserted 7 | FRONTIER_REFRESH_FREQUENCY 500 8 | 9 | #STOP CONDITION: when to stop crawling pages from a domain 10 | MAX_PAGES_PER_DOMAIN 100 11 | 12 | CLASS_VALUES 0 1 2 13 | 14 | #Allows the crawler to follow forward links 15 | GRAB_LINKS TRUE 16 | 17 | #Restricts the crawler to the sites provided as seeds 18 | USE_SCOPE FALSE 19 | 20 | #Gets backlinks of the pages from a search engine used by the bipartite crawling 21 | SAVE_BACKLINKS FALSE 22 | 23 | #Type of classifier used by link storage 24 | ##LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided 25 | ##LinkClassifierImpl: link strategy using a link classifier 26 | ##LinkClassifierAuthority: link strategy for the bipartite crawling 27 | TYPE_OF_CLASSIFIER LinkClassifierBaseline 28 | 29 | #Create classifiers on-the-fly 30 | ONLINE_LEARNING FALSE 31 | 32 | #Type of online learning (FORWARD_CLASSIFIER_BINARY,FORWARD_CLASSIFIER_BINARY) 33 | ##FORWARD_CLASSIFIER_BINARY: pos/neg link classifier 34 | ##FORWARD_CLASSIFIER_LEVELS: contextual graph with 3 levels 35 | ONLINE_METHOD FORWARD_CLASSIFIER_BINARY 36 | 37 | #Learn iteration criterion (every n pages runs online learning) 38 | LEARNING_LIMIT 500 39 | 40 | #########OTHER PARAMETERS########## 41 | MAX_SIZE_LINK_QUEUE 100000 42 | 43 | MAX_CACHE_URLS_SIZE 200000 44 | 45 | LINK_DIRECTORY data_url/dir 46 | 47 | MAX_CACHE_BACKLINK_SIZE 100000 48 | 49 | BACKLINK_DIRECTORY ${DATA_DIR}/data_backlinks/dir 50 | 51 | BACKLINK_DIRECTORY ${DATA_DIR}/data_backlinks/dir 52 | 53 | HUB_DIRECTORY ${DATA_DIR}/data_backlinks/hubHash/ 54 | 55 | URL_ID_DIRECTORY data_backlinks/url 56 | 57 | AUTH_ID_DIRECTORY data_backlinks/auth_id 58 | 59 | AUTH_GRAPH_DIRECTORY data_backlinks/auth_graph 60 | 61 | HUB_ID_DIRECTORY data_backlinks/hub_id 62 | 63 | HUB_GRAPH_DIRECTORY data_backlinks/hub_graph 64 | 65 | CONFIG_FILTERING ${CONFIG_DIR}/form_storage/filtering.cfg 66 | 67 | CONFIG_LCRECREATOR ${CONFIG_DIR}/classifierRecreator.cfg 68 | 69 | BACKLINK_CONFIG ${CONFIG_DIR}/link_storage/backlink.cfg 70 | 71 | FORM_STORAGE_FILE ${CONFIG_DIR}/target_storage/target_storage.cfg 72 | 73 | NOT_RELEVANT_DIRECTORY ${DATA_DIR}/data_not_relevance 74 | 75 | TARGET_STORAGE_DIRECTORY data_target/ 76 | 77 | 78 | ######################################STORAGE_FILE############################################### 79 | 80 | RMI_STORAGE_SERVER_HOST localhost 81 | 82 | RMI_STORAGE_SERVER_PORT 1988 83 | 84 | RMI_STORAGE_SERVER_NAME linkStorage 85 | 86 | STORAGE_TRY_NUMBER 1 87 | 88 | STORAGE_DELAY_AFTER_EXCEPTION 5 89 | 90 | STORAGE_BINDER_CLASSNAME 
focusedCrawler.util.storage.socket.StorageBinder 91 | 92 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory 93 | -------------------------------------------------------------------------------- /source/resources/configs/config_default/stoplist.txt: -------------------------------------------------------------------------------- 1 | *** Excecoes 2 | *** Palavras Irrelevantes 3 | 17 4 | 5 | 2003 6 | 7 | a 8 | about 9 | above 10 | all 11 | also 12 | am 13 | an 14 | and 15 | are 16 | as 17 | asked 18 | at 19 | away 20 | back 21 | be 22 | because 23 | been 24 | below 25 | beside 26 | between 27 | both 28 | but 29 | by 30 | bytes 31 | 32 | came 33 | can 34 | com 35 | 36 | come 37 | could 38 | did 39 | do 40 | doing 41 | done 42 | each 43 | edu 44 | 45 | else 46 | for 47 | from 48 | fri 49 | 50 | gmt 51 | 52 | gov 53 | 54 | had 55 | has 56 | have 57 | he 58 | her 59 | hers 60 | him 61 | his 62 | 63 | href 64 | 65 | html 66 | 67 | http 68 | 69 | i 70 | if 71 | in 72 | into 73 | it 74 | its 75 | let 76 | many 77 | me 78 | mon 79 | 80 | mr 81 | much 82 | my 83 | no 84 | none 85 | nor 86 | not 87 | now 88 | 89 | oct 90 | 91 | of 92 | off 93 | ok 94 | on 95 | onto 96 | or 97 | 98 | org 99 | 100 | other 101 | our 102 | ours 103 | out 104 | over 105 | own 106 | part 107 | sat 108 | 109 | say 110 | sep 111 | 112 | since 113 | so 114 | soon 115 | still 116 | such 117 | text 118 | 119 | than 120 | that 121 | the 122 | them 123 | then 124 | there 125 | these 126 | they 127 | this 128 | those 129 | thu 130 | 131 | to 132 | too 133 | tue 134 | 135 | under 136 | until 137 | up 138 | upon 139 | very 140 | was 141 | we 142 | wed 143 | 144 | were 145 | what 146 | when 147 | which 148 | whose 149 | why 150 | with 151 | would 152 | www 153 | 154 | yet 155 | you 156 | your 157 | yours 158 | *** Terminacoes Ignoraveis 159 | *** Palavras Complementares 160 | -------------------------------------------------------------------------------- /source/resources/configs/config_default/target_storage/target_storage.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file of linkStorage 3 | # 4 | DATA_FORMAT ELASTICSEARCH 5 | ELASTICSEARCH_HOST localhost 6 | ELASTICSEARCH_PORT 9300 7 | 8 | #Dump crawler's status when a certain number of pages are crawled 9 | CRAWLED_REFRESH_FREQUENCY 5 10 | RELEVANT_REFRESH_FREQUENCY 5 11 | HARVESTINFO_REFRESH_FREQUENCY 5 12 | 13 | SYNC_REFRESH_FREQUENCY 5 14 | REFRESH_SYNC TRUE 15 | 16 | SAVE_NEGATIVE_PAGES FALSE 17 | 18 | TARGET_STORAGE_DIRECTORY data_target 19 | NEGATIVE_STORAGE_DIRECTORY data_negative 20 | 21 | VISITED_PAGE_LIMIT 90000000 22 | 23 | MAX_CACHE_HOST_SIZE 1000 24 | 25 | #uses a classifier to collect pages in topics,domains etc. 26 | USE_CLASSIFIER TRUE 27 | 28 | #Performs hard focus or soft focus 29 | HARD_FOCUS TRUE 30 | 31 | #Run bipartite crawler 32 | BIPARTITE FALSE 33 | 34 | #Relevance threshold for classified pages. 
35 | #relevant above this threshold are considered relevant 36 | RELEVANCE_THRESHOLD 0.9 37 | 38 | #Format of crawled data, either FILE, CBOR, or ELASTICSEARCH 39 | #DATA_FORMAT FILE 40 | #DATA_FORMAT CBOR 41 | 42 | ######################################STORAGE_FILE############################################### 43 | 44 | RMI_STORAGE_SERVER_HOST localhost 45 | 46 | RMI_STORAGE_SERVER_PORT 1987 47 | 48 | RMI_STORAGE_SERVER_NAME formStorage 49 | 50 | STORAGE_TRY_NUMBER 1 51 | 52 | STORAGE_DELAY_AFTER_EXCEPTION 5 53 | 54 | STORAGE_BINDER_CLASSNAME focusedCrawler.util.storage.socket.StorageBinder 55 | 56 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory 57 | -------------------------------------------------------------------------------- /source/resources/logs/README.md: -------------------------------------------------------------------------------- 1 | This directory is where supervisor logfiles go by default. 2 | 3 | In production, edit supervisord.conf to point the logs somewhere else. -------------------------------------------------------------------------------- /source/supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisorctl] 2 | username=cloud-user 3 | 4 | [supervisord] 5 | childlogdir=resources/logs 6 | logfile=supervisord.log ; (main log file;default $CWD/supervisord.log) 7 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 8 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 9 | loglevel=info ; (log level;default info; others: debug,warn,trace) 10 | pidfile=supervisord.pid ; (supervisord pidfile;default supervisord.pid) 11 | nodaemon=true ; (start in foreground if true;default false) 12 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 13 | minprocs=200 ; (min. avail process descriptors;default 200) 14 | 15 | [inet_http_server] 16 | port = 127.0.0.1:9001 17 | 18 | [program:redis] 19 | command=redis-server 20 | priority=1 21 | 22 | [program:elasticsearch] 23 | command=elasticsearch 24 | priority=1 25 | 26 | [program:nutch] 27 | command=nutch startserver 28 | priority=1 29 | 30 | [program:rabbitmq] 31 | command=rabbitmq-server 32 | priority=1 33 | autostart=false 34 | 35 | [program:bokeh-server] 36 | command=bokeh-server --backend memory --port 5006 37 | priority=1 38 | autostart=false 39 | 40 | [program:tika] 41 | command=tika-rest-server 42 | priority=2 43 | 44 | [program:kibana] 45 | command=kibana 46 | priority=2 47 | 48 | [program:ddt] 49 | command=ddt 50 | priority=5 51 | autostart=false 52 | 53 | [program:tad] 54 | command=tad 55 | priority=5 56 | autostart=false 57 | 58 | # Celery has to come up after the other applications 59 | [program:celery] 60 | command=celery -A memex worker --loglevel=info --logfile=resources/logs/celery-worker.log 61 | priority=10 62 | killasgroup=true 63 | 64 | [program:django] 65 | command=python manage.py runserver 127.0.0.1:8000 66 | priority=20 67 | stopasgroup=true 68 | 69 | [rpcinterface:supervisor] 70 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 71 |
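The [inet_http_server] section above exposes supervisord's XML-RPC interface on 127.0.0.1:9001, which is the same interface supervisorctl talks to. A minimal Python 2 sketch of querying it directly (it assumes supervisord is already running with this config file):

import xmlrpclib

proxy = xmlrpclib.ServerProxy('http://127.0.0.1:9001/RPC2')

# Print each configured program (redis, elasticsearch, nutch, ...) and its state.
for info in proxy.supervisor.getAllProcessInfo():
    print '%(name)s: %(statename)s' % info

# Programs declared with autostart=false (e.g. rabbitmq) can be started on demand.
proxy.supervisor.startProcess('rabbitmq')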
-------------------------------------------------------------------------------- /source/task_manager/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /source/task_manager/file_tasks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | 5 | from celery import shared_task, Task 6 | 7 | from tika_tasks import create_index 8 | 9 | 10 | 11 | 12 | class UploadZipTask(Task): 13 | abstract = True 14 | 15 | def on_failure(self, *args, **kwargs): 16 | """If there is an error, set the index status to UPLOAD FAILURE.""" 17 | self.index.status = "UPLOAD FAILURE" 18 | self.index.save() 19 | 20 | def on_success(self, *args, **kwargs): 21 | """ 22 | If the upload task succeeded, change the index status to UPLOAD SUCCESS. 23 | 24 | Record the number of extracted files and queue creation of the 25 | corresponding index. 26 | """ 27 | self.index.status = "UPLOAD SUCCESS" 28 | self.index.num_files = len(os.listdir(self.index.get_dumped_data_path())) 29 | self.index.save() 30 | create_index.delay(self.index) 31 | 32 | 33 | @shared_task(bind=True, base=UploadZipTask) 34 | def upload_zip(self, index, *args, **kwargs): 35 | """ 36 | Celery task which unzips the files in a .zip archive, ignoring folder 37 | structure and taking each file to the top level of the output folder.
38 | """ 39 | self.index = index 40 | self.index.status = "STARTED" 41 | self.index.save() 42 | if not os.path.exists(self.index.data_folder): 43 | os.mkdir(self.index.data_folder) 44 | with zipfile.ZipFile(self.index.uploaded_data.name) as archive: 45 | for x in archive.namelist(): 46 | filename = os.path.basename(x).decode("utf-8") 47 | if not filename: 48 | continue 49 | source = archive.open(x) 50 | target = open(os.path.join(self.index.data_folder, filename), "wb") 51 | with source, target: 52 | shutil.copyfileobj(source, target) 53 | return "success" 54 | -------------------------------------------------------------------------------- /source/task_manager/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('base', '0001_initial'), 11 | ('crawl_space', '0001_initial'), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='CeleryTask', 17 | fields=[ 18 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 19 | ('pid', models.IntegerField(default=0)), 20 | ('uuid', models.TextField()), 21 | ('crawl', models.OneToOneField(null=True, default=None, blank=True, to='crawl_space.Crawl')), 22 | ('index', models.OneToOneField(null=True, default=None, blank=True, to='base.Index')), 23 | ], 24 | options={ 25 | }, 26 | bases=(models.Model,), 27 | ), 28 | ] 29 | -------------------------------------------------------------------------------- /source/task_manager/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/task_manager/migrations/__init__.py -------------------------------------------------------------------------------- /source/task_manager/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | from celery.result import AsyncResult 4 | 5 | from apps.crawl_space.models import Crawl 6 | 7 | from base.models import Index 8 | 9 | 10 | class CeleryTask(models.Model): 11 | 12 | pid = models.IntegerField(default=0) 13 | crawl = models.OneToOneField(Crawl, blank=True, null=True, default=None) 14 | index = models.OneToOneField(Index, blank=True, null=True, default=None) 15 | uuid = models.TextField() 16 | 17 | @property 18 | def task(self): 19 | """ 20 | Gives an instance of the crawl task which can be used to check on the 21 | status of the crawl. 22 | """ 23 | return AsyncResult(self.uuid) 24 | 25 | def __unicode__(self): 26 | return str(self.uuid) 27 | -------------------------------------------------------------------------------- /source/task_manager/tasks.py: -------------------------------------------------------------------------------- 1 | from crawl_tasks import ache, nutch 2 | from file_tasks import upload_zip 3 | from tika_tasks import create_index 4 | -------------------------------------------------------------------------------- /source/task_manager/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
-------------------------------------------------------------------------------- /source/task_manager/tasks.py: -------------------------------------------------------------------------------- 1 | from crawl_tasks import ache, nutch 2 | from file_tasks import upload_zip 3 | from tika_tasks import create_index 4 | -------------------------------------------------------------------------------- /source/task_manager/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /source/templates/403.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | 3 | {% block title %}Permission Denied{% endblock %} 4 | 5 | {% load static %} 6 | 7 | {% block content %} 8 | 9 | <div> 10 | <div> 11 | <h1>Permission Denied (403)</h1> 12 | <p>You do not have the proper permissions to view this page. Visit the index and seek again!</p> 13 | </div> 14 | </div> 15 | 16 | 17 | {% endblock content %} 18 |
-------------------------------------------------------------------------------- /source/templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | 3 | {% block title %}Page Not Found{% endblock %} 4 | 5 | {% load static %} 6 | 7 | {% block content %} 8 | 9 | <div> 10 | <div> 11 | <h1>Page Not Found (404)</h1> 12 | <p>Perhaps visit the index and seek again!</p> 13 | </div> 14 | </div> 15 | 16 | 17 | {% endblock content %} 18 |
-------------------------------------------------------------------------------- /source/templates/500.html: -------------------------------------------------------------------------------- 1 | {% extends "base/base.html" %} 2 | 3 | {% block title %}Server Error{% endblock %} 4 | 5 | {% load static %} 6 | 7 | {% block content %} 8 | 9 | <div> 10 | <div> 11 | <h1>Internal Server Error (500)</h1> 12 | <p>An administrator has been notified.</p> 13 | <p>Perhaps visit the index and seek again!</p> 14 | </div> 15 | </div>
16 | 17 | {% endblock content %} 18 | -------------------------------------------------------------------------------- /source/test_resources/configs/config_default/crawler/crawler.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Crawler's configuration file 4 | # 5 | # 6 | 7 | ROBOT_THREAD_GROUP crawler_group 8 | 9 | ROBOT_MANAGER_RESTINGTIME 10 10 | 11 | ROBOT_MANAGER_CHECKTIME 30000 12 | 13 | ROBOT_MANAGER_MAXTIME 30000 14 | 15 | ROBOT_MANAGER_ERROR_SLEEPTIME 5000 16 | 17 | ROBOT_MANAGER_ROBOT_ERROR_SLEEP_TIME 5000 18 | 19 | ROBOT_MANAGER_ROBOT_THREAD_FACTOR 10 20 | 21 | ROBOT_QUANTITY 5 22 | 23 | ########DOWNLOADER_CONFIGURATION######## 24 | # Whether normal log messages should be written to standard output. 25 | DOWNLOADER_SHOW_NORMAL_LOG true 26 | 27 | # Whether error messages should be written to standard output. 28 | DOWNLOADER_SHOW_ERROR_LOG true 29 | 30 | # Identifier of the downloader. 31 | DOWNLOADER_ID simpleGet 32 | 33 | # Protocol used for the downloader's requests. 34 | DOWNLOADER_PROTOCOL HTTP/1.0 35 | 36 | # Method used in the request. 37 | DOWNLOADER_METHOD GET 38 | 39 | # For each URL a downloader tries to fetch, it creates a thread responsible for 40 | # creating the socket. For some URLs the socket creation process hangs; when that happens, 41 | # the downloader reports status=FAIL_TIMEOUT and the thread stays blocked. The variable below sets 42 | # the maximum number of blocked download threads that may exist; if the number of blocked threads 43 | # exceeds DOWNLOADER_MAX_BLOCKED_THREADS, the manager shuts itself down automatically. 44 | DOWNLOADER_MAX_BLOCKED_THREADS 20000000 45 | 46 | # Timeout for establishing the connection; if it is exceeded, the downloader reports status = FAIL_TIMEOUT. 47 | DOWNLOADER_TIMEOUT 5000 48 | 49 | # Whether to follow a possible redirect of the requested URL. 50 | DOWNLOADER_FOLLOW_REDIRECT true 51 | 52 | # How many redirects the downloader is allowed to follow. 53 | DOWNLOADER_FOLLOW_REDIRECT_TOLERANCE 2 54 | 55 | # 56 | # To add a specific header to the request, just set a property preceded 57 | # by the DOWNLOADER_REQUEST prefix. 58 | # 59 | 60 | # Token that separates the name and the value of a property used in the request. 61 | DOWNLOADER_TOKEN | 62 | 63 | # Parameter that sets the User-Agent field. 64 | DOWNLOADER_REQUEST_AGENT User-Agent | Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0) 65 | #Mozilla/4.0 (compatible; MSIE 6.0; Windows 98) 66 | #Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0) 67 | #Mozilla/5.0 (Windows; U; Win98; en-US; rv:0.9.2) Gecko/20010726 Netscape6/6.1 68 | #Mozilla/4.76 [en] (X11; U; Linux 2.4.2-2 i586) 69 | 70 | # Parameter that sets the Accept field. 71 | DOWNLOADER_REQUEST_ACCEPT Accept | */* 72 |
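All of these ACHE configuration files share the same plain "KEY value" format: one setting per line, "#" for comments, and ${VAR} placeholders such as ${DATA_DIR} resolved at load time. A minimal Python 2 sketch of a reader for this format (illustrative only; ACHE's real parser is Java, and include lines such as ". ../home.cfg" are simply skipped here):

import os
import re

def read_cfg(path):
    """Parse an ACHE-style 'KEY value' config file into a dict."""
    params = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or line.startswith('.'):
                continue  # skip blanks, comments, and include directives
            parts = line.split(None, 1)
            key = parts[0]
            value = parts[1] if len(parts) > 1 else ''
            # Resolve ${VAR} placeholders from the environment, if set.
            value = re.sub(r'\$\{(\w+)\}',
                           lambda m: os.environ.get(m.group(1), m.group(0)),
                           value)
            params[key] = value
    return params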
-------------------------------------------------------------------------------- /source/test_resources/configs/config_default/form_storage/form_storage.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file of formStorage 3 | # 4 | 5 | 6 | # Parameters for making the formStorage available remotely 7 | . ../home.cfg 8 | 9 | FILE_CLASSIFIER ${CONFIG_DIR}/form_storage/gfc1.model 10 | 11 | ATTRIBUTES hidden checkbox radio file_input submit select image_input button reset password textbox elem_select total_size_text method search_string 12 | 13 | CLASS_VALUES yes no 14 | 15 | FORM_STORAGE_DIRECTORY ${DATA_DIR}/data_form/ 16 | 17 | FORM_HASH_DIRECTORY ${DATA_DIR}/data_host/ 18 | 19 | NON_SEARCHABLE_DIRECTORY ${DATA_DIR}/data_non_search_form/ 20 | 21 | STOPLIST ${CONFIG_DIR}/stoplist.txt 22 | 23 | PAGE_SAVED_DIRECTORY ${DATA_DIR}/saved_page/ 24 | 25 | BAD_LIST_DIRECTORY ${DATA_DIR}/data_bad_list 26 | 27 | CONT_BAD_URL_LIMIT 5 28 | 29 | VISIT_PAGE_LIMIT 99999 30 | 31 | MAX_CACHE_HOST_SIZE 1000 32 | 33 | MAX_REPEATED_FORMS_PER_SITE 10 34 | 35 | MAX_FORMS_PER_SITE 3 36 | 37 | LINK_STORAGE_FILE ${CONFIG_DIR}/link_storage/link_storage.cfg 38 | 39 | STATISTIC_FILE ${DATA_DIR}/statistics.txt 40 | 41 | TSFC_CONFIG ${CONFIG_DIR}/form_storage/tsfc/car.cfg 42 | 43 | BOW_HOST localhost 44 | 45 | BOW_PORT 5000 46 | 47 | USE_BOW TRUE 48 | 49 | ######################################STORAGE_FILE############################################### 50 | 51 | RMI_STORAGE_SERVER_HOST localhost 52 | 53 | RMI_STORAGE_SERVER_PORT 3191 54 | 55 | RMI_STORAGE_SERVER_NAME formStorage 56 | 57 | STORAGE_TRY_NUMBER 1 58 | 59 | STORAGE_DELAY_AFTER_EXCEPTION 5 60 | 61 | STORAGE_BINDER_CLASSNAME focusedCrawler.util.storage.socket.StorageBinder 62 | 63 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory 64 | 65 | -------------------------------------------------------------------------------- /source/test_resources/configs/config_default/link_storage/backlink.cfg: -------------------------------------------------------------------------------- 1 | ###### 2 | # This configuration file has the parameters to run the LinkClassifier 3 | # 4 | #### 5 | . ../home.cfg 6 | 7 | PATTERN_INI ,"uu":" 8 | 9 | #PATTERN_INI title=" 10 | #
11 | #q=related: 12 | 13 | PATTERN_END "} 14 | #PATTERN_END "> 15 | # 16 | #>similar  17 | 18 | PATTERN_INI_TITLE ,"ut":" 19 | 20 | PATTERN_END_TITLE ","uu": 21 | 22 | DEEP_BACKLINK 3 23 | 24 | NUM_BACKLINK 10 25 | 26 | #BACKLINK http://siteexplorer.search.yahoo.com/search?bwm=i&bwmo=d&bwmf=u&p= 27 | #BACKLINK http://www.altavista.com/web/results?itag=ody&q=link: 28 | BACKLINK http://www.opensiteexplorer.org/links.html?site= 29 | 30 | 31 | CONNECT_TIMEOUT 5000 32 | 33 | READ_TIMEOUT 5000 34 | 35 | 36 | -------------------------------------------------------------------------------- /source/test_resources/configs/config_default/link_storage/link_storage.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file of linkStorage 3 | # 4 | 5 | 6 | #Export all URLs in the frontier whenever a certain number of new URLs have been inserted 7 | FRONTIER_REFRESH_FREQUENCY 500 8 | 9 | #STOP CONDITION: when to stop crawling pages from a domain 10 | MAX_PAGES_PER_DOMAIN 100 11 | 12 | CLASS_VALUES 0 1 2 13 | 14 | #Allows the crawler to follow forward links 15 | GRAB_LINKS TRUE 16 | 17 | #Restricts the crawler to the sites provided as seeds 18 | USE_SCOPE FALSE 19 | 20 | #Fetches backlinks of the pages from a search engine (used by the bipartite crawling) 21 | SAVE_BACKLINKS FALSE 22 | 23 | #Type of classifier used by link storage 24 | ##LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided 25 | ##LinkClassifierImpl: link strategy using a link classifier 26 | ##LinkClassifierAuthority: link strategy for the bipartite crawling 27 | TYPE_OF_CLASSIFIER LinkClassifierBaseline 28 | 29 | #Create classifiers on-the-fly 30 | ONLINE_LEARNING FALSE 31 | 32 | #Type of online learning (FORWARD_CLASSIFIER_BINARY,FORWARD_CLASSIFIER_LEVELS) 33 | ##FORWARD_CLASSIFIER_BINARY: pos/neg link classifier 34 | ##FORWARD_CLASSIFIER_LEVELS: contextual graph with 3 levels 35 | ONLINE_METHOD FORWARD_CLASSIFIER_BINARY 36 | 37 | #Learning iteration criterion (run online learning every n pages) 38 | LEARNING_LIMIT 500 39 | 40 | #########OTHER PARAMETERS########## 41 | MAX_SIZE_LINK_QUEUE 100000 42 | 43 | MAX_CACHE_URLS_SIZE 200000 44 | 45 | LINK_DIRECTORY data_url/dir 46 | 47 | MAX_CACHE_BACKLINK_SIZE 100000 48 | 49 | BACKLINK_DIRECTORY ${DATA_DIR}/data_backlinks/dir 50 | 51 | 52 | 53 | HUB_DIRECTORY ${DATA_DIR}/data_backlinks/hubHash/ 54 | 55 | URL_ID_DIRECTORY data_backlinks/url 56 | 57 | AUTH_ID_DIRECTORY data_backlinks/auth_id 58 | 59 | AUTH_GRAPH_DIRECTORY data_backlinks/auth_graph 60 | 61 | HUB_ID_DIRECTORY data_backlinks/hub_id 62 | 63 | HUB_GRAPH_DIRECTORY data_backlinks/hub_graph 64 | 65 | CONFIG_FILTERING ${CONFIG_DIR}/form_storage/filtering.cfg 66 | 67 | CONFIG_LCRECREATOR ${CONFIG_DIR}/classifierRecreator.cfg 68 | 69 | BACKLINK_CONFIG ${CONFIG_DIR}/link_storage/backlink.cfg 70 | 71 | FORM_STORAGE_FILE ${CONFIG_DIR}/target_storage/target_storage.cfg 72 | 73 | NOT_RELEVANT_DIRECTORY ${DATA_DIR}/data_not_relevance 74 | 75 | TARGET_STORAGE_DIRECTORY data_target/ 76 | 77 | 78 | ######################################STORAGE_FILE############################################### 79 | 80 | RMI_STORAGE_SERVER_HOST localhost 81 | 82 | RMI_STORAGE_SERVER_PORT 1988 83 | 84 | RMI_STORAGE_SERVER_NAME linkStorage 85 | 86 | STORAGE_TRY_NUMBER 1 87 | 88 | STORAGE_DELAY_AFTER_EXCEPTION 5 89 | 90 | STORAGE_BINDER_CLASSNAME focusedCrawler.util.storage.socket.StorageBinder 91 | 92 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory 93 |
-------------------------------------------------------------------------------- /source/test_resources/configs/config_default/stoplist.txt: -------------------------------------------------------------------------------- 1 | *** Excecoes 2 | *** Palavras Irrelevantes 3 | 17 4 | 5 | 2003 6 | 7 | a 8 | about 9 | above 10 | all 11 | also 12 | am 13 | an 14 | and 15 | are 16 | as 17 | asked 18 | at 19 | away 20 | back 21 | be 22 | because 23 | been 24 | below 25 | beside 26 | between 27 | both 28 | but 29 | by 30 | bytes 31 | 32 | came 33 | can 34 | com 35 | 36 | come 37 | could 38 | did 39 | do 40 | doing 41 | done 42 | each 43 | edu 44 | 45 | else 46 | for 47 | from 48 | fri 49 | 50 | gmt 51 | 52 | gov 53 | 54 | had 55 | has 56 | have 57 | he 58 | her 59 | hers 60 | him 61 | his 62 | 63 | href 64 | 65 | html 66 | 67 | http 68 | 69 | i 70 | if 71 | in 72 | into 73 | it 74 | its 75 | let 76 | many 77 | me 78 | mon 79 | 80 | mr 81 | much 82 | my 83 | no 84 | none 85 | nor 86 | not 87 | now 88 | 89 | oct 90 | 91 | of 92 | off 93 | ok 94 | on 95 | onto 96 | or 97 | 98 | org 99 | 100 | other 101 | our 102 | ours 103 | out 104 | over 105 | own 106 | part 107 | sat 108 | 109 | say 110 | sep 111 | 112 | since 113 | so 114 | soon 115 | still 116 | such 117 | text 118 | 119 | than 120 | that 121 | the 122 | them 123 | then 124 | there 125 | these 126 | they 127 | this 128 | those 129 | thu 130 | 131 | to 132 | too 133 | tue 134 | 135 | under 136 | until 137 | up 138 | upon 139 | very 140 | was 141 | we 142 | wed 143 | 144 | were 145 | what 146 | when 147 | which 148 | whose 149 | why 150 | with 151 | would 152 | www 153 | 154 | yet 155 | you 156 | your 157 | yours 158 | *** Terminacoes Ignoraveis 159 | *** Palavras Complementares 160 | -------------------------------------------------------------------------------- /source/test_resources/configs/config_default/target_storage/target_storage.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file of targetStorage 3 | # 4 | 5 | 6 | #Dump the crawler's status whenever a certain number of pages have been crawled 7 | CRAWLED_REFRESH_FREQUENCY 500 8 | RELEVANT_REFRESH_FREQUENCY 500 9 | HARVESTINFO_REFRESH_FREQUENCY 100 10 | 11 | SYNC_REFRESH_FREQUENCY 100 12 | REFRESH_SYNC TRUE 13 | 14 | SAVE_NEGATIVE_PAGES FALSE 15 | 16 | TARGET_STORAGE_DIRECTORY data_target 17 | NEGATIVE_STORAGE_DIRECTORY data_negative 18 | 19 | VISITED_PAGE_LIMIT 90000000 20 | 21 | MAX_CACHE_HOST_SIZE 1000 22 | 23 | #Use a classifier to restrict collected pages to given topics, domains, etc. 24 | USE_CLASSIFIER TRUE 25 | 26 | #Perform hard focus or soft focus 27 | HARD_FOCUS TRUE 28 | 29 | #Run the bipartite crawler 30 | BIPARTITE FALSE 31 | 32 | #Relevance threshold for classified pages.
Pages with probability of being 33 | #relevant above this threshold are considered relevant 34 | RELEVANCE_THRESHOLD 0.9 35 | 36 | #Format of crawled data, either CBOR or PLAINFILE 37 | DATA_FORMAT FILE 38 | #DATA_FORMAT CBOR 39 | 40 | ######################################STORAGE_FILE############################################### 41 | 42 | RMI_STORAGE_SERVER_HOST localhost 43 | 44 | RMI_STORAGE_SERVER_PORT 1987 45 | 46 | RMI_STORAGE_SERVER_NAME formStorage 47 | 48 | STORAGE_TRY_NUMBER 1 49 | 50 | STORAGE_DELAY_AFTER_EXCEPTION 5 51 | 52 | STORAGE_BINDER_CLASSNAME focusedCrawler.util.storage.socket.StorageBinder 53 | 54 | STORAGE_FACTORY_CLASSNAME focusedCrawler.util.storage.socket.StorageRemoteAdapterFactory 55 | -------------------------------------------------------------------------------- /source/test_resources/sample.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/test_resources/sample.zip -------------------------------------------------------------------------------- /source/test_resources/sample2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/test_resources/sample2.zip -------------------------------------------------------------------------------- /source/test_resources/test_crawl_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/test_resources/test_crawl_data/.DS_Store -------------------------------------------------------------------------------- /source/test_resources/test_crawl_data/cats.seeds: -------------------------------------------------------------------------------- 1 | http://www.reddit.com/r/aww 2 | http://gizmodo.com/of-course-japan-has-an-island-where-cats-outnumber-peop-1695365964 3 | http://en.wikipedia.org/wiki/Cat 4 | http://www.catchannel.com/ 5 | http://mashable.com/category/cats/ 6 | http://www.huffingtonpost.com/news/cats/ 7 | http://www.lolcats.com/ -------------------------------------------------------------------------------- /source/test_resources/test_crawl_data/harvestinfo.csv: -------------------------------------------------------------------------------- 1 | 1 1 1426535956 2 | 2 2 1426535956 3 | 3 4 1426535957 4 | 4 5 1426535957 5 | 5 7 1426535957 6 | 6 8 1426535958 7 | 7 9 1426535958 8 | 8 10 1426535959 9 | 9 11 1426535959 10 | 10 13 1426535960 11 | 10 15 1426535960 12 | 11 16 1426535962 13 | 12 17 1426535964 14 | 12 20 1426535965 15 | 13 23 1426535966 16 | 14 25 1426535971 17 | 15 27 1426535972 18 | 15 28 1426535972 19 | 16 29 1426535972 20 | 17 31 1426535974 21 | 18 32 1426535975 22 | 19 33 1426535975 23 | 20 34 1426535976 24 | 21 36 1426535976 25 | 22 37 1426535977 26 | 23 38 1426535978 27 | 24 39 1426535978 28 | 25 41 1426535979 29 | 26 43 1426535981 30 | 27 44 1426535981 31 | 28 45 1426535983 32 | 29 47 1426535983 33 | 29 48 1426535984 34 | 30 49 1426535986 35 | 31 50 1426535986 36 | 32 53 1426535988 37 | 32 55 1426535988 38 | 33 56 1426535989 39 | 33 57 1426535989 40 | 34 58 1426535990 41 | 34 60 1426535992 42 | 34 62 1426535993 43 | 35 64 1426535993 44 | 36 65 1426535995 45 | 37 67 1426535998 46 | 38 68 1426535998 47 | 39 69 1426535999 48 | 40 71 1426536000 49 | 41 72 1426536000 50 | 42 
78 1426536007 51 | 43 80 1426536007 52 | 44 81 1426536008 53 | 45 84 1426536010 54 | 46 85 1426536010 55 | 47 86 1426536011 56 | 47 88 1426536011 57 | 48 90 1426536014 58 | 49 91 1426536015 59 | 50 92 1426536015 60 | 51 94 1426536017 61 | 52 96 1426536017 62 | 53 97 1426536018 63 | 54 98 1426536018 64 | 55 99 1426536019 65 | 56 100 1426536019 66 | 57 101 1426536020 67 | 58 103 1426536022 68 | 59 104 1426536022 69 | 60 105 1426536022 70 | 61 107 1426536023 71 | 62 110 1426536028 72 | 63 112 1426536029 73 | 64 115 1426536034 74 | 65 116 1426536035 75 | 66 117 1426536035 76 | 67 120 1426536037 77 | 68 122 1426536041 78 | 69 123 1426536042 79 | 70 126 1426536043 80 | 71 129 1426536048 81 | 72 131 1426536049 82 | 72 132 1426536049 83 | 73 133 1426536049 84 | 73 134 1426536049 85 | 74 136 1426536049 86 | 75 137 1426536050 87 | 75 138 1426536051 88 | 76 140 1426536051 89 | 77 141 1426536051 90 | 78 142 1426536053 91 | 79 145 1426536059 92 | 80 148 1426536065 93 | 81 151 1426536066 94 | 82 152 1426536067 95 | 83 153 1426536068 96 | 83 154 1426536068 97 | 84 155 1426536069 98 | 85 156 1426536069 99 | 86 157 1426536069 100 | 87 160 1426536072 101 | 88 161 1426536073 102 | 89 163 1426536074 103 | 90 164 1426536074 104 | 91 165 1426536075 105 | 92 166 1426536075 106 | 93 167 1426536076 107 | 94 168 1426536077 108 | 95 169 1426536077 109 | 96 170 1426536077 110 | 97 171 1426536077 111 | 98 173 1426536078 112 | 99 174 1426536079 113 | 100 176 1426536080 114 | -------------------------------------------------------------------------------- /source/test_resources/test_crawl_data/nonrelevantpages.csv: -------------------------------------------------------------------------------- 1 | http://www.britannica.com/EBchecked/topic/1088166/viral-hemorrhagic-fever 5.742678302873385E-18 1426535960 2 | http://ebolaviruspictures.com/Symptoms-Of-Ebola2.php 2.701174650737888E-12 1426535965 3 | http://www.nejm.org/doi/full/10.1056/NEJMoa1410863 9.533348862319484E-38 1426535972 4 | http://www.w3.org/2005/Atom 3.19378526614457E-29 1426535984 5 | http://www.nejm.org/doi/full/10.1056/NEJMp1412166 9.533348862319484E-38 1426535988 6 | http://www.webmd.com/a-to-z-guides/video/reece-vaccine-development 1.2671446792631123E-14 1426535989 7 | http://www.nejm.org/doi/full/10.1056/NEJMoa1411100 9.533348862319484E-38 1426535992 8 | http://www.nydailynews.com/life-style/health/ebola-vaccine-trial-halted-temporarily-geneva-hospital-article-1.2042149 8.362303248056457E-53 1426535993 9 | http://www.healthline.com/health/dengue-hemorrhagic-fever 3.1860857274672904E-5 1426536011 10 | http://www.rightdiagnosis.com/h/hemorrhagic_fever/intro.htm 5.639443979158931E-7 1426536049 11 | http://www.eboladeeply.org/archive/articles/ 9.86914955690156E-48 1426536049 12 | http://www.healthgrades.com/conditions/dengue-hemorrhagic-fever 2.0409476821599656E-40 1426536051 13 | http://www.medicinenet.com/ebola_virus_disease_symptoms_and_signs/symptoms.htm 2.2885984986355265E-19 1426536068 14 | -------------------------------------------------------------------------------- /source/test_resources/test_model/pageclassifier.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/memex-explorer/d2910496238359b3676b4467721017fc82f0b324/source/test_resources/test_model/pageclassifier.model --------------------------------------------------------------------------------
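The harvestinfo.csv fixture above is what drives the crawl-space harvest plots. Assuming its three whitespace-separated columns are (relevant pages, total crawled pages, Unix timestamp), which is consistent with the monotonically increasing counts in the fixture, a minimal Python 2 sketch of computing the running harvest rate:

def harvest_rate(path):
    """Yield (timestamp, relevant/crawled) pairs from a harvestinfo.csv file."""
    with open(path) as f:
        for line in f:
            fields = line.split()
            if len(fields) != 3:
                continue  # skip any malformed rows
            relevant, crawled, timestamp = map(int, fields)
            yield timestamp, relevant / float(crawled)

For example, the final row above (100 relevant pages out of 176 crawled) gives a harvest rate of roughly 0.57.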