├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── DECISIONS.md ├── Dockerfile ├── Dockerfile-agent ├── Dockerfile-api ├── Dockerfile-classic-api ├── Dockerfile-elasticsearch ├── Dockerfile-index ├── Dockerfile-kibana ├── Dockerfile-ui ├── LICENSE ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.md ├── RELEASE_NOTES.md ├── api.py ├── app.py ├── audit.py ├── bin ├── start_agent.py └── start_search.sh ├── bulk_index.py ├── classic-api.py ├── classic_api.py ├── config ├── uwsgi-api.ini └── uwsgi-classic-api.ini ├── create_index.py ├── deploy ├── api │ ├── Chart.yaml │ ├── README.md │ ├── templates │ │ ├── 00-deployment.yml │ │ ├── 10-service.yml │ │ └── 20-ingress.yml │ └── values.yaml └── classic-api │ ├── Chart.yaml │ ├── README.md │ ├── templates │ ├── 00-deployment.yml │ ├── 10-service.yml │ └── 20-ingress.yml │ └── values.yaml ├── docker-compose.yml ├── docs ├── Makefile ├── source │ ├── TODO │ ├── _static │ │ └── diagrams │ │ │ ├── ng-search-containers.graphml │ │ │ ├── ng-search-containers.png │ │ │ ├── ng-search-context.graphml │ │ │ ├── ng-search-context.png │ │ │ ├── ng-search-indexing-agent-components.graphml │ │ │ ├── ng-search-indexing-agent-components.png │ │ │ ├── ng-search-service-components.graphml │ │ │ ├── ng-search-service-components.png │ │ │ └── ng-search-subsystems.png │ ├── api │ │ ├── modules.rst │ │ ├── search.agent.consumer.rst │ │ ├── search.agent.rst │ │ ├── search.agent.tests.rst │ │ ├── search.agent.tests.test_integration.rst │ │ ├── search.agent.tests.test_record_processor.rst │ │ ├── search.config.rst │ │ ├── search.context.rst │ │ ├── search.controllers.advanced.forms.rst │ │ ├── search.controllers.advanced.rst │ │ ├── search.controllers.advanced.tests.rst │ │ ├── search.controllers.api.rst │ │ ├── search.controllers.rst │ │ ├── search.controllers.simple.forms.rst │ │ ├── 
search.controllers.simple.rst │ │ ├── search.controllers.simple.tests.rst │ │ ├── search.controllers.tests.rst │ │ ├── search.controllers.util.rst │ │ ├── search.converters.rst │ │ ├── search.domain.advanced.rst │ │ ├── search.domain.api.rst │ │ ├── search.domain.base.rst │ │ ├── search.domain.rst │ │ ├── search.encode.rst │ │ ├── search.factory.rst │ │ ├── search.process.rst │ │ ├── search.process.tests.rst │ │ ├── search.process.transform.rst │ │ ├── search.routes.api.exceptions.rst │ │ ├── search.routes.api.rst │ │ ├── search.routes.api.serialize.rst │ │ ├── search.routes.api.tests.rst │ │ ├── search.routes.api.tests.test_api.rst │ │ ├── search.routes.api.tests.test_serialize.rst │ │ ├── search.routes.rst │ │ ├── search.routes.ui.rst │ │ ├── search.rst │ │ ├── search.services.fulltext.rst │ │ ├── search.services.index.advanced.rst │ │ ├── search.services.index.authors.rst │ │ ├── search.services.index.exceptions.rst │ │ ├── search.services.index.highlighting.rst │ │ ├── search.services.index.prepare.rst │ │ ├── search.services.index.results.rst │ │ ├── search.services.index.rst │ │ ├── search.services.index.simple.rst │ │ ├── search.services.index.tests.rst │ │ ├── search.services.index.tests.test_reindex.rst │ │ ├── search.services.index.tests.test_results.rst │ │ ├── search.services.index.tests.test_util.rst │ │ ├── search.services.index.tests.tests.rst │ │ ├── search.services.index.util.rst │ │ ├── search.services.metadata.rst │ │ ├── search.services.rst │ │ ├── search.services.tests.rst │ │ ├── search.services.tests.test_fulltext.rst │ │ ├── search.services.tests.test_metadata.rst │ │ ├── search.tests.rst │ │ ├── search.tests.test_advanced_search.rst │ │ └── search.tests.test_param_persistence.rst │ ├── architecture.rst │ ├── classic_api.rst │ ├── conf.py │ ├── index.rst │ ├── migration.rst │ ├── search_api.rst │ └── search_ui.rst └── ui │ ├── README.md │ ├── advanced-query-v1.pdf │ ├── advanced-query.sketch │ ├── arxiv-search-results1.png │ ├── screenshots 
│ ├── arxiv-search-advanced.png │ ├── arxiv-search-basic.png │ ├── arxiv-search-mobile-advanced.png │ ├── arxiv-search-mobile-basic.png │ ├── arxiv-search-mobile-results.png │ └── arxiv-search-results.png │ ├── search-prototype.sketch │ ├── search-wireframes.pdf │ ├── search-wireframes.sketch │ ├── search-workflows.pdf │ └── search-workflows.sketch ├── lintstats.sh ├── main.py ├── mappings └── DocumentMapping.json ├── mypy.ini ├── poetry.lock ├── pyproject.toml ├── recent_docs_updated.sql ├── reindex.py ├── schema ├── resources │ ├── AtomXML.yaml │ ├── Classification.json │ ├── ClassificationTerm.json │ ├── Document.json │ ├── DocumentMetadata.json │ ├── DocumentSet.json │ └── Person.json ├── search-xml.yaml └── search.yaml ├── search ├── __init__.py ├── config.py ├── consts.py ├── context.py ├── controllers │ ├── __init__.py │ ├── advanced │ │ ├── __init__.py │ │ ├── forms.py │ │ └── tests.py │ ├── api │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── tests_api_search.py │ ├── classic_api │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_classic_api_search.py │ ├── simple │ │ ├── __init__.py │ │ ├── forms.py │ │ └── tests.py │ ├── tests.py │ └── util.py ├── converters.py ├── domain │ ├── __init__.py │ ├── advanced.py │ ├── api.py │ ├── base.py │ ├── classic_api │ │ ├── __init__.py │ │ ├── classic_query.py │ │ ├── query_parser.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_classic_parser.py │ └── documents.py ├── encode.py ├── errors.py ├── factory.py ├── filters.py ├── process │ ├── __init__.py │ ├── tests.py │ └── transform.py ├── routes │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── exceptions.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_api.py │ ├── classic_api │ │ ├── __init__.py │ │ ├── exceptions.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_classic.py │ ├── consts.py │ ├── context_processors.py │ └── ui.py ├── serialize │ ├── __init__.py │ ├── atom.py │ ├── atom_extensions.py │ ├── 
base.py │ ├── json.py │ └── tests │ │ ├── __init__.py │ │ └── test_serialize.py ├── services │ ├── __init__.py │ ├── fulltext.py │ ├── index │ │ ├── __init__.py │ │ ├── advanced.py │ │ ├── api.py │ │ ├── authors.py │ │ ├── classic_api │ │ │ ├── __init__.py │ │ │ ├── classic_search.py │ │ │ ├── query_builder.py │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_query_builder.py │ │ ├── exceptions.py │ │ ├── highlighting.py │ │ ├── prepare.py │ │ ├── results.py │ │ ├── simple.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_reindex.py │ │ │ ├── test_results.py │ │ │ ├── test_util.py │ │ │ └── tests.py │ │ └── util.py │ ├── metadata.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_fulltext.py │ │ ├── test_metadata.py │ │ └── test_tex.py │ └── tex.py ├── static │ ├── css │ │ ├── bulma-tooltip.min.css │ │ ├── search.css │ │ └── search.css.map │ ├── js │ │ └── fieldset.js │ └── sass │ │ ├── bulma-tooltip.sass │ │ └── search.sass ├── templates │ └── search │ │ ├── advanced_search.html │ │ ├── base.html │ │ ├── search-macros.html │ │ └── search.html ├── tests │ ├── __init__.py │ ├── mocks.py │ ├── test_advanced_search.py │ ├── test_param_persistence.py │ └── test_searches.py └── utils │ ├── __init__.py │ ├── string.py │ ├── tests │ ├── __init__.py │ └── test_string.py │ └── timestamp.py ├── setup.cfg ├── tests ├── __init__.py ├── base_app_tests.py ├── data │ ├── 1106.1238v2.json │ ├── 1709.01849v1.indexable.json │ ├── docmeta.json │ ├── docmeta_bulk.json │ ├── examples │ │ ├── 0711.0418.json │ │ ├── 0711.0418v1.json │ │ ├── 1401.1012.json │ │ ├── 1403.6219.json │ │ ├── 1403.6219v1.json │ │ ├── 1404.3450.json │ │ ├── 1404.3450v1.json │ │ ├── 1404.3450v2.json │ │ ├── 1404.3450v3.json │ │ ├── 1408.6682.json │ │ ├── 1408.6682v1.json │ │ ├── 1509.08727.json │ │ ├── 1511.07473.json │ │ ├── 1511.07473v1.json │ │ ├── 1604.04228.json │ │ ├── 1607.05107.json │ │ ├── 1703.09067.json │ │ ├── 1708.07156.json │ │ ├── 1710.01597.json │ │ ├── 1712.04442.json │ │ ├── 
1712.04442v1.json │ │ └── 1712.04442v2.json │ ├── fulltext.json │ ├── sample.json │ └── to_index.json ├── examples │ ├── advanced_search.feature.example │ ├── author_search.feature.example │ ├── authors_combined_terms.example │ ├── ordering_pagination.feature.example │ └── simple_search.feature.example ├── integration │ ├── README.md │ └── __init__.py ├── stubs │ └── docmeta.py └── test_exceptions.py ├── update-docs.sh ├── upload_static_assets.py ├── uwsgi.ini ├── wsgi-api.py ├── wsgi-classic-api.py └── wsgi.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | api.py 4 | app.py 5 | audit.py 6 | classic_api.py 7 | setup.py 8 | docs/* 9 | *test* 10 | wsgi.py 11 | wsgi-api.py 12 | wsgi-classic-api.py 13 | wsgi-app.py 14 | populate_test_metadata.py 15 | upload_static_assets.py 16 | create_index.py 17 | reindex.py 18 | bulk_index.py 19 | shard_ids_for_index.py 20 | search/config.py 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. 
iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "develop" ] 9 | pull_request: 10 | branches: [ "develop" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10.9" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest poetry 30 | poetry install 31 | #- name: Lint with flake8 32 | # run: | 33 | # # stop the build if there are Python syntax errors or undefined names 34 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | poetry run pytest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/source/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # VS Code 101 | .vscode 102 | settings.json 103 | 104 | # PyCharm 105 | 
.idea 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | src/ 111 | temp/ 112 | .DS_Store 113 | 114 | to_index/ 115 | 116 | .pytest_cache/ 117 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | - repo: https://github.com/ambv/black 2 | rev: stable 3 | hooks: 4 | - id: black 5 | name: Format Python Code 6 | language: python 7 | entry: black 8 | args: 9 | - --safe 10 | - --line-length=79 11 | - --target-version=py37 12 | - . 13 | 14 | - repo: https://github.com/PyCQA/flake8 15 | rev: 3.7.9 16 | hooks: 17 | - id: flake8 18 | name: Flake8 Check 19 | language: python 20 | entry: flake8 21 | args: 22 | - search 23 | - tests 24 | 25 | - repo: https://github.com/pycqa/pydocstyle 26 | rev: master 27 | hooks: 28 | - id: pydocstyle 29 | name: Python Documentation Style Check 30 | language: python 31 | entry: pydocstyle 32 | args: 33 | - search 34 | - tests 35 | - --add-ignore=D401,D202 36 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # arxiv/search 2 | # 3 | # Defines the runtime for the arXiv search service, which provides the main 4 | # UIs (and, eventually, APIs) for search. 5 | 6 | FROM arxiv/base:0.16.7 7 | 8 | WORKDIR /opt/arxiv 9 | 10 | # remove conflicting mariadb-libs from arxiv/base 11 | RUN yum remove -y mariadb-libs 12 | 13 | # Install MySQL. 14 | RUN yum install -y which mysql mysql-devel 15 | RUN pip install uwsgi 16 | 17 | # Add Python application and configuration. 
18 | ADD app.py /opt/arxiv/ 19 | ADD Pipfile /opt/arxiv/ 20 | ADD Pipfile.lock /opt/arxiv/ 21 | RUN pip install -U pip pipenv 22 | RUN pipenv install 23 | 24 | ENV PATH "/opt/arxiv:${PATH}" 25 | 26 | ADD schema /opt/arxiv/schema 27 | ADD mappings /opt/arxiv/mappings 28 | ADD search /opt/arxiv/search 29 | ADD wsgi.py uwsgi.ini /opt/arxiv/ 30 | 31 | 32 | ADD bin/start_search.sh /opt/arxiv/ 33 | RUN chmod +x /opt/arxiv/start_search.sh 34 | 35 | ENV LC_ALL en_US.utf8 36 | ENV LANG en_US.utf8 37 | ENV LOGLEVEL 40 38 | ENV FLASK_DEBUG 1 39 | ENV FLASK_APP /opt/arxiv/app.py 40 | 41 | ENV ELASTICSEARCH_SERVICE_HOST 127.0.0.1 42 | ENV ELASTICSEARCH_SERVICE_PORT 9200 43 | ENV ELASTICSEARCH_SERVICE_PORT_9200_PROTO http 44 | ENV ELASTICSEARCH_PASSWORD changeme 45 | ENV METADATA_ENDPOINT https://arxiv.org/docmeta_bulk/ 46 | 47 | EXPOSE 8000 48 | ENTRYPOINT ["pipenv", "run"] 49 | CMD ["uwsgi", "--ini", "/opt/arxiv/uwsgi.ini"] 50 | -------------------------------------------------------------------------------- /Dockerfile-agent: -------------------------------------------------------------------------------- 1 | # arxiv/search-agent 2 | # 3 | # The indexing agent is responsible for updating the search index as new 4 | # article metadata becomes available. Subscribes to a Kinesis stream for 5 | # notifications about new metadata. 
6 | 7 | FROM arxiv/search:0.5.6 8 | 9 | WORKDIR /opt/arxiv 10 | 11 | ENV ELASTICSEARCH_SERVICE_HOST 127.0.0.1 12 | ENV ELASTICSEARCH_SERVICE_PORT 9200 13 | ENV ELASTICSEARCH_SERVICE_PORT_9200_PROTO http 14 | ENV ELASTICSEARCH_INDEX arxiv 15 | ENV ELASTICSEARCH_USER elastic 16 | ENV ELASTICSEARCH_PASSWORD changeme 17 | ENV METADATA_ENDPOINT https://arxiv.org/ 18 | ENV LOGLEVEL 20 19 | 20 | ENV AWS_ACCESS_KEY_ID "" 21 | ENV AWS_SECRET_ACCESS_KEY "" 22 | 23 | VOLUME /checkpoint 24 | 25 | ENV KINESIS_STREAM "MetadataIsAvailable" 26 | ENV KINESIS_SHARD_ID "0" 27 | ENV KINESIS_CHECKPOINT_VOLUME "/checkpoint" 28 | ENV KINESIS_START_TYPE "AT_TIMESTAMP" 29 | 30 | # Add this module again, so that it's tracked. 31 | ADD search/agent/ /opt/arxiv/search/agent/ 32 | ADD bin/start_agent.py /opt/arxiv/start_agent.py 33 | 34 | 35 | ENTRYPOINT ["pipenv", "run", "python3.6", "/opt/arxiv/start_agent.py"] 36 | -------------------------------------------------------------------------------- /Dockerfile-api: -------------------------------------------------------------------------------- 1 | # arxiv/search-api 2 | # 3 | # Defines the runtime for the arXiv search API, which provides a metadata 4 | # query API backed by Elasticsearch. 5 | 6 | FROM arxiv/base:0.16.6 7 | 8 | WORKDIR /opt/arxiv 9 | 10 | ENV PATH "/opt/arxiv:${PATH}" 11 | ENV LC_ALL en_US.utf8 12 | ENV LANG en_US.utf8 13 | ENV LOGLEVEL 40 14 | ENV PIPENV_VENV_IN_PROJECT 1 15 | ENV FLASK_DEBUG 1 16 | ENV FLASK_APP /opt/arxiv/api.py 17 | ENV ELASTICSEARCH_SERVICE_HOST 127.0.0.1 18 | ENV ELASTICSEARCH_SERVICE_PORT 9200 19 | ENV ELASTICSEARCH_SERVICE_PORT_9200_PROTO http 20 | ENV ELASTICSEARCH_INDEX arxiv 21 | ENV ELASTICSEARCH_USER elastic 22 | ENV ELASTICSEARCH_PASSWORD changeme 23 | ENV METADATA_ENDPOINT https://arxiv.org/docmeta_bulk/ 24 | 25 | # Install MySQL. 26 | # RUN yum install -y which mysql mysql-devel 27 | 28 | # Add Python application and configuration. 
29 | ADD Pipfile /opt/arxiv/ 30 | ADD Pipfile.lock /opt/arxiv/ 31 | RUN pip install -U pip pipenv 32 | RUN pipenv sync --dev 33 | ADD api.py /opt/arxiv/ 34 | ADD schema /opt/arxiv/schema 35 | ADD mappings /opt/arxiv/mappings 36 | ADD search /opt/arxiv/search 37 | ADD wsgi-api.py config/uwsgi-api.ini /opt/arxiv/ 38 | 39 | 40 | EXPOSE 8000 41 | 42 | ENTRYPOINT ["pipenv", "run"] 43 | CMD ["uwsgi", "--ini", "/opt/arxiv/uwsgi-api.ini"] 44 | -------------------------------------------------------------------------------- /Dockerfile-classic-api: -------------------------------------------------------------------------------- 1 | # arxiv/classic-api 2 | # 3 | # Defines the runtime for the arXiv classic API, which provides a metadata 4 | # query API backed by Elasticsearch. 5 | 6 | # File: Dockerfile-classic-api 7 | # Desc: arxiv search classic api 8 | # Use: 9 | # docker build --build-arg GIT_COMMIT=$(git rev-parse HEAD) \ 10 | # -t "arxiv/arxiv-search-classic-api" -f ./Dockerfile-classic-api . 
11 | # docker run -it --env-file=env -p 8080:8080 arxiv/arxiv-search-classic-api 12 | 13 | FROM python:3.10.9-buster 14 | 15 | ARG GIT_COMMIT 16 | 17 | ENV \ 18 | APP_HOME=/app \ 19 | ELASTICSEARCH_PASSWORD=changeme \ 20 | ELASTICSEARCH_SERVICE_HOST=127.0.0.1 \ 21 | ELASTICSEARCH_SERVICE_PORT=9200 \ 22 | ELASTICSEARCH_SERVICE_PORT_9200_PROTO=http \ 23 | GIT_COMMIT=$GIT_COMMIT \ 24 | METADATA_ENDPOINT=https://arxiv.org/docmeta_bulk/ \ 25 | PIP_DEFAULT_TIMEOUT=100 \ 26 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 27 | PIP_NO_CACHE_DIR=off \ 28 | PYTHONFAULTHANDLER=1 \ 29 | PYTHONHASHSEED=random \ 30 | PYTHONUNBUFFERED=1 \ 31 | TRACE=1 32 | 33 | WORKDIR $APP_HOME 34 | COPY poetry.lock pyproject.toml ./ 35 | COPY app.py wsgi.py uwsgi.ini ./ 36 | COPY schema ./schema 37 | COPY mappings ./mappings 38 | COPY search ./search 39 | RUN echo $GIT_COMMIT > ./git-commit.txt 40 | 41 | RUN pip install "gunicorn==20.1.0" "poetry" 42 | RUN poetry config virtualenvs.create false && \ 43 | poetry install --no-interaction --no-ansi 44 | 45 | EXPOSE 8080 46 | 47 | # See cicd/cloudbuild-master-pr.yaml for use in integration tests. 48 | ENV GUNICORN gunicorn --bind :8080 \ 49 | --workers 1 --threads 8 --timeout 0 "search.factory:create_classic_api_web_app()" 50 | 51 | CMD exec $GUNICORN 52 | 53 | -------------------------------------------------------------------------------- /Dockerfile-elasticsearch: -------------------------------------------------------------------------------- 1 | # arxiv/elasticsearch 2 | # 3 | # Runs Elasticsearch 6.2.4, with additional plugins. 4 | # 5 | # To run, use the ``docker-compose.yml`` config in this directory to spin up 6 | # alongside Kibana. Or: 7 | # 8 | # $ docker build . -t arxiv/elasticsearch -f ./Dockerfile-elasticsearch 9 | # $ docker run -it -p 9200:9200 -p 9300:9300 \ 10 | # > -e "http.host=0.0.0.0" -e "transport.host=127.0.0.1" \ 11 | # > arxiv/elasticsearch 12 | # 13 | # ES should be available on tcp://localhost:9200. 
14 | 15 | FROM docker.elastic.co/elasticsearch/elasticsearch:6.2.4 16 | 17 | # Install plugins. 18 | # 19 | # This adds the International Components for Unicode analyzer, which is not 20 | # bundled with ES by default. For more information, see 21 | # https://www.elastic.co/guide/en/elasticsearch/plugins/master/analysis-icu.html 22 | RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu 23 | -------------------------------------------------------------------------------- /Dockerfile-index: -------------------------------------------------------------------------------- 1 | # arxiv/search-index 2 | # 3 | # Runtime for bulk indexing of a large number of records. This is not used in 4 | # version 0.1, but is available for exploratory work in Kubernetes. 5 | # 6 | # Expects a path to a list of newline-delimited arXiv IDs (versionless). 7 | # 8 | # Example (local stack): 9 | # 10 | # $ mkdir /tmp/to_index 11 | # $ cp arxiv_id_dump.txt /tmp/to_index 12 | # $ docker run -it --network=arxivsearch_es_stack \ 13 | # > -v /tmp/to_index:/to_index \ 14 | # > -e ELASTICSEARCH_SERVICE_HOST=elasticsearch \ 15 | # > arxiv/search-index /to_index/arxiv_id_dump.txt 16 | # 17 | # See also ELASTICSEARCH_* and METADATA_ENDPOINT parameters, below. 
18 | 19 | FROM arxiv/search:0.5.5 20 | 21 | ENV PATH "/opt/arxiv:${PATH}" 22 | ADD bulk_index.py /opt/arxiv/ 23 | 24 | WORKDIR /opt/arxiv/ 25 | 26 | ENV LC_ALL en_US.utf8 27 | ENV LANG en_US.utf8 28 | ENV LOGLEVEL 40 29 | ENV FLASK_DEBUG 1 30 | ENV FLASK_APP /opt/arxiv/app.py 31 | 32 | ENV ELASTICSEARCH_SERVICE_HOST 127.0.0.1 33 | ENV ELASTICSEARCH_SERVICE_PORT 9200 34 | ENV ELASTICSEARCH_SERVICE_PORT_9200_PROTO http 35 | ENV ELASTICSEARCH_USER elastic 36 | ENV ELASTICSEARCH_PASSWORD changeme 37 | ENV METADATA_ENDPOINT https://arxiv.org/docmeta_bulk/ 38 | ENV METADATA_VERIFY_CERT True 39 | 40 | VOLUME /to_index 41 | 42 | ENTRYPOINT ["pipenv", "run", "python3.6", "bulk_index.py", "-l"] 43 | -------------------------------------------------------------------------------- /Dockerfile-kibana: -------------------------------------------------------------------------------- 1 | # arxiv/kibana 2 | # 3 | # Runs Kibana 6.2.4. This Dockerfile is not strictly necessary, but it is here 4 | # in case we want to run any additional plugins. 5 | # 6 | # As of version 0.1, this is here for local development purposes only and is 7 | # not deployed in production. 8 | # 9 | # To run, use the ``docker-compose.yml`` config in this directory to spin up 10 | # alongside Elasticsearch. 11 | 12 | 13 | FROM docker.elastic.co/kibana/kibana:6.2.4 14 | -------------------------------------------------------------------------------- /Dockerfile-ui: -------------------------------------------------------------------------------- 1 | # File: Dockerfile-ui 2 | # Desc: arxiv search ui 3 | # Use: 4 | # docker build --build-arg GIT_COMMIT=$(git rev-parse HEAD) \ 5 | # -t "arxiv/arxiv-search" -f ./Dockerfile-ui . 
6 | # docker run -it --env-file=env -p 8000:8000 arxiv/arxiv-search 7 | 8 | FROM python:3.10.9-buster 9 | 10 | ARG GIT_COMMIT 11 | 12 | ENV \ 13 | APP_HOME=/app \ 14 | ELASTICSEARCH_PASSWORD=changeme \ 15 | ELASTICSEARCH_SERVICE_HOST=127.0.0.1 \ 16 | ELASTICSEARCH_SERVICE_PORT=9200 \ 17 | ELASTICSEARCH_SERVICE_PORT_9200_PROTO=http \ 18 | GIT_COMMIT=$GIT_COMMIT \ 19 | METADATA_ENDPOINT=https://arxiv.org/docmeta_bulk/ \ 20 | PIP_DEFAULT_TIMEOUT=100 \ 21 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 22 | PIP_NO_CACHE_DIR=off \ 23 | PYTHONFAULTHANDLER=1 \ 24 | PYTHONHASHSEED=random \ 25 | PYTHONUNBUFFERED=1 \ 26 | TRACE=1 27 | 28 | WORKDIR $APP_HOME 29 | COPY poetry.lock pyproject.toml ./ 30 | COPY app.py wsgi.py uwsgi.ini ./ 31 | COPY schema ./schema 32 | COPY mappings ./mappings 33 | COPY search ./search 34 | RUN echo $GIT_COMMIT > ./git-commit.txt 35 | 36 | RUN pip install "gunicorn==20.1.0" "poetry" 37 | RUN poetry config virtualenvs.create false && \ 38 | poetry install --no-interaction --no-ansi 39 | 40 | EXPOSE 8000 41 | 42 | # See cicd/cloudbuild-master-pr.yaml for use in integration tests. 
43 | ENV GUNICORN gunicorn --bind :8000 \ 44 | --workers 1 --threads 8 --timeout 0 wsgi 45 | 46 | CMD exec $GUNICORN 47 | 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Cornell University Library 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default help index index-test run-classic-api test check format 2 | .DEFAULT_GOAL := help 3 | SHELL := /bin/bash 4 | PROJECT := feed 5 | 6 | .EXPORT_ALL_VARIABLES: 7 | PIPENV_VERBOSITY = -1 8 | 9 | 10 | help: ## Show help. 
11 | @grep -E '^[a-zA-Z2_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 12 | 13 | 14 | # Index 15 | 16 | index: ## Create and populate elasticsearch index. 17 | @FLASK_APP=app.py pipenv run python create_index.py 18 | @FLASK_APP=app.py pipenv run python bulk_index.py 19 | 20 | 21 | index-test: ## Test if the index is created. 22 | @curl http://127.0.0.1:9200/arxiv/_search 2> /dev/null | jq '.hits.hits[]._source | {id: .id, title: .title, arxiv: .primary_classification.category.id}' 23 | 24 | # Services 25 | 26 | run-classic-api: ## Run classic feed server in development mode. 27 | @FLASK_APP=classic_api.py pipenv run flask run 28 | 29 | 30 | # Utilities 31 | 32 | test: ## Run tests and coverage checks. 33 | @pipenv run nose2 -vvv tests.base_app_tests 34 | @pipenv run nose2 -vvv --with-coverage 35 | 36 | 37 | check: ## Run code checks. 38 | @bash lintstats.sh 39 | 40 | 41 | format: ## Format the code. 42 | @pipenv run black --safe --target-version=py37 --line-length=79 "$(PROJECT)" 43 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | #arxiv-auth = ">=1.0.0rc1" #"*" #"==0.2.7" 8 | arxiv-base = {editable = true, ref = "ed3feece", git = "https://github.com/arXiv/arxiv-base.git"} #"*" #">1.0.0a2" #"==0.17.4.post2" 9 | bleach = "*" 10 | boto = "*" #"==2.48.0" 11 | boto3 = "*" #">=1.0.0,<2.0.0" #"==1.6.6" 12 | botocore = "*" #"==1.9.6" 13 | certifi = "*" #"==2017.7.27.1" 14 | chardet = "*" #"==3.0.4" 15 | click = "*" #"==6.7" 16 | dataclasses = "*" #"==0.4" 17 | docutils = "*" #"==0.14" 18 | elasticsearch = "==6.3.0" 19 | elasticsearch-dsl = "==6.4.0" 20 | feedgen = "*" #"==0.9.0" 21 | flask = ">=2.2,<3.0" #"==1.0.4" 22 | flask-s3 = "*" #"==0.3.3" 23 | idna = 
"*" #"==2.6" 24 | ipaddress = "*" #"==1.0.19" 25 | itsdangerous = "*" #"==0.24" 26 | jinja2 = "*" #"<3.0" #"*" #">=3.0" #"==2.11.3" 27 | jmespath = "*" #"==0.9.3" 28 | jsonschema = "*" #"==2.6.0" 29 | lark-parser = "*" #"==0.8.1" 30 | lxml = "*" #"==4.6.3" 31 | markupsafe = "*" #"==1.1.1" 32 | mccabe = "*" #"==0.6.1" 33 | mypy-extensions = "*" 34 | pbr = "*" #"==3.1.1" 35 | psutil = "*" #"==5.6.6" 36 | pyjwt = "*" #"==1.7.1" 37 | pylama = "*" #"==7.4.3" 38 | python-dateutil = "*" #"==2.6.1" 39 | pytz = "*" #"==2017.3" 40 | requests = "*" #">=2.21.0" 41 | retry = "*" 42 | s3transfer = "*" #"==0.1.13" 43 | snowballstemmer = "*" #"==1.2.1" 44 | thrift = "*" #"==0.11.0" 45 | thrift-connector = "*" #"==0.23" 46 | urllib3 = "*" #">=1.23" 47 | werkzeug = "*" #"~=0.14" 48 | wtforms = "==2.1" 49 | 50 | [dev-packages] 51 | pycodestyle = "*" 52 | pydocstyle = ">=2.1.1" 53 | mock = "==2.0.0" 54 | mypy = "==0.720" 55 | mypy-extensions = "*" 56 | "nose2" = "==0.7.3" 57 | coveralls = "*" 58 | sphinx = "*" 59 | sphinxcontrib-websupport = "*" 60 | sphinx-autodoc-typehints = "*" 61 | pylint = "*" 62 | pytest = "*" 63 | nose = "*" 64 | pre-commit = "==2.0.1" 65 | coverage = "==4.4.2" 66 | 67 | [requires] 68 | python_version = "3.10" 69 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | ## Version 0.5 - The cross-list release. 
def start_agent() -> None:
    """Run the stream processor inside a Flask application context.

    Builds the UI web application, enters its application context, and calls
    ``process_stream`` while that context is active.
    """
    # NOTE(review): ``process_stream`` comes from ``search.agent``, a package
    # that is not present in the repository tree listing -- confirm it still
    # exists before relying on this entry point.
    web_app = create_ui_web_app()
    context = web_app.app_context()
    with context:
        process_stream()
/bin/start_search.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | /usr/bin/uwsgi -H $(pipenv --venv) "$@" 5 | -------------------------------------------------------------------------------- /classic-api.py: -------------------------------------------------------------------------------- 1 | """Provides application for development purposes.""" 2 | 3 | from search.factory import create_classic_api_web_app 4 | 5 | app = create_classic_api_web_app() 6 | -------------------------------------------------------------------------------- /classic_api.py: -------------------------------------------------------------------------------- 1 | """Provides application for development purposes.""" 2 | 3 | from search.factory import create_classic_api_web_app 4 | 5 | app = create_classic_api_web_app() 6 | -------------------------------------------------------------------------------- /config/uwsgi-api.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | http-socket = :8000 3 | chdir = /opt/arxiv/ 4 | wsgi-file = wsgi-api.py 5 | callable = application 6 | master = true 7 | harakiri = 3000 8 | manage-script-name = true 9 | processes = 1 10 | queue = 0 11 | threads = 1 12 | single-interpreter = true 13 | mount = /=wsgi-api.py 14 | mount = $(APPLICATION_ROOT)=wsgi-api.py 15 | logformat = "%(addr) %(addr) - %(user_id)|%(session_id) [%(rtime)] [%(uagent)] \"%(method) %(uri) %(proto)\" %(status) %(size) %(micros) %(ttfb)" 16 | -------------------------------------------------------------------------------- /config/uwsgi-classic-api.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | http-socket = :8000 3 | chdir = /opt/arxiv/ 4 | wsgi-file = wsgi-classic-api.py 5 | callable = application 6 | master = true 7 | harakiri = 3000 8 | manage-script-name = true 9 | processes = 8 10 | queue = 0 11 | threads = 1 12 | 
single-interpreter = true 13 | mount = $(APPLICATION_ROOT)=wsgi-classic-api.py 14 | buffer-size = 65535 15 | logformat = "%(addr) %(addr) - %(user_id)|%(session_id) [%(rtime)] [%(uagent)] \"%(method) %(uri) %(proto)\" %(status) %(size) %(micros) %(ttfb)" 16 | -------------------------------------------------------------------------------- /create_index.py: -------------------------------------------------------------------------------- 1 | """Use this to initialize the search index for testing.""" 2 | 3 | 4 | from search.factory import create_ui_web_app 5 | from search.services import index 6 | 7 | app = create_ui_web_app() 8 | app.app_context().push() 9 | 10 | 11 | @app.cli.command() 12 | def create_index(): 13 | """Initialize the search index.""" 14 | index.SearchSession.create_index() 15 | 16 | 17 | if __name__ == "__main__": 18 | create_index() 19 | -------------------------------------------------------------------------------- /deploy/api/Chart.yaml: -------------------------------------------------------------------------------- 1 | name: search-api 2 | version: 0.1.3 3 | appVersion: 0.1.1 4 | description: ES-backed search API for arXiv-NG. 
5 | sources: 6 | - https://github.com/arxiv/arxiv-search 7 | engine: gotpl 8 | -------------------------------------------------------------------------------- /deploy/api/README.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions for search-api 2 | 3 | To install `search-api` to the development namespace in the Kubernetes cluster: 4 | 5 | ```bash 6 | helm install ./ --set=image.tag=some_tag \ 7 | --tiller-namespace=development --namespace=development \ 8 | --set=ingress.host=development.arxiv.org \ 9 | --set=elasticsearch.host=foo.es.amazonaws.com \ 10 | --set=elasticsearch.index=arxiv0.3 \ 11 | --set=scaling.replicas=2 12 | ``` 13 | 14 | 15 | Notes: 16 | - `image.tag`: this refers to the image tag published on [Docker Hub](https://hub.docker.com/repository/docker/arxiv/search-api) 17 | - `elasticsearch.host`: this is the hostname of our Elasticsearch endpoint. We have provisioned it in AWS. 18 | - `elasticsearch.index`: this is the index identifier. As of this writing, `arxiv0.3` is the index associated with development. 
19 | -------------------------------------------------------------------------------- /deploy/api/templates/00-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: "{{ default "search-api" .Values.name }}" 5 | namespace: "{{ .Values.namespace }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "search-api" .Values.deployment.name }}" 9 | service-group: api 10 | log-style: uwsgi 11 | env: "{{ .Values.namespace }}" 12 | spec: 13 | replicas: {{ default 1 .Values.scaling.replicas }} 14 | template: 15 | metadata: 16 | labels: 17 | subsystem: "{{ .Values.labels.subsystem }}" 18 | container: "{{ default "search-api" .Values.deployment.name }}" 19 | service-group: api 20 | log-style: uwsgi 21 | # annotations: 22 | # prometheus.io/scrape: 'true' 23 | spec: 24 | containers: 25 | - name: "{{ default "search-api" .Values.deployment.name }}" 26 | image: arxiv/search-api:{{ .Values.image.tag }} 27 | imagePullPolicy: Always 28 | ports: 29 | - containerPort: 8000 30 | env: 31 | - name: APPLICATION_ROOT 32 | value: "{{ .Values.ingress.path }}" 33 | - name: BASE_SERVER 34 | value: "{{ .Values.base_server }}" 35 | - name: NAMESPACE 36 | value: "{{ .Values.namespace }}" 37 | - name: LOGLEVEL 38 | value: "{{ default "40" .Values.loglevel }}" 39 | - name: ELASTICSEARCH_SERVICE_HOST 40 | value: "{{ default "elasticsearch" .Values.elasticsearch.host }}" 41 | - name: ELASTICSEARCH_SERVICE_PORT 42 | value: "{{ default "9200" .Values.elasticsearch.port }}" 43 | - name: ELASTICSEARCH_SERVICE_PORT_{{ default "9200" .Values.elasticsearch.port }}_PROTO 44 | value: "{{ default "http" .Values.elasticsearch.proto }}" 45 | - name: ELASTICSEARCH_INDEX 46 | value: "{{ default "arxiv" .Values.elasticsearch.index }}" 47 | - name: ELASTICSEARCH_USER 48 | value: "{{ default "" .Values.elasticsearch.user }}" 49 | - name: ELASTICSEARCH_PASSWORD 50 | 
value: "{{ default "" .Values.elasticsearch.password }}" 51 | - name: ELASTICSEARCH_VERIFY 52 | value: "{{ default "false" .Values.elasticsearch.verify }}" 53 | - name: JWT_SECRET 54 | valueFrom: 55 | secretKeyRef: 56 | name: jwt 57 | key: secret 58 | resources: 59 | limits: 60 | cpu: 300m 61 | memory: 256Mi 62 | requests: 63 | cpu: 100m 64 | memory: 128Mi 65 | -------------------------------------------------------------------------------- /deploy/api/templates/10-service.yml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | metadata: 3 | annotations: 4 | prometheus.io/scrape: 'true' 5 | name: "{{ default "search-api" .Values.name }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "search-api" .Values.deployment.name }}" 9 | service-group: api 10 | log-style: uwsgi 11 | env: "{{ .Values.namespace }}" 12 | spec: 13 | type: NodePort 14 | ports: 15 | - port: 80 16 | targetPort: 8000 17 | selector: 18 | subsystem: "{{ .Values.labels.subsystem }}" 19 | container: "{{ default "search-api" .Values.name }}" 20 | -------------------------------------------------------------------------------- /deploy/api/templates/20-ingress.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Ingress 3 | metadata: 4 | name: "{{ default "search-api-ingress" .Values.ingress.name }}" 5 | namespace: "{{ .Values.namespace }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "search-api" .Values.deployment.name }}" 9 | service-group: api 10 | # annotations: 11 | # ingress.kubernetes.io/configuration-snippet: | 12 | # more_set_headers "Request-Id: $req_id"; 13 | # ingress.kubernetes.io/auth-url: http://{{ .Values.authenticatorService }}.{{ .Values.namespace }}.svc.cluster.local/auth 14 | # ingress.kubernetes.io/auth-response-headers: Authorization 15 | # 
ingress.kubernetes.io/limit-connections: "4" 16 | # ingress.kubernetes.io/limit-rps: "16" 17 | # ingress.kubernetes.io/rewrite-target: / 18 | spec: 19 | tls: # This will use the default certificate for the ingress controller. 20 | - hosts: 21 | - "{{ .Values.ingress.host }}" 22 | rules: 23 | - host: "{{ .Values.ingress.host }}" 24 | http: 25 | paths: 26 | - path: "{{ default "/metadata" .Values.ingress.path }}" 27 | backend: 28 | serviceName: "{{ default "search-api" .Values.name }}" 29 | servicePort: 80 30 | -------------------------------------------------------------------------------- /deploy/api/values.yaml: -------------------------------------------------------------------------------- 1 | name: search-api 2 | namespace: development 3 | loglevel: 40 4 | 5 | image: 6 | tag: "0.1-noauth" 7 | 8 | scaling: 9 | replicas: 1 10 | 11 | elasticsearch: 12 | es_cluster_name: arxiv 13 | host: changeme 14 | port: "443" 15 | proto: "https" 16 | user: "" 17 | password: "" 18 | verify: "true" 19 | index: "arxiv0.3" # currently used for development 20 | 21 | labels: 22 | subsystem: api-gateway 23 | 24 | authenticatorService: authenticator 25 | 26 | deployment: 27 | name: search-api 28 | 29 | ingress: 30 | name: search-api 31 | host: "development.arxiv.org" 32 | path: /metadata 33 | 34 | base_server: "arxiv.org" 35 | -------------------------------------------------------------------------------- /deploy/classic-api/Chart.yaml: -------------------------------------------------------------------------------- 1 | name: classic-api 2 | version: 0.1 3 | appVersion: 0.1 4 | description: ES-backed classic API. 
5 | sources: 6 | - https://github.com/arxiv/arxiv-search 7 | engine: gotpl 8 | -------------------------------------------------------------------------------- /deploy/classic-api/README.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions for classic-api 2 | 3 | To install `classic-api` to the development namespace in the Kubernetes cluster: 4 | 5 | ```bash 6 | helm install ./ --name=classic-api --set=image.tag=some_tag \ 7 | --tiller-namespace=development --namespace=development \ 8 | --set=ingress.host=development.arxiv.org \ 9 | --set=elasticsearch.host=foo.es.amazonaws.com \ 10 | --set=elasticsearch.index=arxiv0.3 11 | ``` 12 | 13 | To delete the `classic-api` release, including the pod(s) associated with it, run: 14 | 15 | ```bash 16 | helm del --purge classic-api --tiller-namespace=development 17 | ``` 18 | 19 | Notes: 20 | - `image.tag`: this refers to the image tag published on [Docker Hub](https://hub.docker.com/repository/docker/arxiv/classic-api) 21 | - `elasticsearch.host`: this is the hostname of our Elasticsearch endpoint. We have provisioned it in AWS. 22 | - `elasticsearch.index`: this is the index identifier. As of this writing, `arxiv0.3` is the index associated with development. 
23 | -------------------------------------------------------------------------------- /deploy/classic-api/templates/00-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: "{{ default "classic-api" .Values.name }}" 5 | namespace: "{{ .Values.namespace }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "classic-api" .Values.deployment.name }}" 9 | service-group: api 10 | log-style: uwsgi 11 | env: "{{ .Values.namespace }}" 12 | spec: 13 | replicas: {{ default 1 .Values.scaling.replicas }} 14 | template: 15 | metadata: 16 | labels: 17 | subsystem: "{{ .Values.labels.subsystem }}" 18 | container: "{{ default "classic-api" .Values.deployment.name }}" 19 | service-group: api 20 | log-style: uwsgi 21 | spec: 22 | containers: 23 | - name: "{{ default "classic-api" .Values.deployment.name }}" 24 | image: arxiv/classic-api:{{ .Values.image.tag }} 25 | imagePullPolicy: Always 26 | ports: 27 | - containerPort: 8000 28 | env: 29 | - name: APPLICATION_ROOT 30 | value: "{{ .Values.ingress.path }}" 31 | - name: BASE_SERVER 32 | value: "{{ .Values.base_server }}" 33 | - name: NAMESPACE 34 | value: "{{ .Values.namespace }}" 35 | - name: LOGLEVEL 36 | value: "{{ default "40" .Values.loglevel }}" 37 | - name: ELASTICSEARCH_SERVICE_HOST 38 | value: "{{ default "elasticsearch" .Values.elasticsearch.host }}" 39 | - name: ELASTICSEARCH_SERVICE_PORT 40 | value: "{{ default "9200" .Values.elasticsearch.port }}" 41 | - name: ELASTICSEARCH_SERVICE_PORT_{{ default "9200" .Values.elasticsearch.port }}_PROTO 42 | value: "{{ default "http" .Values.elasticsearch.proto }}" 43 | - name: ELASTICSEARCH_INDEX 44 | value: "{{ default "arxiv" .Values.elasticsearch.index }}" 45 | - name: ELASTICSEARCH_USER 46 | value: "{{ default "" .Values.elasticsearch.user }}" 47 | - name: ELASTICSEARCH_PASSWORD 48 | value: "{{ default "" 
.Values.elasticsearch.password }}" 49 | - name: ELASTICSEARCH_VERIFY 50 | value: "{{ default "false" .Values.elasticsearch.verify }}" 51 | resources: 52 | limits: 53 | cpu: 300m 54 | memory: 256Mi 55 | requests: 56 | cpu: 100m 57 | memory: 128Mi 58 | -------------------------------------------------------------------------------- /deploy/classic-api/templates/10-service.yml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | metadata: 3 | # annotations: 4 | # prometheus.io/scrape: 'true' 5 | name: "{{ default "classic-api" .Values.name }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "classic-api" .Values.deployment.name }}" 9 | service-group: api 10 | log-style: uwsgi 11 | env: "{{ .Values.namespace }}" 12 | spec: 13 | type: NodePort 14 | ports: 15 | - port: 80 16 | targetPort: 8000 17 | selector: 18 | subsystem: "{{ .Values.labels.subsystem }}" 19 | container: "{{ default "classic-api" .Values.name }}" 20 | -------------------------------------------------------------------------------- /deploy/classic-api/templates/20-ingress.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Ingress 3 | metadata: 4 | name: "{{ default "classic-api-ingress" .Values.ingress.name }}" 5 | namespace: "{{ .Values.namespace }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "classic-api" .Values.deployment.name }}" 9 | service-group: api 10 | env: "{{ .Values.namespace }}" 11 | # annotations: 12 | # ingress.kubernetes.io/configuration-snippet: | 13 | # more_set_headers "Request-Id: $req_id"; 14 | # ingress.kubernetes.io/auth-url: http://{{ .Values.authenticatorService }}.{{ .Values.namespace }}.svc.cluster.local/auth 15 | # ingress.kubernetes.io/auth-response-headers: Authorization 16 | # ingress.kubernetes.io/limit-connections: "4" 17 | # ingress.kubernetes.io/limit-rps: 
"16" 18 | # ingress.kubernetes.io/rewrite-target: / 19 | spec: 20 | tls: # This will use the default certificate for the ingress controller. 21 | - hosts: 22 | - "{{ .Values.ingress.host }}" 23 | rules: 24 | - host: "{{ .Values.ingress.host }}" 25 | http: 26 | paths: 27 | - path: "{{ default "/classic_api" .Values.ingress.path }}" 28 | backend: 29 | serviceName: "{{ default "classic-api" .Values.name }}" 30 | servicePort: 80 31 | -------------------------------------------------------------------------------- /deploy/classic-api/values.yaml: -------------------------------------------------------------------------------- 1 | name: classic-api 2 | namespace: development 3 | loglevel: 40 4 | 5 | image: 6 | tag: "latest" 7 | 8 | scaling: 9 | replicas: 1 10 | 11 | elasticsearch: 12 | es_cluster_name: arxiv 13 | host: changeme 14 | port: "443" 15 | proto: "https" 16 | user: "" 17 | password: "" 18 | verify: "true" 19 | index: "arxiv0.3" # currently used for development 20 | 21 | labels: 22 | subsystem: api-gateway 23 | 24 | deployment: 25 | name: classic-api 26 | 27 | ingress: 28 | name: classic-api 29 | host: changeme 30 | path: /classic_api 31 | 32 | base_server: "development.arxiv.org" 33 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Provides the Elasticsearch + Kibana stack for local development. See 2 | # README.md for instructions on use. 3 | --- 4 | version: '3' 5 | services: 6 | elasticsearch: 7 | build: 8 | context: . 9 | dockerfile: Dockerfile-elasticsearch 10 | container_name: elasticsearch 11 | # environment: ['http.host=0.0.0.0', 'transport.host=127.0.0.1', 'ELASTIC_PASSWORD=${ELASTIC_PASSWORD}'] 12 | environment: ['http.host=0.0.0.0', 'transport.host=127.0.0.1'] 13 | ports: ['127.0.0.1:9200:9200', '127.0.0.1:9300:9300'] 14 | networks: ['es_stack'] 15 | 16 | kibana: 17 | build: 18 | context: . 
19 | dockerfile: Dockerfile-kibana 20 | container_name: kibana 21 | # environment: 22 | # - ELASTICSEARCH_USERNAME=kibana 23 | # - ELASTICSEARCH_PASSWORD=${ELASTIC_PASSWORD} 24 | ports: ['127.0.0.1:5601:5601'] 25 | networks: ['es_stack'] 26 | depends_on: ['elasticsearch'] 27 | 28 | agent: 29 | build: 30 | context: . 31 | dockerfile: Dockerfile-agent 32 | container_name: agent 33 | environment: 34 | AWS_ACCESS_KEY_ID: "foo" 35 | AWS_SECRET_ACCESS_KEY: "bar" 36 | ELASTICSEARCH_SERVICE_HOST: "elasticsearch" 37 | ELASTICSEARCH_SERVICE_PORT: "9200" 38 | ELASTICSEARCH_SERVICE_PORT_9200_PROTO: "http" 39 | ELASTICSEARCH_USER: "elastic" 40 | ELASTICSEARCH_PASSWORD: "changeme" 41 | ELASTICSEARCH_VERIFY: "false" 42 | KINESIS_STREAM: "MetadataIsAvailable" 43 | KINESIS_SHARD_ID: "0" 44 | KINESIS_ENDPOINT: "https://localstack:4568" 45 | KINESIS_VERIFY: "false" 46 | KINESIS_START_TYPE: "TRIM_HORIZON" 47 | LOGLEVEL: 20 48 | networks: 49 | - es_stack 50 | depends_on: 51 | - localstack 52 | - elasticsearch 53 | 54 | localstack: 55 | image: atlassianlabs/localstack 56 | container_name: localstack 57 | networks: 58 | - es_stack 59 | ports: 60 | - "5568:4568" 61 | environment: 62 | USE_SSL: 'true' 63 | DEBUG: 'true' 64 | 65 | networks: {es_stack: {}} 66 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = arXivZero 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/source/TODO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/TODO -------------------------------------------------------------------------------- /docs/source/_static/diagrams/ng-search-containers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/_static/diagrams/ng-search-containers.png -------------------------------------------------------------------------------- /docs/source/_static/diagrams/ng-search-context.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/_static/diagrams/ng-search-context.png -------------------------------------------------------------------------------- /docs/source/_static/diagrams/ng-search-indexing-agent-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/_static/diagrams/ng-search-indexing-agent-components.png -------------------------------------------------------------------------------- /docs/source/_static/diagrams/ng-search-service-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/_static/diagrams/ng-search-service-components.png 
-------------------------------------------------------------------------------- /docs/source/_static/diagrams/ng-search-subsystems.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/source/_static/diagrams/ng-search-subsystems.png -------------------------------------------------------------------------------- /docs/source/api/modules.rst: -------------------------------------------------------------------------------- 1 | search 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | search 8 | -------------------------------------------------------------------------------- /docs/source/api/search.agent.consumer.rst: -------------------------------------------------------------------------------- 1 | search.agent.consumer module 2 | ============================ 3 | 4 | .. automodule:: search.agent.consumer 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.agent.rst: -------------------------------------------------------------------------------- 1 | search.agent package 2 | ==================== 3 | 4 | .. automodule:: search.agent 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.agent.tests 15 | 16 | Submodules 17 | ---------- 18 | 19 | .. toctree:: 20 | 21 | search.agent.consumer 22 | 23 | -------------------------------------------------------------------------------- /docs/source/api/search.agent.tests.rst: -------------------------------------------------------------------------------- 1 | search.agent.tests package 2 | ========================== 3 | 4 | .. automodule:: search.agent.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. 
toctree:: 13 | 14 | search.agent.tests.test_integration 15 | search.agent.tests.test_record_processor 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.agent.tests.test_integration.rst: -------------------------------------------------------------------------------- 1 | search.agent.tests.test\_integration module 2 | =========================================== 3 | 4 | .. automodule:: search.agent.tests.test_integration 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.agent.tests.test_record_processor.rst: -------------------------------------------------------------------------------- 1 | search.agent.tests.test\_record\_processor module 2 | ================================================= 3 | 4 | .. automodule:: search.agent.tests.test_record_processor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.config.rst: -------------------------------------------------------------------------------- 1 | search.config module 2 | ==================== 3 | 4 | .. automodule:: search.config 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.context.rst: -------------------------------------------------------------------------------- 1 | search.context module 2 | ===================== 3 | 4 | .. 
automodule:: search.context 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.advanced.forms.rst: -------------------------------------------------------------------------------- 1 | search.controllers.advanced.forms module 2 | ======================================== 3 | 4 | .. automodule:: search.controllers.advanced.forms 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.advanced.rst: -------------------------------------------------------------------------------- 1 | search.controllers.advanced package 2 | =================================== 3 | 4 | .. automodule:: search.controllers.advanced 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.controllers.advanced.forms 15 | search.controllers.advanced.tests 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.advanced.tests.rst: -------------------------------------------------------------------------------- 1 | search.controllers.advanced.tests module 2 | ======================================== 3 | 4 | .. automodule:: search.controllers.advanced.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.api.rst: -------------------------------------------------------------------------------- 1 | search.controllers.api package 2 | ============================== 3 | 4 | .. 
automodule:: search.controllers.api 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.rst: -------------------------------------------------------------------------------- 1 | search.controllers package 2 | ========================== 3 | 4 | .. automodule:: search.controllers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.controllers.advanced 15 | search.controllers.api 16 | search.controllers.simple 17 | 18 | Submodules 19 | ---------- 20 | 21 | .. toctree:: 22 | 23 | search.controllers.tests 24 | search.controllers.util 25 | 26 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.simple.forms.rst: -------------------------------------------------------------------------------- 1 | search.controllers.simple.forms module 2 | ====================================== 3 | 4 | .. automodule:: search.controllers.simple.forms 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.simple.rst: -------------------------------------------------------------------------------- 1 | search.controllers.simple package 2 | ================================= 3 | 4 | .. automodule:: search.controllers.simple 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. 
toctree:: 13 | 14 | search.controllers.simple.forms 15 | search.controllers.simple.tests 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.simple.tests.rst: -------------------------------------------------------------------------------- 1 | search.controllers.simple.tests module 2 | ====================================== 3 | 4 | .. automodule:: search.controllers.simple.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.tests.rst: -------------------------------------------------------------------------------- 1 | search.controllers.tests module 2 | =============================== 3 | 4 | .. automodule:: search.controllers.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.controllers.util.rst: -------------------------------------------------------------------------------- 1 | search.controllers.util module 2 | ============================== 3 | 4 | .. automodule:: search.controllers.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.converters.rst: -------------------------------------------------------------------------------- 1 | search.converters module 2 | ======================== 3 | 4 | .. automodule:: search.converters 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.domain.advanced.rst: -------------------------------------------------------------------------------- 1 | search.domain.advanced module 2 | ============================= 3 | 4 | .. 
automodule:: search.domain.advanced 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.domain.api.rst: -------------------------------------------------------------------------------- 1 | search.domain.api module 2 | ======================== 3 | 4 | .. automodule:: search.domain.api 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.domain.base.rst: -------------------------------------------------------------------------------- 1 | search.domain.base module 2 | ========================= 3 | 4 | .. automodule:: search.domain.base 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.domain.rst: -------------------------------------------------------------------------------- 1 | search.domain package 2 | ===================== 3 | 4 | .. automodule:: search.domain 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.domain.advanced 15 | search.domain.api 16 | search.domain.base 17 | 18 | -------------------------------------------------------------------------------- /docs/source/api/search.encode.rst: -------------------------------------------------------------------------------- 1 | search.encode module 2 | ==================== 3 | 4 | .. automodule:: search.encode 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.factory.rst: -------------------------------------------------------------------------------- 1 | search.factory module 2 | ===================== 3 | 4 | .. 
automodule:: search.factory 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.process.rst: -------------------------------------------------------------------------------- 1 | search.process package 2 | ====================== 3 | 4 | .. automodule:: search.process 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.process.tests 15 | search.process.transform 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.process.tests.rst: -------------------------------------------------------------------------------- 1 | search.process.tests module 2 | =========================== 3 | 4 | .. automodule:: search.process.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.process.transform.rst: -------------------------------------------------------------------------------- 1 | search.process.transform module 2 | =============================== 3 | 4 | .. automodule:: search.process.transform 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.exceptions.rst: -------------------------------------------------------------------------------- 1 | search.routes.api.exceptions module 2 | =================================== 3 | 4 | .. 
automodule:: search.routes.api.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.rst: -------------------------------------------------------------------------------- 1 | search.routes.api package 2 | ========================= 3 | 4 | .. automodule:: search.routes.api 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.routes.api.tests 15 | 16 | Submodules 17 | ---------- 18 | 19 | .. toctree:: 20 | 21 | search.routes.api.exceptions 22 | search.routes.api.serialize 23 | 24 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.serialize.rst: -------------------------------------------------------------------------------- 1 | search.routes.api.serialize module 2 | ================================== 3 | 4 | .. automodule:: search.routes.api.serialize 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.tests.rst: -------------------------------------------------------------------------------- 1 | search.routes.api.tests package 2 | =============================== 3 | 4 | .. automodule:: search.routes.api.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.routes.api.tests.test_api 15 | search.routes.api.tests.test_serialize 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.tests.test_api.rst: -------------------------------------------------------------------------------- 1 | search.routes.api.tests.test\_api module 2 | ======================================== 3 | 4 | .. 
automodule:: search.routes.api.tests.test_api 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.api.tests.test_serialize.rst: -------------------------------------------------------------------------------- 1 | search.routes.api.tests.test\_serialize module 2 | ============================================== 3 | 4 | .. automodule:: search.routes.api.tests.test_serialize 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.rst: -------------------------------------------------------------------------------- 1 | search.routes package 2 | ===================== 3 | 4 | .. automodule:: search.routes 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.routes.api 15 | 16 | Submodules 17 | ---------- 18 | 19 | .. toctree:: 20 | 21 | search.routes.ui 22 | 23 | -------------------------------------------------------------------------------- /docs/source/api/search.routes.ui.rst: -------------------------------------------------------------------------------- 1 | search.routes.ui module 2 | ======================= 3 | 4 | .. automodule:: search.routes.ui 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.rst: -------------------------------------------------------------------------------- 1 | search package 2 | ============== 3 | 4 | .. automodule:: search 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | 14 | search.agent 15 | search.controllers 16 | search.domain 17 | search.process 18 | search.routes 19 | search.services 20 | search.tests 21 | 22 | Submodules 23 | ---------- 24 | 25 | .. toctree:: 26 | 27 | search.config 28 | search.context 29 | search.converters 30 | search.encode 31 | search.factory 32 | 33 | -------------------------------------------------------------------------------- /docs/source/api/search.services.fulltext.rst: -------------------------------------------------------------------------------- 1 | search.services.fulltext module 2 | =============================== 3 | 4 | .. automodule:: search.services.fulltext 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.advanced.rst: -------------------------------------------------------------------------------- 1 | search.services.index.advanced module 2 | ===================================== 3 | 4 | .. automodule:: search.services.index.advanced 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.authors.rst: -------------------------------------------------------------------------------- 1 | search.services.index.authors module 2 | ==================================== 3 | 4 | .. automodule:: search.services.index.authors 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.exceptions.rst: -------------------------------------------------------------------------------- 1 | search.services.index.exceptions module 2 | ======================================= 3 | 4 | .. 
automodule:: search.services.index.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.highlighting.rst: -------------------------------------------------------------------------------- 1 | search.services.index.highlighting module 2 | ========================================= 3 | 4 | .. automodule:: search.services.index.highlighting 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.prepare.rst: -------------------------------------------------------------------------------- 1 | search.services.index.prepare module 2 | ==================================== 3 | 4 | .. automodule:: search.services.index.prepare 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.results.rst: -------------------------------------------------------------------------------- 1 | search.services.index.results module 2 | ==================================== 3 | 4 | .. automodule:: search.services.index.results 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.rst: -------------------------------------------------------------------------------- 1 | search.services.index package 2 | ============================= 3 | 4 | .. automodule:: search.services.index 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.services.index.tests 15 | 16 | Submodules 17 | ---------- 18 | 19 | .. 
toctree:: 20 | 21 | search.services.index.advanced 22 | search.services.index.authors 23 | search.services.index.exceptions 24 | search.services.index.highlighting 25 | search.services.index.prepare 26 | search.services.index.results 27 | search.services.index.simple 28 | search.services.index.util 29 | 30 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.simple.rst: -------------------------------------------------------------------------------- 1 | search.services.index.simple module 2 | =================================== 3 | 4 | .. automodule:: search.services.index.simple 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.tests.rst: -------------------------------------------------------------------------------- 1 | search.services.index.tests package 2 | =================================== 3 | 4 | .. automodule:: search.services.index.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.services.index.tests.test_reindex 15 | search.services.index.tests.test_results 16 | search.services.index.tests.test_util 17 | search.services.index.tests.tests 18 | 19 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.tests.test_reindex.rst: -------------------------------------------------------------------------------- 1 | search.services.index.tests.test\_reindex module 2 | ================================================ 3 | 4 | .. 
automodule:: search.services.index.tests.test_reindex 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.tests.test_results.rst: -------------------------------------------------------------------------------- 1 | search.services.index.tests.test\_results module 2 | ================================================ 3 | 4 | .. automodule:: search.services.index.tests.test_results 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.tests.test_util.rst: -------------------------------------------------------------------------------- 1 | search.services.index.tests.test\_util module 2 | ============================================= 3 | 4 | .. automodule:: search.services.index.tests.test_util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.tests.tests.rst: -------------------------------------------------------------------------------- 1 | search.services.index.tests.tests module 2 | ======================================== 3 | 4 | .. automodule:: search.services.index.tests.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.index.util.rst: -------------------------------------------------------------------------------- 1 | search.services.index.util module 2 | ================================= 3 | 4 | .. 
automodule:: search.services.index.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.metadata.rst: -------------------------------------------------------------------------------- 1 | search.services.metadata module 2 | =============================== 3 | 4 | .. automodule:: search.services.metadata 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.rst: -------------------------------------------------------------------------------- 1 | search.services package 2 | ======================= 3 | 4 | .. automodule:: search.services 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | search.services.index 15 | search.services.tests 16 | 17 | Submodules 18 | ---------- 19 | 20 | .. toctree:: 21 | 22 | search.services.fulltext 23 | search.services.metadata 24 | 25 | -------------------------------------------------------------------------------- /docs/source/api/search.services.tests.rst: -------------------------------------------------------------------------------- 1 | search.services.tests package 2 | ============================= 3 | 4 | .. automodule:: search.services.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.services.tests.test_fulltext 15 | search.services.tests.test_metadata 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.services.tests.test_fulltext.rst: -------------------------------------------------------------------------------- 1 | search.services.tests.test\_fulltext module 2 | =========================================== 3 | 4 | .. 
automodule:: search.services.tests.test_fulltext 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.services.tests.test_metadata.rst: -------------------------------------------------------------------------------- 1 | search.services.tests.test\_metadata module 2 | =========================================== 3 | 4 | .. automodule:: search.services.tests.test_metadata 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.tests.rst: -------------------------------------------------------------------------------- 1 | search.tests package 2 | ==================== 3 | 4 | .. automodule:: search.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | search.tests.test_advanced_search 15 | search.tests.test_param_persistence 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/search.tests.test_advanced_search.rst: -------------------------------------------------------------------------------- 1 | search.tests.test\_advanced\_search module 2 | ========================================== 3 | 4 | .. automodule:: search.tests.test_advanced_search 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/search.tests.test_param_persistence.rst: -------------------------------------------------------------------------------- 1 | search.tests.test\_param\_persistence module 2 | ============================================ 3 | 4 | .. 
automodule:: search.tests.test_param_persistence 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | arXiv Search System Documentation 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | architecture.rst 9 | search_ui.rst 10 | search_api.rst 11 | api/modules.rst 12 | 13 | 14 | Indices and tables 15 | ================== 16 | 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /docs/source/search_api.rst: -------------------------------------------------------------------------------- 1 | Search API (Alpha) 2 | ****************** 3 | 4 | Release `0.5.0-alpha` introduces support for a metadata search API service. 5 | This release targets milestone H2: Search API, with the following specific 6 | goals: 7 | 8 | - H2.1: A search API is exposed via the API gateway, with feature-parity to 9 | classic "arXiv API". 10 | 11 | - Consider content negotiation to support legacy XML and JSON(-LD). 12 | 13 | - H2.2: Opportunistic improvements, fixes, e.g. proper handling of UTF-8 14 | characters (ARXIVNG-257). 15 | - H2.3: Deprecate classic arXiv API. 16 | 17 | 18 | The current release supports only JSON serialization, provided by 19 | :class:`search.routes.api.serialize.JSONSerializer`. An Atom/XML serializer 20 | :class:`search.routes.api.serialize.AtomXMLSerializer` is planned but not yet 21 | implemented. 22 | 23 | A formal description of the API (OpenAPI 3.0) and resources (JSON Schema) can 24 | be found at ``_. 25 | 26 | The service endpoints are defined in :mod:`search.routes.api`: 27 | 28 | - The root endpoint :func:`search.routes.api.search` supports queries using the 29 | same semantics as the advanced search UI. 
30 | - The paper metadata endpoint :func:`search.routes.api.paper` provides more 31 | detailed metadata for a specific arXiv e-print. 32 | 33 | Requests are handled by the controllers in :mod:`search.controllers.api`, using 34 | the :class:`search.domain.api.APIQuery` domain class. 35 | -------------------------------------------------------------------------------- /docs/source/search_ui.rst: -------------------------------------------------------------------------------- 1 | Search Interface 2 | **************** 3 | 4 | The current version of the arXiv search application is designed to meet the 5 | goals outlined in arXiv-NG milestone H1: Replace Legacy Search. 6 | 7 | - H1.1. Replace the current advanced search interface, search results, and 8 | search by author name. 9 | - H1.2. The search result view should support pagination, and ordering by 10 | publication date or relevance. 11 | - H1.3. An indexing agent updates the search index at publication time in 12 | response to a Kinesis notification, using metadata from the docmeta endpoint 13 | in the classic system. 14 | 15 | Key Requirements 16 | ================ 17 | 18 | - Simple search: 19 | 20 | - Users should be able to search for arXiv papers by title, author, and 21 | abstract. 22 | - Searches can originate from any part of the arXiv.org site, via the 23 | search bar in the site header. 24 | 25 | - Advanced search: 26 | 27 | - Users can search for papers using boolean combinations of search terms on 28 | title, author names, and/or abstract. 29 | - Users can filter results by primary classification, and submission date. 30 | - Submission date supports prior year, specific year, and date range. 31 | 32 | - Author name search: 33 | 34 | - Users should be able to search for papers by author name. 35 | - This should support queries originating on the abs page, and in search 36 | results. 
37 | 38 | - UI: The overall flavor of the search views should be substantially 39 | similar to the classic views, but with styling that improves 40 | readability, usability, and accessibility. 41 | 42 | Quality Goals 43 | ============= 44 | - Code quality: 45 | 46 | - 90% test coverage on Python components that we develop/control. 47 | - Linting: ``pylint`` passes with >= 9/10. 48 | - Documentation: ``pydocstyle`` passes. 49 | - Static checking: ``mypy`` passes. 50 | 51 | - Performance & reliability: 52 | 53 | - Response time: 99% of requests have a latency of 1 second or less. 54 | - Error rate: parity with classic search. 55 | - Request rate: support request volume of existing search * safety factor 3. 56 | 57 | - Accessibility: meet or exceed WCAG 2.0 level A for accessibility. 58 | 59 | Constraints 60 | =========== 61 | - Must be implemented in Python/Flask, and be deployable behind Apache as a 62 | Python/WSGI application. 63 | - The search application itself must be stateless. It must be able to connect 64 | to an arbitrary ElasticSearch cluster, which can be specified via 65 | configuration. 66 | - Notifications about new content are delivered via the Kinesis notification 67 | broker. 68 | -------------------------------------------------------------------------------- /docs/ui/README.md: -------------------------------------------------------------------------------- 1 | # UI Reference Documents 2 | 3 | Source files are accompanied by PDF or image files with the same naming 4 | conventions to provide generic readability. 5 | 6 | **Workflow documents** are a visualization of the screens and actions that a 7 | user would take to perform a specific task (or complete a user story). 8 | 9 | **Wireframe documents** are very simple layout structures to help define the 10 | overall structure of a page and inform prototypes. They provide a high-level 11 | overview, define regions, and prompt discussion of necessary details.
12 | 13 | User interface 14 | ========================== 15 | 16 | Search workflow for users follows a cyclic pattern, with an initial query 17 | producing results which can be sorted, filtered, and inspected. For the initial 18 | phase, focus is placed on query building, reformulation, and results display. 19 | 20 | Search behaviors to promote: reformulation, orienteering, targeted search, 21 | query refinement/drilldown. 22 | 23 | Solutions that support these behaviors: 24 | - Query box with original query pre-filled on results page (reformulation) 25 | - Pagination (orienteering) 26 | - Sorting (refinement) 27 | - Advanced Search with specific field, category, and date options (targeting) 28 | - Author Search interim structure (orienteering) 29 | 30 | Some design decisions are constrained by the choice to minimize JavaScript or 31 | AJAX calls to the interface. Simplicity allows us to focus development and 32 | testing on functionality first, then add interface enhancements in later 33 | iterations. Examples: Go button required for results sorting, manual checkbox 34 | toggle for related fields with input (dates, Physics subject dropdown), error 35 | messages persistent until page reload even if corrections are made. 
36 | -------------------------------------------------------------------------------- /docs/ui/advanced-query-v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/advanced-query-v1.pdf -------------------------------------------------------------------------------- /docs/ui/advanced-query.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/advanced-query.sketch -------------------------------------------------------------------------------- /docs/ui/arxiv-search-results1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/arxiv-search-results1.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-advanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-advanced.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-basic.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-mobile-advanced.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-mobile-advanced.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-mobile-basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-mobile-basic.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-mobile-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-mobile-results.png -------------------------------------------------------------------------------- /docs/ui/screenshots/arxiv-search-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/screenshots/arxiv-search-results.png -------------------------------------------------------------------------------- /docs/ui/search-prototype.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/search-prototype.sketch -------------------------------------------------------------------------------- /docs/ui/search-wireframes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/docs/ui/search-wireframes.pdf -------------------------------------------------------------------------------- /docs/ui/search-wireframes.sketch: 
# Run pylint, mypy and pydocstyle over the `search` package and report each
# result to GitHub as a commit status for the current Travis build.
#
# Environment read by this script:
#   MIN_SCORE                 minimum passing pylint score (defaults to 8.5)
#   USERNAME, GITHUB_TOKEN    credentials passed to the GitHub statuses API
#   TRAVIS_REPO_SLUG, TRAVIS_BUILD_ID, TRAVIS_COMMIT, TRAVIS_PULL_REQUEST_SHA

# post_status STATE TRAVIS_HOST DESCRIPTION CONTEXT
# POST a single commit status for $SHA. Output handling is left to the caller.
post_status() {
    payload=$(printf '{"state": "%s", "target_url": "https://%s/%s/builds/%s", "description": "%s", "context": "%s"}' \
        "$1" "$2" "$TRAVIS_REPO_SLUG" "$TRAVIS_BUILD_ID" "$3" "$4")
    curl -u "$USERNAME:$GITHUB_TOKEN" \
        -d "$payload" \
        -XPOST "https://api.github.com/repos/$TRAVIS_REPO_SLUG/statuses/$SHA"
}

# Use the pull-request SHA when Travis provides one, otherwise the commit
# being built.
if [ -z "$TRAVIS_PULL_REQUEST_SHA" ]; then
    SHA="$TRAVIS_COMMIT"
else
    SHA="$TRAVIS_PULL_REQUEST_SHA"
fi


# Check pylint status
MIN_SCORE="${MIN_SCORE:-8.5}"
# Keep a leading minus sign so that a negative score cannot be mistaken for a
# positive one when compared against MIN_SCORE.
PYLINT_SCORE=$( pipenv run pylint search | tail -2 | grep -Eo -e '-?[0-9.]+/10' | tail -1 | sed 's/\/10//' )
if [ -n "$PYLINT_SCORE" ]; then
    # bc prints 1 when the comparison holds and 0 otherwise.
    PYLINT_PASS=$(echo "$PYLINT_SCORE >= $MIN_SCORE" | bc -l)
else
    # No score could be parsed from the pylint output: treat it as a failure
    # instead of handing bc an incomplete expression.
    PYLINT_PASS=0
fi

if [ "$PYLINT_PASS" = "1" ]; then
    PYLINT_STATE="success"
    echo "pylint passed with score $PYLINT_SCORE for sha $SHA"
else
    PYLINT_STATE="failure"
    echo "pylint failed with score $PYLINT_SCORE for sha $SHA"
fi

echo "U=$USERNAME, U=$USER, S=$PYLINT_STATE, T=$TRAVIS_REPO_SLUG, B=$TRAVIS_BUILD_ID, P=$PYLINT_SCORE, S=$SHA, TRAVIS_PULL_REQUEST_SHA=$TRAVIS_PULL_REQUEST_SHA, TRAVIS_COMMIT=$TRAVIS_COMMIT"
# NOTE(review): the pylint status links to travis-ci.com while the mypy and
# pydocstyle statuses link to travis-ci.org -- confirm which host is intended.
post_status "$PYLINT_STATE" "travis-ci.com" "$PYLINT_SCORE/10" "code-quality/pylint"


# Check mypy integration
# Count the mypy output lines that remain after filtering; any remaining line
# marks the check as failed. The filtered output is also echoed to the terminal.
MYPY_STATUS=$( pipenv run mypy -p search | grep -v "test.*" | grep -v "defined here" | tee /dev/tty | wc -l | tr -d '[:space:]' )
if [ "$MYPY_STATUS" -ne 0 ]; then
    MYPY_STATE="failure"
    echo "mypy failed"
else
    MYPY_STATE="success"
    echo "mypy passed"
fi

post_status "$MYPY_STATE" "travis-ci.org" "" "code-quality/mypy" > /dev/null 2>&1


# Check pydocstyle integration
pipenv run pydocstyle --convention=numpy --add-ignore=D401,D202 search
PYDOCSTYLE_STATUS=$?
if [ "$PYDOCSTYLE_STATUS" -ne 0 ]; then
    PYDOCSTYLE_STATE="failure"
    echo "pydocstyle failed"
else
    PYDOCSTYLE_STATE="success"
    echo "pydocstyle passed"
fi

post_status "$PYDOCSTYLE_STATE" "travis-ci.org" "" "code-quality/pydocstyle" > /dev/null 2>&1
if __name__ == "__main__":
    # Build the UI application, switch on the debug-related settings in a
    # single call, and serve it on port 8080 with the debugger enabled.
    app = create_ui_web_app()
    app.config.update(FLASK_DEBUG=1, TEMPLATES_AUTO_RELOAD=True)
    app.run(debug=True, port=8080)
"*" 25 | requests = "*" 26 | retry = "*" 27 | urllib3 = "*" 28 | #werkzeug = "2.2.3" 29 | werkzeug = "2.3.6" 30 | wtforms = "==2.1" 31 | arxiv-base = {git = "https://github.com/arXiv/arxiv-base.git", rev = "1.0.1"} 32 | mypy-extensions = "^1.0.0" 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | pycodestyle = "*" 36 | pydocstyle = ">=2.1.1" 37 | mock = "==2.0.0" 38 | #mypy = "==0.720" 39 | #mypy-extensions = "*" 40 | #sphinx = "*" 41 | #sphinxcontrib-websupport = "*" 42 | #sphinx-autodoc-typehints = "*" 43 | pylint = "*" 44 | pytest = "*" 45 | pre-commit = "==2.0.1" 46 | coverage = "==4.4.2" 47 | 48 | [build-system] 49 | requires = ["poetry-core"] 50 | build-backend = "poetry.core.masonry.api" 51 | -------------------------------------------------------------------------------- /recent_docs_updated.sql: -------------------------------------------------------------------------------- 1 | # File: recent_docs_updated.sql 2 | # Desc: Find papers with metadata/author updates during the previous hour. 
3 | # Based on: arxiv-bin/notify_search.pl 4 | # Running: see arxiv-bin/dotfiles/nexus.crontab 5 | # 6 | 7 | SELECT am.paper_id 8 | FROM arXiv_paper_owners apo, 9 | arXiv_metadata am 10 | WHERE apo.document_id = am.document_id 11 | AND apo.valid = 1 12 | AND apo.flag_author = 1 13 | AND apo.flag_auto = 0 14 | AND apo.date BETWEEN UNIX_TIMESTAMP(DATE_SUB(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00"), INTERVAL 1 HOUR)) 15 | AND UNIX_TIMESTAMP(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00")) 16 | UNION 17 | SELECT am.paper_id 18 | FROM arXiv_paper_owners apo, 19 | arXiv_metadata am, 20 | arXiv_author_ids aai 21 | WHERE apo.document_id = am.document_id 22 | AND apo.user_id = aai.user_id 23 | AND apo.valid = 1 24 | AND apo.flag_author = 1 25 | AND apo.flag_auto = 0 26 | AND aai.updated BETWEEN DATE_SUB(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00"), INTERVAL 1 HOUR) 27 | AND DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00") 28 | UNION 29 | SELECT am.paper_id 30 | FROM arXiv_paper_owners apo, 31 | arXiv_metadata am, 32 | arXiv_orcid_ids aoi 33 | WHERE apo.document_id = am.document_id 34 | AND apo.user_id = aoi.user_id 35 | AND apo.valid = 1 36 | AND apo.flag_author = 1 37 | AND apo.flag_auto = 0 38 | AND aoi.updated BETWEEN DATE_SUB(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00"), INTERVAL 1 HOUR) 39 | AND DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00") 40 | UNION 41 | SELECT am.paper_id 42 | FROM arXiv_metadata am 43 | WHERE 1=1 44 | AND am.modtime BETWEEN UNIX_TIMESTAMP(DATE_SUB(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00"), INTERVAL 1 HOUR)) 45 | AND UNIX_TIMESTAMP(DATE_FORMAT(NOW(), "%Y-%m-%d %H:00:00")) 46 | ORDER BY 1 desc 47 | ; 48 | 49 | -------------------------------------------------------------------------------- /reindex.py: -------------------------------------------------------------------------------- 1 | """Helper script to reindex all arXiv papers.""" 2 | 3 | import click 4 | import time 5 | 6 | from search.factory import create_ui_web_app 7 | from search.services import index 8 | 9 | app = 
@app.cli.command()
@click.argument("old_index", nargs=1)
@click.argument("new_index", nargs=1)
def reindex(old_index: str, new_index: str) -> None:
    """
    Reindex the documents in `old_index` to `new_index`.

    This will create `new_index` with the current configured mappings if it
    does not already exist.

    Parameters
    ----------
    old_index : str
        Name of the index to copy documents from.
    new_index : str
        Name of the index to copy documents into.

    Raises
    ------
    :class:`click.ClickException`
        If `old_index` does not exist, or if the reindexing task could not
        be started.

    """
    click.echo(f"Reindex papers in `{old_index}` to `{new_index}`")
    if not index.SearchSession.index_exists(old_index):
        # There is nothing to copy from, so stop here rather than starting
        # a reindexing task against a missing source.
        raise click.ClickException(
            f"Source index `{old_index}` does not exist."
        )

    r = index.SearchSession.reindex(old_index, new_index)
    if not r:
        raise click.ClickException("Failed to get or create new index")

    click.echo("Started reindexing task")
    task_id = r["task"]
    with click.progressbar(length=100, label="percent complete") as progress:
        shown = 0.0  # Percentage already reported to the progress bar.
        while True:
            status = index.SearchSession.get_task_status(task_id)
            task_status = status["task"]["status"]
            total = float(task_status["total"])
            if status["completed"] or total == 0:
                progress.update(100 - shown)
                break

            processed = (
                task_status["updated"]
                + task_status["created"]
                + task_status["deleted"]
            )
            percent = min(processed / total, 1.0) * 100
            # ``update`` advances the bar by a number of steps, so report
            # only the progress made since the previous poll.
            progress.update(percent - shown)
            shown = percent
            if percent >= 100:
                break
            time.sleep(2)
/schema/resources/ClassificationTerm.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "ClassificationTerm", 3 | "type": "object", 4 | "properties": { 5 | "id": { 6 | "type": "string" 7 | }, 8 | "name": { 9 | "type": "string" 10 | } 11 | }, 12 | "required": ["id", "name"] 13 | } 14 | -------------------------------------------------------------------------------- /schema/resources/DocumentSet.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "DocumentSet", 3 | "description": "A set of documents that respond to a query.", 4 | "type": "object", 5 | "required": ["metadata", "results"], 6 | "properties": { 7 | "metadata": { 8 | "description": "Summary information about the search, including pagination.", 9 | "properties": { 10 | "start": { 11 | "description": "Offset (zero-based) of first result in this documentset from start of original search results.", 12 | "type": "integer" 13 | }, 14 | "end": { 15 | "description": "Offset (zero-based) of last result in this documentset from start of original search results.", 16 | "type": "integer" 17 | }, 18 | "total": { 19 | "description": "Total number of documents that respond to this query.", 20 | "type": "integer" 21 | }, 22 | "query": { 23 | "description": "Query parameters interpreted from the request.", 24 | "type": "array", 25 | "items": { 26 | "type": "object", 27 | "properties": { 28 | "parameter": { 29 | "type": "string" 30 | }, 31 | "value": { 32 | "type": "string" 33 | } 34 | } 35 | } 36 | } 37 | } 38 | }, 39 | "results": { 40 | "type": "array", 41 | "items": { 42 | "type": "object", 43 | "$ref": "Document.json" 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /schema/resources/Person.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Person", 3 | "description": "Schema for person in 
metadata returned by the search API.", 4 | "type": "object", 5 | "properties": { 6 | "full_name": { 7 | "description": "The fullest representation of the person's name available in arXiv metadata.", 8 | "type": "string" 9 | }, 10 | "last_name": { 11 | "description": "The family or surname part of the person's name, if available.", 12 | "type": "string" 13 | }, 14 | "first_name": { 15 | "description": "The personal or forename part of the person's name, if available.", 16 | "type": "string" 17 | }, 18 | "suffix": { 19 | "description": "The suffix part of the person's name, if available.", 20 | "type": "string" 21 | }, 22 | "affiliation": { 23 | "description": "Institutional affiliations as entered at the time of submission, if available.", 24 | "type": "array", 25 | "items": { 26 | "type": "string" 27 | }, 28 | "minItems": 0 29 | }, 30 | "orcid": { 31 | "description": "ORCID identifier, if available.", 32 | "oneOf": [ 33 | {"type": "string"}, 34 | {"type": "null"} 35 | ] 36 | }, 37 | "author_id": { 38 | "description": "arXiv author identifier, if available.", 39 | "oneOf": [ 40 | {"type": "string"}, 41 | {"type": "null"} 42 | ] 43 | } 44 | }, 45 | "required": [ 46 | "full_name" 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /search/__init__.py: -------------------------------------------------------------------------------- 1 | """arxiv search.""" 2 | -------------------------------------------------------------------------------- /search/consts.py: -------------------------------------------------------------------------------- 1 | """Constants.""" 2 | from pytz import timezone 3 | 4 | # Sorting 5 | 6 | DEFAULT_SORT_ORDER = [ 7 | {"announced_date_first": {"order": "desc"}}, 8 | {"_doc": {"order": "asc"}}, 9 | ] 10 | 11 | 12 | # Timezones 13 | 14 | EASTERN = timezone("US/Eastern") 15 | -------------------------------------------------------------------------------- /search/context.py: 
def get_application_config(
    app: Optional[Union[Flask, object]] = None
) -> Union[dict, os._Environ]:
    """
    Resolve the configuration mapping to read settings from.

    The config of ``app`` is used when ``app`` is a Flask instance; otherwise
    the config of the current Flask application is used when one is active;
    otherwise the process environment is returned.

    Parameters
    ----------
    app : :class:`flask.Flask`
        Optional application whose config should take precedence.

    Returns
    -------
    dict-like
        Either a Flask application configuration or ``os.environ``. Both
        support the ``get()`` method.

    """
    # pylint: disable=protected-access
    # ``None`` and any non-Flask object fail this check, so no separate
    # ``is not None`` guard is required.
    if isinstance(app, Flask):
        return app.config  # type: ignore
    # Proxy object; falsey if there is no application context.
    if not flask_app:
        return os.environ
    return flask_app.config  # type: ignore


def get_application_global() -> Optional[werkzeug.local.LocalProxy]:
    """
    Return the Flask ``g`` proxy when it is truthy.

    Returns
    -------
    proxy or None
        ``g`` if it evaluates as true, else ``None``.

    """
    return g if g else None  # type: ignore
def health_check() -> Tuple[str, int, Dict[str, Any]]:
    """
    Probe the search index by running a real query against it.

    Returns
    -------
    str
        ``"OK"`` when the query produced results, otherwise a ``"DOWN"``
        message.
    int
        HTTP status code.
    dict
        Headers to add to the response (always empty here).

    """
    # Exceptions are deliberately not caught: the framework's exception
    # handling is expected to deal with them and log them.
    session = index.SearchSession.current_session()
    document_set = session.search(  # type: ignore
        SimpleQuery(search_field="all", value="theory")
    )
    if not document_set["results"]:
        return (
            "DOWN: document_set lacked results",
            HTTPStatus.INTERNAL_SERVER_ERROR,
            {},
        )
    return "OK", HTTPStatus.OK, {}
class SimpleSearchForm(Form):
    """Form for the simple search: one field selector plus one query."""

    # Values accepted by the ``abstracts`` radio field.
    HIDE_ABSTRACTS = "hide"
    SHOW_ABSTRACTS = "show"

    # NOTE: the fields below are declared in their original relative order.
    searchtype = SelectField("Field", choices=Query.SUPPORTED_FIELDS)
    query = StringField(
        "Search or Article ID",
        filters=[strip_white_space],
        validators=[does_not_start_with_wildcard, has_balanced_quotes],
    )
    size = SelectField(
        "results per page",
        default=50,
        choices=[(count, count) for count in ("25", "50", "100", "200")],
    )
    order = SelectField(
        "Sort results by",
        choices=[
            ("-announced_date_first", "Announcement date (newest first)"),
            ("announced_date_first", "Announcement date (oldest first)"),
            ("-submitted_date", "Submission date (newest first)"),
            ("submitted_date", "Submission date (oldest first)"),
            ("", "Relevance"),
        ],
        validators=[validators.Optional()],
        default="-announced_date_first",
    )
    abstracts = RadioField(
        "Abstracts",
        choices=[
            (SHOW_ABSTRACTS, "Show abstracts"),
            (HIDE_ABSTRACTS, "Hide abstracts"),
        ],
        default=SHOW_ABSTRACTS,
    )

    def validate_query(form: Form, field: StringField) -> None:
        """Require a non-empty query whenever a search type is selected."""
        selected = form.searchtype.data
        if selected is None or selected == "None":
            return
        text = form.query.data
        if text and len(text) >= 1:
            return
        raise validators.ValidationError(
            "Field must be at least 1 character long."
        )
class TestHealthCheck(TestCase):
    """Tests for :func:`.health_check`."""

    # ``health_check`` calls ``SearchSession.current_session().search(...)``,
    # so the mocked behaviour has to be attached to the ``search`` attribute
    # of the object returned by ``current_session()``.

    @mock.patch("search.controllers.index.SearchSession")
    def test_index_is_down(self, mock_index):
        """Test the exception propagates when the index raises."""
        session = mock_index.current_session.return_value
        session.search.side_effect = RuntimeError

        with self.assertRaises(RuntimeError):
            health_check()

    @mock.patch("search.controllers.index.SearchSession")
    def test_index_returns_no_result(self, mock_index):
        """Test returns 'DOWN' + status 500 when index returns no results."""
        session = mock_index.current_session.return_value
        session.search.return_value = {"metadata": {}, "results": []}
        response, status_code, _ = health_check()
        self.assertEqual(
            response,
            "DOWN: document_set lacked results",
            "Response content should be DOWN: document_set lacked results",
        )
        self.assertEqual(
            status_code,
            HTTPStatus.INTERNAL_SERVER_ERROR,
            "Should return 500 status code.",
        )

    @mock.patch("search.controllers.index.SearchSession")
    def test_index_returns_result(self, mock_index):
        """Test returns 'OK' + status 200 when index returns results."""
        session = mock_index.current_session.return_value
        session.search.return_value = {"metadata": {}, "results": [{}]}
        response, status_code, _ = health_check()
        self.assertEqual(response, "OK", "Response content should be OK")
        self.assertEqual(
            status_code, HTTPStatus.OK, "Should return 200 status code."
        )
class TestUnderscoreHandling(TestCase):
    """Test :func:`.catch_underscore_syntax`."""

    def test_underscore_is_rewritten(self):
        """A single `surname_f` author query is rewritten with a comma."""
        rewritten, is_classic = catch_underscore_syntax("franklin_r")
        self.assertEqual(
            rewritten,
            "franklin, r",
            "The underscore should be replaced with `, `.",
        )
        self.assertTrue(is_classic, "Should be identified as classic")

    def test_false_positive(self):
        """More than one character after the underscore is left alone."""
        original = "not_aname"
        rewritten, is_classic = catch_underscore_syntax(original)
        self.assertEqual(
            original, rewritten, "The query should not be rewritten"
        )
        self.assertFalse(is_classic, "Should not be identified as classic")

    def test_multiple_authors(self):
        """Several classic-format names in one query are all rewritten."""
        rewritten, is_classic = catch_underscore_syntax("franklin_r dole_b")
        self.assertEqual(
            rewritten,
            "franklin, r; dole, b",
            "The underscore should be replaced with `, `.",
        )
        self.assertTrue(is_classic, "Should be identified as classic")

    def test_nonsense_input(self):
        """An empty query must not raise."""
        try:
            catch_underscore_syntax("")
        except Exception as err:
            self.fail(err)
def does_not_start_with_wildcard(form: Form, field: StringField) -> None:
    """
    Reject input in which the query, or any term of it, leads with ``?``/``*``.

    Raises
    ------
    :class:`wtforms.validators.ValidationError`
        If the whole value or any whitespace-separated part of it starts
        with a wildcard character.

    """
    text = field.data
    if not text:
        return
    wildcards = ("?", "*")
    if text.startswith(wildcards):
        raise validators.ValidationError(
            "Search cannot start with a wildcard (? *)."
        )
    for token in text.split():
        if token.startswith(wildcards):
            raise validators.ValidationError(
                "Search terms cannot start with a wildcard (? *)."
            )


def has_balanced_quotes(form: Form, field: StringField) -> None:
    """
    Reject input containing an odd number of double quotes.

    Raises
    ------
    :class:`wtforms.validators.ValidationError`
        If the double quotes in the value are not paired.

    """
    text = field.data
    if not text:
        return
    # An odd count implies at least one quote is present and unpaired.
    if text.count('"') % 2:
        raise validators.ValidationError('Missing closing quote (").')


def strip_white_space(value: str) -> str:
    """Return ``value`` without surrounding whitespace; falsey input as-is."""
    return value.strip() if value else value
# FIXME: Argument type.
def paginate(query: Query, data: Dict[Any, Any]) -> Query:
    """
    Apply ``start`` and ``size`` request parameters to a :class:`.Query`.

    Parameters
    ----------
    query : :class:`.Query`
        Query to update in place.
    data : dict
        Request parameters; ``start`` defaults to 0 and ``size`` to 50.

    Returns
    -------
    :class:`.Query`
        The same ``query`` object, with ``page_start`` floored at zero and
        ``size`` capped at ``Query.MAXIMUM_size``.

    """
    requested_start = int(data.get("start", 0))
    query.page_start = max(requested_start, 0)
    requested_size = int(data.get("size", 50))
    query.size = min(requested_size, Query.MAXIMUM_size)
    return query


def catch_underscore_syntax(term: str) -> Tuple[str, bool]:
    """
    Rewrite author names in `surname_f` format to use commas.

    Returns
    -------
    str
        The rewritten term, or ``term`` unchanged when nothing matched.
    bool
        Whether any `surname_f` pattern was found.

    """
    rewritten, hits = re.subn(CLASSIC_AUTHOR, r"\g<1>, \g<2>;", term)
    if hits == 0:
        return term, False
    # Every rewritten name gets a ``;`` separator; drop the trailing one(s).
    return rewritten.rstrip(";"), True
class ArchiveConverter(BaseConverter):
    """Route converter for a comma-separated list of arXiv archives."""

    def to_python(self, value: str) -> Optional[List[str]]:
        """
        Parse a URL path part into a list of archive identifiers.

        Names not found in ``taxonomy.ARCHIVES`` are dropped. Names found in
        ``taxonomy.ARCHIVES_SUBSUMED`` are replaced by the ``in_archive``
        value of the category they map to.

        Raises
        ------
        :class:`werkzeug.routing.ValidationError`
            If no recognised archive remains.

        """
        resolved = []
        for name in value.split(","):
            if name in taxonomy.ARCHIVES:
                # Support old archives.
                if name in taxonomy.ARCHIVES_SUBSUMED:
                    subsumed_by = taxonomy.ARCHIVES_SUBSUMED[name]
                    name = taxonomy.CATEGORIES[subsumed_by]["in_archive"]
                resolved.append(name)
        if resolved:
            return resolved
        raise ValidationError()

    def to_url(self, value: List[str]) -> str:
        """Join a list of archive identifiers into a URL path part."""
        return ",".join(value)
# Public names re-exported by the domain package. Each name is listed once
# and must match one of the imports below.
__all__ = [
    # base
    "asdict",
    "DocMeta",
    "Fulltext",
    "DateRange",
    "Classification",
    "ClassificationList",
    "Operator",
    "Field",
    "Term",
    "Phrase",
    "SortDirection",
    "SortBy",
    "SortOrder",
    "Query",
    "SimpleQuery",
    # advanced
    "FieldedSearchTerm",
    "FieldedSearchList",
    "AdvancedQuery",
    # api
    "APIQuery",
    # classic api
    "ClassicAPIQuery",
    "ClassicSearchResponseData",
    # documents
    "Error",
    "Document",
    "DocumentSet",
    "document_set_from_documents",
]

# pylint: disable=wildcard-import
from search.domain.base import (
    asdict,
    DocMeta,
    Fulltext,
    DateRange,
    Classification,
    ClassificationList,
    Operator,
    Field,
    Term,
    Phrase,
    SortDirection,
    SortBy,
    SortOrder,
    Query,
    SimpleQuery,
)
from search.domain.advanced import (
    FieldedSearchTerm,
    FieldedSearchList,
    AdvancedQuery,
)
from search.domain.api import APIQuery
from search.domain.classic_api import (
    ClassicAPIQuery,
    ClassicSearchResponseData,
)
from search.domain.documents import (
    Error,
    Document,
    DocumentSet,
    document_set_from_documents,
)
@dataclass
class FieldedSearchTerm:
    """A single search term bound to a field, with an optional operator."""

    operator: Optional[str]
    field: str
    term: str

    def __str__(self) -> str:
        """Render as ``"<operator> <field>=<term>"`` for display."""
        return "{} {}={}".format(self.operator, self.field, self.term)


class FieldedSearchList(list):
    """A list of fielded search terms that renders as one string."""

    def __str__(self) -> str:
        """Render the items separated by ``"; "`` for display."""
        return "; ".join(map(str, self))
def get_default_extra_fields() -> List[str]:
    """These are the default extra fields."""
    return ["title"]


def get_required_fields() -> List[str]:
    """These fields should always be included."""
    return ["paper_id", "paper_id_v", "version", "href", "canonical"]


@dataclass
class APIQuery(Query):
    """
    Represents an API query.

    Similar to an advanced query.
    """

    date_range: Optional[DateRange] = None
    primary_classification: Tuple[Classification, ...] = field(
        default_factory=tuple
    )
    """Limit results to a specific primary classification."""
    secondary_classification: List[Tuple[Classification, ...]] = field(
        default_factory=list
    )
    """Limit results by cross-list classification."""
    terms: FieldedSearchList = field(default_factory=FieldedSearchList)
    include_fields: List[str] = field(default_factory=get_default_extra_fields)

    def __post_init__(self) -> None:
        """Be sure that the required fields are prepended to include_fields."""
        # ``dict.fromkeys`` removes duplicates while keeping first-seen
        # order, so the required fields really do come first and the result
        # is the same on every run (a ``set`` gives no ordering guarantee).
        self.include_fields = list(
            dict.fromkeys(get_required_fields() + self.include_fields)
        )
@dataclass
class ClassicAPIQuery(Query):
    """Query supported by the classic arXiv API."""

    search_query: Optional[str] = None
    phrase: Optional[Phrase] = None
    id_list: Optional[List[str]] = None
    size: int = 10

    def __post_init__(self) -> None:
        """
        Parse ``search_query`` and check that there is something to look up.

        Raises
        ------
        ValueError
            If neither a phrase nor an ``id_list`` is available.

        """
        if self.search_query is not None:
            self.phrase = parse_classic_query(self.search_query)

        nothing_to_search = self.phrase is None and self.id_list is None
        if nothing_to_search:
            raise ValueError(
                "ClassicAPIQuery requires either a phrase, id_list, or both"
            )

    def to_query_string(self) -> str:
        """Return a string representation of the API query."""
        ids = ",".join(self.id_list) if self.id_list else ""
        pairs = [
            f"search_query={self.search_query or ''}",
            f"id_list={ids}",
            f"start={self.page_start}",
            f"max_results={self.size}",
        ]
        return "&".join(pairs)


@dataclass
class ClassicSearchResponseData:
    """Classic API search response data."""

    results: Optional[DocumentSet] = None
    query: Optional[ClassicAPIQuery] = None
class SearchError(Exception):
    """Generic search error."""

    def __init__(self, message: str):
        """
        Initialize the error message.

        Parameters
        ----------
        message : str
            Human-readable description of the error.

        """
        # Hand the message to ``Exception`` so that ``args`` is populated;
        # copying and pickling rebuild the instance from ``args``.
        super().__init__(message)
        self.message = message

    @property
    def name(self) -> str:
        """Error name (the name of the concrete class)."""
        return self.__class__.__name__

    def __str__(self) -> str:
        """Represent error as a string."""
        return f"{self.name}({self.message})"

    __repr__ = __str__


class ValidationError(SearchError):
    """Validation error."""

    def __init__(
        self, message: str, link: str = "http://arxiv.org/api/errors"
    ):
        """
        Initialize the validation error.

        Parameters
        ----------
        message : str
            Human-readable description of the error.
        link : str
            URL stored on the error as ``link``.

        """
        super().__init__(message=message)
        self.link = link
def display_classification(classification: Classification) -> str:
    """
    Build a ``::``-separated label from a classification's levels.

    The group, archive and category entries are used in that order, and
    entries that are missing are left out. Each entry contributes its
    ``name``, falling back to the taxonomy display name for its ``id``.
    """
    labels = []
    for level in ("group", "archive", "category"):
        entry = classification.get(level)
        if entry is None:
            continue
        # Resolves to get_group_display / get_archive_display /
        # get_category_display on the taxonomy module.
        to_display = getattr(taxonomy, f"get_{level}_display")
        labels.append(entry.get("name", to_display(entry["id"])))
    return "::".join(labels)


def category_name(classification: Classification) -> str:
    """
    Get the category display name for a classification.

    Raises
    ------
    ValueError
        If the classification has no category.

    """
    entry = classification.get("category")
    if entry:
        return entry.get("name", taxonomy.get_category_display(entry["id"]))
    raise ValueError("No category")


def display_query(query: Query) -> str:
    """
    Build a display representation of a :class:`.Query`.

    Every dataclass field with a truthy value is rendered as ``name: value``
    and the pieces are joined with ``"; "``.
    """
    rendered = []
    for name in type(query).__dataclass_fields__.keys():  # type: ignore
        current = getattr(query, name)
        if current:
            if name == "classification":
                current = ", ".join(
                    [display_classification(item) for item in current]
                )
            rendered.append("%s: %s" % (name, current))
    return "; ".join(rendered)


filters = [
    ("display_classification", display_classification),
    ("category_name", category_name),
    ("display_query", display_query),
]
@blueprint.route("/", methods=["GET"])
# @scoped(required=scopes.READ_PUBLIC)
def search() -> Response:
    """
    Main query endpoint.

    Passes the request's query parameters to the API controller and returns
    the controller's results serialized as JSON, using the status code and
    headers that the controller supplied.
    """
    params = request.args
    logger.debug("Got query: %s", params)
    payload, status, extra_headers = api.search(params)

    # NOTE: the response is always JSON; no content negotiation happens here.
    body = serialize.as_json(payload["results"], query=payload["query"])
    extra_headers.update({"Content-type": JSON})

    reply: Response = make_response(body, status, extra_headers)
    return reply
def handler(exception: type) -> Callable:
    """
    Build a decorator that registers a function as a handler for an exception.

    Parameters
    ----------
    exception : type
        The exception class that the decorated function handles.

    Returns
    -------
    callable
        A decorator that records ``(exception, function)`` in the module's
        handler list and returns the function unchanged.

    """

    def register(target: Callable) -> Callable:
        """Record ``target`` as the handler for ``exception``."""
        entry = (exception, target)
        _handlers.append(entry)
        return target

    return register
@handler(InternalServerError)
def handle_internal_server_error(error: InternalServerError) -> Response:
    """
    Render a JSON 500 response.

    Parameters
    ----------
    error : :class:`.InternalServerError`
        The error being handled. An exception that is not an
        :class:`.HTTPException` is also accepted; it is logged and replaced
        by a generic :class:`.InternalServerError` before rendering.

    Returns
    -------
    :class:`.Response`
        JSON response with status 500.

    """
    if not isinstance(error, HTTPException):
        logger.error("Caught unhandled exception: %s", error)
        # ``respond`` reads ``error.code`` and ``error.description``; an
        # arbitrary exception has no ``description``, so rendering it would
        # itself raise. Substitute a generic 500, which also keeps internal
        # details out of the response body.
        error = InternalServerError()
    error.code = HTTPStatus.INTERNAL_SERVER_ERROR
    return respond(error, HTTPStatus.INTERNAL_SERVER_ERROR)
@blueprint.route("/api/query", methods=["GET"])
# @scoped(required=scopes.READ_PUBLIC)
def query() -> Response:
    """
    Provide the main query endpoint.

    Passes the request's query parameters to the classic API controller and
    returns the controller's results serialized as Atom, using the status
    code and headers that the controller supplied.
    """
    params = request.args
    logger.debug("Got query: %s", params)
    outcome, status, extra_headers = classic_api.query(params)

    feed = serialize.as_atom(  # type: ignore
        outcome.results, query=outcome.query
    )  # type: ignore
    extra_headers.update({"Content-type": ATOM_XML})

    reply: Response = make_response(feed, status, extra_headers)
    return reply
response 45 | -------------------------------------------------------------------------------- /search/routes/classic_api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for classic arXiv API routes.""" 2 | -------------------------------------------------------------------------------- /search/routes/consts.py: -------------------------------------------------------------------------------- 1 | """Serialization MIME type and charset constants.""" 2 | 3 | ATOM_XML = "application/atom+xml; charset=utf-8" 4 | JSON = "application/json; charset=utf-8" 5 | -------------------------------------------------------------------------------- /search/serialize/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides serialization functions for API responses.""" 2 | __all__ = ["JSONSerializer", "as_json", "AtomXMLSerializer", "as_atom"] 3 | 4 | from search.serialize.json import JSONSerializer, as_json 5 | from search.serialize.atom import AtomXMLSerializer, as_atom 6 | -------------------------------------------------------------------------------- /search/serialize/base.py: -------------------------------------------------------------------------------- 1 | """Base class for API serializers.""" 2 | 3 | 4 | class BaseSerializer: 5 | """Base class for API serializers.""" 6 | -------------------------------------------------------------------------------- /search/serialize/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Serialization tests.""" 2 | -------------------------------------------------------------------------------- /search/serialize/tests/test_serialize.py: -------------------------------------------------------------------------------- 1 | """Tests for serializers.""" 2 | 3 | import os 4 | import json 5 | from unittest import TestCase, mock 6 | 7 | import jsonschema 8 | 9 | from search import 
encode 10 | from search import serialize 11 | from search.tests import mocks 12 | from search.factory import create_api_web_app, create_classic_api_web_app 13 | 14 | 15 | def mock_jsonify(o): 16 | return json.dumps(o, cls=encode.ISO8601JSONEncoder) 17 | 18 | 19 | class TestSerializeJSONDocument(TestCase): 20 | """Serialize a single :class:`domain.Document` as JSON.""" 21 | 22 | SCHEMA_PATH = os.path.abspath("schema/resources/Document.json") 23 | 24 | def setUp(self): 25 | with open(self.SCHEMA_PATH) as f: 26 | self.schema = json.load(f) 27 | 28 | @mock.patch( 29 | f"search.serialize.json.url_for", lambda *a, **k: "http://f/12" 30 | ) 31 | @mock.patch(f"search.serialize.json.jsonify", mock_jsonify) 32 | def test_to_json(self): 33 | """Just your run-of-the-mill arXiv document generates valid JSON.""" 34 | app = create_api_web_app() 35 | with app.app_context(): 36 | document = mocks.document() 37 | srlzd = serialize.as_json(document) 38 | res = jsonschema.RefResolver( 39 | "file://%s/" 40 | % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), 41 | None, 42 | ) 43 | self.assertIsNone( 44 | jsonschema.validate( 45 | json.loads(srlzd), self.schema, resolver=res 46 | ) 47 | ) 48 | 49 | 50 | class TestSerializeJSONDocumentSet(TestCase): 51 | """Serialize a :class:`domain.DocumentSet` as JSON.""" 52 | 53 | SCHEMA_PATH = os.path.abspath("schema/resources/DocumentSet.json") 54 | 55 | def setUp(self): 56 | with open(self.SCHEMA_PATH) as f: 57 | self.schema = json.load(f) 58 | 59 | @mock.patch( 60 | f"search.serialize.json.url_for", lambda *a, **k: "http://f/12" 61 | ) 62 | @mock.patch(f"search.serialize.json.jsonify", mock_jsonify) 63 | def test_to_json(self): 64 | """Just your run-of-the-mill arXiv document generates valid JSON.""" 65 | app = create_api_web_app() 66 | with app.app_context(): 67 | document = mocks.document() 68 | meta = {"start": 0, "size": 50, "end": 50, "total": 500202} 69 | document_set = {"results": [document], "metadata": meta} 70 | srlzd = 
serialize.as_json(document_set) 71 | res = jsonschema.RefResolver( 72 | "file://%s/" 73 | % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), 74 | None, 75 | ) 76 | self.assertIsNone( 77 | jsonschema.validate( 78 | json.loads(srlzd), self.schema, resolver=res 79 | ) 80 | ) 81 | 82 | 83 | class TestSerializeAtomDocument(TestCase): 84 | """Serialize a single :class:`domain.Document` as Atom.""" 85 | 86 | @mock.patch( 87 | f"search.serialize.atom.url_for", lambda *a, **k: "http://f/12" 88 | ) 89 | def test_to_atom(self): 90 | """Just your run-of-the-mill arXiv document generates valid Atom.""" 91 | app = create_classic_api_web_app() 92 | with app.app_context(): 93 | document = mocks.document() 94 | _ = serialize.as_atom(document) 95 | 96 | # TODO: Verify valid AtomXML 97 | -------------------------------------------------------------------------------- /search/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides service integration modules for use by controllers.""" 2 | 3 | __all__ = ["SearchSession"] 4 | 5 | from search.services.index import SearchSession 6 | -------------------------------------------------------------------------------- /search/services/index/classic_api/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides index service integration modules for use by controllers.""" 2 | __all__ = ["classic_search"] 3 | 4 | from search.services.index.classic_api.classic_search import classic_search 5 | -------------------------------------------------------------------------------- /search/services/index/classic_api/classic_search.py: -------------------------------------------------------------------------------- 1 | """Translate classic API `Phrase` objects to Elasticsearch DSL.""" 2 | import re 3 | 4 | from elasticsearch_dsl import Q, Search 5 | 6 | from search.domain import ClassicAPIQuery, SortOrder 7 | from 
def classic_search(search: Search, query: ClassicAPIQuery) -> Search:
    """
    Prepare a :class:`.Search` from a :class:`.ClassicAPIQuery`.

    Parameters
    ----------
    search : :class:`.Search`
        An Elasticsearch search in preparation.
    query : :class:`.ClassicAPIQuery`
        A query originating from the Classic API.

    Returns
    -------
    :class:`.Search`
        The passed ES search object, updated with the phrase query, the
        ``id_list`` / current-version filter and, when the query carries a
        :class:`.SortOrder`, the corresponding sort.

    """
    # Initialize query; without a phrase fall back to an unconstrained query.
    if query.phrase:
        dsl_query = query_builder(query.phrase)
    else:
        dsl_query = Q()

    # Filter id_list if necessary.
    if query.id_list:
        # Separate versioned and unversioned papers.
        paper_ids = []
        paper_ids_vs = []
        for paper_id in query.id_list:
            if ENDS_WITH_VERSION.match(paper_id):
                paper_ids_vs.append(paper_id)
            else:
                paper_ids.append(paper_id)

        # Filter by most recent unversioned paper or any versioned paper.
        id_query = (
            Q("terms", paper_id=paper_ids) & Q("term", is_current=True)
        ) | Q("terms", paper_id_v=paper_ids_vs)

        search = search.filter(id_query)
    else:
        # If no id_list, only display current results.
        search = search.filter("term", is_current=True)

    # The sort order lives on ``query.order``. Testing the query object
    # itself against ``SortOrder`` can never succeed, which would silently
    # drop the requested ordering.
    order = getattr(query, "order", None)
    if not isinstance(order, SortOrder):
        return search.query(dsl_query)
    return search.query(dsl_query).sort(*order.to_es())  # type: ignore
def query_builder(phrase: Phrase) -> Q:
    """
    Parse a Phrase of a Classic API request into an ES Q object.

    Parameters
    ----------
    phrase : :class:`.Phrase`
        Either a single :class:`.Term`, a two-element sequence
        ``(operator, operand)`` or a three-element sequence
        ``(operator, left, right)``; operands may themselves be phrases.

    Returns
    -------
    :class:`.Q`
        The equivalent Elasticsearch DSL query. A phrase of any other length,
        or a binary phrase with an unrecognized operator, yields an empty
        ``Q()`` rather than ``None``.

    """
    if isinstance(phrase, Term):
        return term_to_query(phrase)

    if len(phrase) == 2:
        # This is unary ANDNOT which is just NOT. Recurse rather than assume
        # a bare term, so that a negated sub-phrase is handled as well; for a
        # term this is the same as calling ``term_to_query`` directly.
        return ~query_builder(phrase[1])

    if len(phrase) == 3:
        binary_op, exp1, exp2 = phrase[:3]  # type:ignore
        q1 = query_builder(exp1)
        q2 = query_builder(exp2)
        if binary_op is Operator.AND:
            return q1 & q2
        if binary_op is Operator.OR:
            return q1 | q2
        if binary_op is Operator.ANDNOT:
            return q1 & (~q2)

    # Unexpected phrase length or operator: always hand back a query object
    # so that callers never receive ``None``.
    return Q()
def simple_search(search: Search, query: SimpleQuery) -> Search:
    """
    Prepare a :class:`.Search` from a :class:`.SimpleQuery`.

    Parameters
    ----------
    search : :class:`.Search`
        An Elasticsearch DSL search object, in preparation for execution.
    query : :class:`.SimpleQuery`
        A query originating from the simple search controller.

    Returns
    -------
    :class:`.Search`
        The search restricted to current versions, carrying the query for
        the selected field (optionally limited by classification) and the
        sort derived from ``query``.

    """
    current_only = search.filter("term", is_current=True)

    # Look up the query factory for the chosen field and apply it to the
    # search value.
    build_field_query = SEARCH_FIELDS[query.search_field]
    dsl = build_field_query(query.value)

    if query.classification:
        restriction = limit_by_classification(query.classification)
        if query.include_cross_list:
            # Cross-lists also match on the secondary classification.
            restriction |= limit_by_classification(
                query.classification, "secondary_classification"
            )
        dsl &= restriction

    return sort(query, current_only.query(dsl))
to request reindexing", 37 | ) 38 | self.assertEqual( 39 | mock_es.reindex.call_args[0][0]["source"]["index"], "barindex" 40 | ) 41 | self.assertEqual( 42 | mock_es.reindex.call_args[0][0]["dest"]["index"], "bazindex" 43 | ) 44 | 45 | @mock.patch("search.services.index.Elasticsearch") 46 | def test_reindex_already_exists(self, mock_Elasticsearch): 47 | """Reindex to an index that already exists.""" 48 | mock_es = mock.MagicMock() 49 | mock_Elasticsearch.return_value = mock_es 50 | mock_es.indices.create.side_effect = raise_index_exists 51 | index.SearchSession.current_session().reindex("barindex", "bazindex") 52 | self.assertEqual( 53 | mock_es.indices.create.call_count, 54 | 1, 55 | "Should attempt to create the new index", 56 | ) 57 | self.assertEqual( 58 | mock_es.indices.create.call_args[0][0], 59 | "bazindex", 60 | "Should attempt to create the new index", 61 | ) 62 | 63 | self.assertEqual( 64 | mock_es.reindex.call_count, 65 | 1, 66 | "Should proceed to request reindexing", 67 | ) 68 | self.assertEqual( 69 | mock_es.reindex.call_args[0][0]["source"]["index"], "barindex" 70 | ) 71 | self.assertEqual( 72 | mock_es.reindex.call_args[0][0]["dest"]["index"], "bazindex" 73 | ) 74 | 75 | 76 | class TestTaskStatus(TestCase): 77 | """Tests for :func:`.index.get_task_status`.""" 78 | 79 | @mock.patch("search.services.index.Elasticsearch") 80 | def test_get_task_status(self, mock_Elasticsearch): 81 | """Get task status via the ES API.""" 82 | mock_es = mock.MagicMock() 83 | mock_Elasticsearch.return_value = mock_es 84 | 85 | task_id = "foonode:bartask" 86 | index.SearchSession.current_session().get_task_status(task_id) 87 | self.assertEqual( 88 | mock_es.tasks.get.call_count, 89 | 1, 90 | "Should call the task status endpoint", 91 | ) 92 | self.assertEqual( 93 | mock_es.tasks.get.call_args[0][0], 94 | task_id, 95 | "Should call the task status endpoint with task ID", 96 | ) 97 | -------------------------------------------------------------------------------- 
/search/services/index/tests/test_util.py: -------------------------------------------------------------------------------- 1 | """Tests for :mod:`search.services.index.util`.""" 2 | 3 | from unittest import TestCase 4 | 5 | from search.services.index import util 6 | 7 | 8 | class TestMatchDatePartial(TestCase): 9 | """Tests for :func:`.index.util.parse_date_partial`.""" 10 | 11 | def test_date_partial_only(self): 12 | """Term includes only a four-digit date partial.""" 13 | term, rmd = util.parse_date("1902") 14 | ym = util.parse_date_partial(term) 15 | self.assertEqual(ym, "2019-02") 16 | self.assertEqual(rmd, "", "Should have no remainder") 17 | 18 | def test_in_word(self): 19 | """A false positive in a word.""" 20 | with self.assertRaises(ValueError): 21 | term, rmd = util.parse_date("notasearch1902foradatepartial") 22 | 23 | def test_near_words(self): 24 | """Term includes date partial plus other terms.""" 25 | term, rmd = util.parse_date("foo 1902 bar") 26 | ym = util.parse_date_partial(term) 27 | self.assertEqual(ym, "2019-02") 28 | self.assertEqual(rmd, "foo bar", "Should have remainder") 29 | 30 | def test_out_of_range(self): 31 | """Term looks like a date partial, but is not a valid date.""" 32 | term, rmd = util.parse_date("0699") 33 | self.assertIsNone(util.parse_date_partial(term)) 34 | 35 | def test_last_millenium(self): 36 | """Term is for a pre-2000 paper.""" 37 | term, rmd = util.parse_date("old paper 9505") 38 | ym = util.parse_date_partial(term) 39 | self.assertEqual(ym, "1995-05") 40 | self.assertEqual(rmd, "old paper", "Should have a remainder") 41 | 42 | 43 | class TestOldPapernumDetection(TestCase): 44 | """Test :func:`.index.util.is_old_papernum`.""" 45 | 46 | def test_is_old_papernum(self): 47 | """User enters a 7-digit number that looks like an old papernum.""" 48 | self.assertFalse(util.is_old_papernum("9106001")) 49 | self.assertTrue(util.is_old_papernum("9107001")) 50 | self.assertFalse(util.is_old_papernum("9200001")) 51 | 
self.assertTrue(util.is_old_papernum("9201001")) 52 | self.assertTrue(util.is_old_papernum("0703999")) 53 | self.assertFalse(util.is_old_papernum("0704001")) 54 | -------------------------------------------------------------------------------- /search/services/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for :mod:`search.services`.""" 2 | -------------------------------------------------------------------------------- /search/services/tests/test_tex.py: -------------------------------------------------------------------------------- 1 | """Tests for :mod:`search.services.tex`.""" 2 | 3 | import unittest 4 | from search.services import tex 5 | 6 | dispaly_txt=r'''where $\alpha \neq 0, $ and either both $\alpha, t $ are real, or both are 7 | pure imaginary numbers. For even $n$ we prove: if $t, n $ are fixed, then, for 8 | $ \alpha \to 0, $ 9 | $$ \gamma_n = | \frac{8\alpha^n}{2^n [(n-1)!]^2} \prod_{k=1}^{n/2} (t^2 - 10 | (2k-1)^2) | $ (1 + O(\alpha)), $$ 11 | and if $ \alpha, t $ are fixed, then, for $ n \to \infty, $ 12 | $$ \gamma_n = \frac{8 |\alpha/2|^n}{[2 \cdot 4 ... (n-2)]^2} | \cos 13 | (\frac{\pi}{2} t) | [ 1 + O (\frac{\log n}{n}) ]. 
$$ 14 | Similar formulae (see Theorems \ref{thm2} and \ref{thm4}) hold for odd $n.$''' 15 | 16 | 17 | class TestTeX(unittest.TestCase): 18 | """test.""" 19 | 20 | 21 | def test_inline(self): 22 | """Test math postitions""" 23 | pos = tex.math_positions('some stuff $crazy tex!$ other stuff $more tex$') 24 | self.assertEqual( [(11,23),(36,46)] , pos ) 25 | 26 | pos = tex.math_positions('$crazy tex!$ other stuff $more tex$') 27 | self.assertEqual( [(0,12),(25,35)] , pos ) 28 | 29 | pos = tex.math_positions('$crazy tex!$') 30 | self.assertEqual( [(0,12)] , pos ) 31 | 32 | def test_display(self): 33 | """Test math postitions""" 34 | txt = dispaly_txt 35 | pos = tex.math_positions(txt) 36 | for start,end in pos: 37 | self.assertEqual('$' , txt[start], "should start with $ or $$ delimiter") 38 | self.assertEqual('$', txt[end-1], "should end with $ or $$ delimiter") 39 | 40 | def test_inline_pren(self): 41 | txt = 'critical density \\(p_{c}(Ng)\\) which is in the intermediate' 42 | pos = tex.math_positions(txt) 43 | self.assertEqual([(17,30)], pos) 44 | 45 | def test_display2(self): 46 | txt = "critical density \\[p_{c}\n(Ng)[something] or other \\] which is in the intermeidiate" 47 | pos = tex.math_positions(txt) 48 | self.assertEqual([(17,52)], pos) 49 | 50 | txt = "\\[p_{c}\n(Ng)[something] or other \\] which is in the intermediate" 51 | pos = tex.math_positions(txt) 52 | self.assertEqual([(0,35)], pos) 53 | 54 | txt = "critical density \\[p_{c}\n(Ng)[something] or other \\]" 55 | pos = tex.math_positions(txt) 56 | self.assertEqual([(17,52)], pos) 57 | 58 | 59 | def test_split(self): 60 | txt = 'some stuff $crazy tex!$ other stuff $more tex$ more at the end' 61 | txtf = tex.split_for_maths(tex.math_positions(txt), txt) 62 | self.assertEqual( ''.join(txtf) , txt, ) 63 | 64 | self.assertEqual( len([ True for chunk in txtf if tex.isMath(chunk)] ), 2 ) 65 | 66 | txtf = tex.split_for_maths( tex.math_positions(dispaly_txt),dispaly_txt) 67 | self.assertEqual(''.join(txtf), 
def position_f(delims: Dict[str, Tuple[str, Pattern]]) -> Callable[[str], List[Tuple[int, int]]]:
    """
    Build a function that locates delimited spans in a text.

    ``delims`` maps the literal text of an opening delimiter to a pair
    ``(opening_regex_source, closing_pattern)``. The returned function takes
    a string and returns a list of ``(start, end)`` offsets, each running
    from the first character of an opening delimiter to just past its
    closing delimiter.
    """
    # One alternation, with a group per opening delimiter.
    opener_sources = [f"({opener})" for opener, _ in delims.values()]
    openers = re.compile("|".join(opener_sources))

    def pos_func(txt: str) -> List[Tuple[int, int]]:
        spans = []
        cursor = 0
        while True:
            opened = openers.search(txt, cursor)
            if opened is None:
                return spans

            # The matched text selects which closing pattern to look for.
            closer = delims[opened.group(0)][1]
            closed = closer.search(txt, opened.end())
            if closed is None:
                # No closing delimiter: resume right after this opener.
                cursor = opened.end()
                continue

            spans.append((opened.start(), closed.end()))
            cursor = closed.end()

    return pos_func
def split_for_maths(positions: List[Tuple[int, int]], txt: str) -> List[str]:
    """
    Split the txt based on positions.

    Parameters
    ----------
    positions : list
        ``(start, end)`` offsets into ``txt``, iterated in order, each
        marking one span to be wrapped as :class:`Math`.
    txt : str
        The text to split.

    Returns
    -------
    list
        Chunks of ``txt``: the marked spans as :class:`Math` instances and
        the text between them as plain strings. Empty ``txt`` gives
        ``['']``; text without any marked span is returned whole as a
        single chunk.

    """
    if not txt:
        return ['']
    if not positions:
        # Nothing to mark: return the text intact instead of dropping it.
        return [txt]

    pos = 0
    out = []
    for start, end in positions:
        if pos < start:
            out.append(txt[pos:start])
        out.append(Math(txt[start:end]))
        pos = end

    # add on anything left at the end
    out.append(txt[positions[-1][1]:])

    return out
6 | **/ 7 | 8 | $(function() { 9 | $("[data-toggle=fieldset]").each(function() { 10 | var $this = $(this); 11 | 12 | //Add new entry 13 | $this.find("button[data-toggle=fieldset-add-row]").click(function() { 14 | var target = $($(this).data("target")); 15 | var last_item = target.find("[data-toggle=fieldset-entry]:last"); 16 | var new_item = last_item.clone(true, true); 17 | 18 | // Make any elements hidden in the first row visible. 19 | new_item.find(".fieldset-hidden-on-first-row").each(function() { 20 | $(this).css("visibility", "visible"); 21 | $(this).css("width", "auto"); 22 | }); 23 | 24 | // Generate a new id number for the new item. 25 | var elem_id = new_item.find(":input")[0].id; 26 | var elem_num = parseInt(elem_id.replace(/.*-(\d{1,4})-.*/m, '$1')) + 1; 27 | new_item.attr('data-id', elem_num); 28 | 29 | // Configure input element(s) in the new item. 30 | new_item.find(":input").each(function() { 31 | // Increment the field id. 32 | var id = $(this).attr('id') 33 | .replace('-' + (elem_num - 1) + '-', '-' + (elem_num) + '-'); 34 | 35 | // Clear any values from the last item. 36 | $(this).attr('name', id) 37 | .attr('id', id).val('') 38 | .removeAttr("checked"); 39 | 40 | // Set the value for the input field with the default, if 41 | // specified. 42 | var default_value = $(this).attr("default"); 43 | if (default_value) { 44 | $(this).val(default_value); 45 | } 46 | }); 47 | 48 | // Clear help text. 49 | new_item.find(".help").each(function() { 50 | $(this).empty(); 51 | }); 52 | 53 | new_item.show(); 54 | last_item.after(new_item); // Insert the new item below the last. 55 | }); 56 | 57 | //Remove row 58 | $this.find("button[data-toggle=fieldset-remove-row]").click(function() { 59 | // var to_remove = $(this). 
60 | if($this.find("[data-toggle=fieldset-entry]").length > 1) { 61 | var this_row = $(this).closest("div[data-toggle=fieldset-entry]"); 62 | this_row.remove(); 63 | } 64 | }); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /search/templates/search/base.html: -------------------------------------------------------------------------------- 1 | {%- extends "base/base.html" %} 2 | 3 | {% block addl_head %} 4 | 5 | 6 | 10 | 11 | 12 | 17 | {% endblock addl_head %} 18 | 19 | {% block content %} 20 |
21 |
22 |

{% block title %}Search{% endblock title %}

23 |
24 |
25 | 26 | {{ config.RELEASE_NOTES_TEXT }}   27 |
28 |
29 |
30 | {% block within_content %} 31 | Specific results here 32 | {% endblock within_content %} 33 |
34 | 35 | {{ config.RELEASE_NOTES_TEXT }}   36 |
37 |
38 | {% endblock content %} 39 | -------------------------------------------------------------------------------- /search/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the application as a whole.""" 2 | -------------------------------------------------------------------------------- /search/tests/mocks.py: -------------------------------------------------------------------------------- 1 | """Provide function to return a mock document.""" 2 | from datetime import datetime 3 | 4 | 5 | def document(): 6 | """Return a mock document.""" 7 | return { 8 | "submitted_date": datetime.now(), 9 | "submitted_date_first": datetime.now(), 10 | "announced_date_first": datetime.now(), 11 | "id": "1234.5678", 12 | "abstract": "very abstract", 13 | "authors": [{"full_name": "F. Bar", "orcid": "1234-5678-9012-3456"}], 14 | "submitter": {"full_name": "S. Ubmitter", "author_id": "su_1"}, 15 | "modified_date": datetime.now(), 16 | "updated_date": datetime.now(), 17 | "is_current": True, 18 | "is_withdrawn": False, 19 | "license": {"uri": "http://foo.license/1", "label": "Notalicense 5.4"}, 20 | "paper_id": "1234.5678", 21 | "paper_id_v": "1234.5678v6", 22 | "title": "tiiiitle", 23 | "source": {"flags": "A", "format": "pdftotex", "size_bytes": 2}, 24 | "version": 6, 25 | "latest": "1234.5678v6", 26 | "latest_version": 6, 27 | "report_num": "somenum1", 28 | "msc_class": ["c1"], 29 | "acm_class": ["z2"], 30 | "journal_ref": "somejournal (1991): 2-34", 31 | "doi": "10.123456/7890", 32 | "comments": "very science", 33 | "abs_categories": "astro-ph.CO foo.BR", 34 | "formats": ["pdf", "other"], 35 | "primary_classification": { 36 | "group": {"id": "foo", "name": "Foo Group"}, 37 | "archive": {"id": "foo", "name": "Foo Archive"}, 38 | "category": {"id": "foo.BR", "name": "Foo Category"}, 39 | }, 40 | "secondary_classification": [ 41 | { 42 | "group": {"id": "foo", "name": "Foo Group"}, 43 | "archive": {"id": "foo", "name": 
"Foo Archive"}, 44 | "category": {"id": "foo.BZ", "name": "Baz Category"}, 45 | } 46 | ], 47 | } 48 | -------------------------------------------------------------------------------- /search/tests/test_advanced_search.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from search.services.index.exceptions import IndexConnectionError 3 | from unittest import TestCase, mock 4 | 5 | from arxiv import taxonomy 6 | from search.factory import create_ui_web_app 7 | 8 | 9 | class TestAdvancedSearch(TestCase): 10 | """Test for the advanced search UI.""" 11 | 12 | def setUp(self): 13 | """Instantiate the UI application.""" 14 | self.app = create_ui_web_app() 15 | self.client = self.app.test_client() 16 | 17 | def test_archive_shortcut(self): 18 | """User requests a sub-path with classification archive.""" 19 | for archive in taxonomy.ARCHIVES.keys(): 20 | response = self.client.get(f"/advanced/{archive}") 21 | self.assertEqual( 22 | response.status_code, 23 | HTTPStatus.OK, 24 | "Should support shortcut for archive {archive}", 25 | ) 26 | 27 | def test_nonexistant_archive_shortcut(self): 28 | """User requests a sub-path with non-existant archive.""" 29 | response = self.client.get("/advanced/fooarchive") 30 | self.assertEqual( 31 | response.status_code, 32 | HTTPStatus.NOT_FOUND, 33 | "Should return a 404 error", 34 | ) 35 | 36 | @mock.patch("search.controllers.advanced.SearchSession") 37 | def test_es_unhandled(self, mock_index): 38 | """Unhandled error in ES service should result in a 500""" 39 | def raiseEr(*args, **kwargs): 40 | raise ValueError(f"Raised by {__file__}") 41 | 42 | mock_index.current_session.side_effect = raiseEr 43 | response = self.client.get("""/advanced?advanced=1&terms-0-operator=AND&""" 44 | """terms-0-term=onion&terms-0-field=title""") 45 | self.assertEqual( 46 | response.status_code, 47 | HTTPStatus.INTERNAL_SERVER_ERROR, 48 | "When service raises a strange error, 500" 49 | ) 50 
| 51 | 52 | @mock.patch("search.controllers.advanced.SearchSession") 53 | def test_es_down(self, mock_index): 54 | """Failure to contact ES should result in a BAD_GATEWAY to distinguishsh it from 55 | more general 500 errors.""" 56 | def raiseEr(*args, **kwargs): 57 | raise IndexConnectionError("Raised by {__file__}") 58 | 59 | mock_index.current_session.side_effect = raiseEr 60 | response = self.client.get("""/advanced?advanced=1&terms-0-operator=AND&""" 61 | """terms-0-term=onion&terms-0-field=title""") 62 | self.assertEqual( 63 | response.status_code, 64 | HTTPStatus.BAD_GATEWAY, 65 | "When ES is down return BAD_GATEWAY. ARXIVNG-5112", 66 | ) 67 | -------------------------------------------------------------------------------- /search/tests/test_param_persistence.py: -------------------------------------------------------------------------------- 1 | """Tests related to the persistence of search parameters in a cookie.""" 2 | 3 | import json 4 | from unittest import TestCase, mock 5 | 6 | from search.factory import create_ui_web_app 7 | from search.controllers.simple.forms import SimpleSearchForm 8 | from search.routes import ui 9 | 10 | 11 | class TestParameterPersistence(TestCase): 12 | """Some search parameters should be saved in a cookie.""" 13 | 14 | def setUp(self): 15 | """Instantiate the UI application.""" 16 | self.app = create_ui_web_app() 17 | self.client = self.app.test_client() 18 | 19 | def test_request_includes_params(self): 20 | """A request is made with parameters indicated for persistence.""" 21 | ui.PARAMS_TO_PERSIST = ["foo", "baz"] 22 | ui.PARAMS_COOKIE_NAME = "foo-cookie" 23 | response = self.client.get("/?foo=bar&baz=bat") 24 | 25 | self.assertIn("Set-Cookie", response.headers, "Should set a cookie") 26 | expected = ( 27 | 'foo-cookie="{\\"foo\\": \\"bar\\"\\054 \\"baz\\": \\"bat\\"}"; ' 28 | "Path=/" 29 | ) 30 | self.assertEqual( 31 | response.headers["Set-Cookie"], 32 | expected, 33 | "Cookie should contain request params", 34 | ) 35 | 
36 | def test_request_does_not_include_params(self): 37 | """The request does not include persistable params.""" 38 | ui.PARAMS_TO_PERSIST = ["foo", "baz"] 39 | ui.PARAMS_COOKIE_NAME = "foo-cookie" 40 | response = self.client.get("/?nope=nope") 41 | self.assertIn("Set-Cookie", response.headers, "Should set a cookie") 42 | self.assertEqual( 43 | response.headers["Set-Cookie"], 44 | 'foo-cookie={}; Path=/', 45 | "Cookie should not contain request params", 46 | ) 47 | 48 | @mock.patch("search.routes.ui.simple") 49 | def test_request_includes_cookie(self, mock_simple): 50 | """The request includes the params cookie.""" 51 | mock_simple.search.return_value = {"form": SimpleSearchForm()}, 200, {} 52 | ui.PARAMS_TO_PERSIST = ["foo", "baz"] 53 | ui.PARAMS_COOKIE_NAME = "foo-cookie" 54 | self.client.set_cookie( 55 | ui.PARAMS_COOKIE_NAME, json.dumps({"foo": "ack"}) 56 | ) 57 | self.client.get("/") 58 | self.assertEqual( 59 | mock_simple.search.call_args[0][0]["foo"], 60 | "ack", 61 | "The value in the cookie should be used", 62 | ) 63 | 64 | @mock.patch("search.routes.ui.simple") 65 | def test_request_includes_cookie_but_also_explicit_val(self, mock_simple): 66 | """The request includes the cookie, but also an explicit value.""" 67 | mock_simple.search.return_value = {"form": SimpleSearchForm()}, 200, {} 68 | ui.PARAMS_TO_PERSIST = ["foo", "baz"] 69 | ui.PARAMS_COOKIE_NAME = "foo-cookie" 70 | self.client.set_cookie( 71 | ui.PARAMS_COOKIE_NAME, json.dumps({"foo": "ack"}) 72 | ) 73 | self.client.get("/?foo=oof") 74 | self.assertEqual( 75 | mock_simple.search.call_args[0][0]["foo"], 76 | "oof", 77 | "The explicit value should be used", 78 | ) 79 | -------------------------------------------------------------------------------- /search/tests/test_searches.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from search.services.index.exceptions import IndexConnectionError 3 | from unittest import TestCase, mock 
4 | 5 | from search.factory import create_ui_web_app 6 | from search.services.index import QueryError 7 | 8 | class TestSearchs(TestCase): 9 | """Tests for error handling in the simple search UI.""" 10 | 11 | def setUp(self): 12 | """Instantiate the UI application.""" 13 | self.app = create_ui_web_app() 14 | self.client = self.app.test_client() 15 | 16 | @mock.patch("search.controllers.simple.SearchSession") 17 | def test_bad_query(self, mock_index): 18 | """Bad query should result in a 400 not a 500. Queries from ARXIVNG-2437 and ARXIVNG-3971.""" 19 | mock_index.current_session().search.side_effect = QueryError 20 | response = self.client.get("/?query=%2F%3F&searchtype=all&source=header") 21 | self.assertEqual( 22 | response.status_code, 23 | HTTPStatus.BAD_REQUEST, 24 | "A query that cannot be parsed by ES should result in 400. ARXIVNG-2437", 25 | ) 26 | 27 | response = self.client.get("/?query=+O%5E*%282.619%5Ek%29+algorithm+for+4-path+vertex+cover&searchtype=all&source=header") 28 | self.assertEqual( 29 | response.status_code, 30 | HTTPStatus.BAD_REQUEST, 31 | "A query that cannot be parsed by ES should result in 400. ARXIVNG-3971" 32 | ) 33 | 34 | @mock.patch("search.controllers.simple.SearchSession") 35 | def test_es_down(self, mock_index): 36 | """Failure to contact ES should result in a BAD_GATEWAY to distinguish it from 37 | more general 500 errors.""" 38 | mock_index.current_session().search.side_effect = IndexConnectionError 39 | 40 | response = self.client.get("/?query=cheese&searchtype=all&source=header") 41 | self.assertEqual( 42 | response.status_code, 43 | HTTPStatus.BAD_GATEWAY, 44 | "When ES is down return BAD_GATEWAY. 
ARXIVNG-5112", 45 | ) 46 | -------------------------------------------------------------------------------- /search/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides utility functions.""" 2 | __all__ = ["safe_str", "DateTime", "to_utc", "utc_now"] 3 | 4 | from search.utils.string import safe_str 5 | from search.utils.timestamp import DateTime, to_utc, utc_now 6 | -------------------------------------------------------------------------------- /search/utils/string.py: -------------------------------------------------------------------------------- 1 | """String utilities.""" 2 | 3 | from typing import Union 4 | 5 | 6 | def safe_str(s: Union[str, bytes]) -> str: 7 | """Return a UTF decoded string from bytes or the original string.""" 8 | if isinstance(s, bytes): 9 | return s.decode("utf-8") 10 | return s 11 | -------------------------------------------------------------------------------- /search/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for utility functions.""" 2 | -------------------------------------------------------------------------------- /search/utils/tests/test_string.py: -------------------------------------------------------------------------------- 1 | from search.utils.string import safe_str 2 | 3 | from unittest import TestCase 4 | 5 | 6 | class TestSafeStr(TestCase): 7 | def test_safe_str(self): 8 | self.assertEqual(safe_str("foo"), "foo") 9 | self.assertEqual(safe_str(b"foo"), "foo") 10 | self.assertEqual(safe_str("Schröder"), "Schröder") 11 | self.assertEqual(safe_str("Schröder".encode("utf-8")), "Schröder") 12 | -------------------------------------------------------------------------------- /search/utils/timestamp.py: -------------------------------------------------------------------------------- 1 | """Timestamps utilities.""" 2 | from typing import Optional, Union 3 | from datetime import datetime, 
timezone 4 | 5 | from dateutil import parser 6 | 7 | 8 | class DateTime(datetime): 9 | """DateTime is a hack wrapper around datetime. 10 | 11 | Feedgen doesn't have custom timestamp formatting. It uses isoformat, so 12 | we use a custom class that overrides the isoformat method. 13 | """ 14 | 15 | def isoformat(self, sep: str = "T", timespec: str = "auto") -> str: 16 | """Return the datetime formatted as ``YYYY-MM-DDTHH:MM:SSZ``; ``sep`` and ``timespec`` are ignored.""" 17 | return self.strftime("%Y-%m-%dT%H:%M:%SZ") 18 | 19 | @property 20 | def tzinfo(self) -> timezone: 21 | """Return the object's timezone, which is always reported as UTC.""" 22 | return timezone.utc 23 | 24 | 25 | def utc_now() -> Union[DateTime, datetime]: 26 | """Return the current timestamp as a DateTime, which reports UTC as its timezone.""" 27 | return DateTime.fromtimestamp( 28 | datetime.utcnow().astimezone(timezone.utc).timestamp() 29 | ) 30 | 31 | 32 | def to_utc( 33 | dt: Optional[Union[DateTime, datetime, str]] 34 | ) -> Union[DateTime, datetime]: 35 | """Localize datetime objects to UTC timezone. 36 | 37 | If the datetime object is None return current timestamp. NOTE(review): the result is built with fromtimestamp() without a tz argument, so it looks like local wall-clock time is returned when the process timezone is not UTC - confirm. 
38 | """ 39 | if dt is None: 40 | return utc_now() 41 | if isinstance(dt, str): 42 | try: 43 | parsed_dt = parser.parse(dt) 44 | return DateTime.fromtimestamp(parsed_dt.timestamp()) 45 | except Exception: 46 | return utc_now() 47 | return DateTime.fromtimestamp(dt.astimezone(timezone.utc).timestamp()) 48 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203,W503 3 | 4 | [pydocstyle] 5 | convention = numpy 6 | add-ignore = D100,D101,D102,D103,D104,D202,D401 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/tests/__init__.py -------------------------------------------------------------------------------- /tests/base_app_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run :mod:`arxiv.base.app_tests`. 3 | 4 | These are run separately from the rest of the tests in :mod:`search`. 5 | """ 6 | 7 | import unittest 8 | from search.factory import create_ui_web_app 9 | 10 | app = create_ui_web_app() 11 | app.app_context().push() 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /tests/data/docmeta.json: -------------------------------------------------------------------------------- 1 | {"comments":"18 pages, 5 figures. I have completely rewritten this paper because after uploading the previous version I realised that there is a much better approach. Note the change to the title. 
Have included minor corrections following review","announced_date_first":"2016-06","submitted_date_all":["2016-06-01T01:50:11-0400","2016-10-20T23:44:02-0400","2017-06-06T22:07:47-0400"],"formats":["pdf","other"],"is_current":true,"author_owners":[{"orcid":"0000-0002-7133-2884","last_name":"McLean","first_name":"William"}],"primary_classification":{"group":{"name":"Mathematics","id":"math"},"archive":{"name":"Mathematics","id":"math"},"category":{"name":"Numerical Analysis","id":"math.NA"}},"doi":null,"authors_utf8":"William McLean","authors":"William McLean","secondary_classification":[],"modified_date":"2017-06-07T20:03:19-0400","document_id":1166310,"msc_class":"41A25, 65D32, 65Y20","is_withdrawn":false,"comments_utf8":"18 pages, 5 figures. I have completely rewritten this paper because after uploading the previous version I realised that there is a much better approach. Note the change to the title. Have included minor corrections following review","abstract":"Given $\\beta>0$ and $\\delta>0$, the function $t^{-\\beta}$ may be approximated for $t$ in a compact interval $[\\delta,T]$ by a sum of terms of the form $we^{-at}$, with parameters $w>0$ and $a>0$. One such an approximation, studied by Beylkin and Monz\\'on, is obtained by applying the trapezoidal rule to an integral representation of $t^{-\\beta}$, after which Prony's method is applied to reduce the number of terms in the sum with essentially no loss of accuracy. We review this method, and then describe a similar approach based on an alternative integral representation. 
The main difference is that the new approach achieves much better results before the application of Prony's method; after applying Prony's method the performance of both is much the same.","title":"Exponential sum approximations for $t^{-\\beta}$","journal_ref":null,"proxy":null,"submitter":{"is_author":true,"email":"w.mclean@unsw.edu.au","orcid":"0000-0002-7133-2884","name":"William McLean","name_utf8":"William McLean"},"paper_id":"1606.00123","source":{"flags":"","format":"pdftex","size_bytes":78752},"title_utf8":"Exponential sum approximations for $t^{-β}$","submitted_date":"2017-06-06T22:07:47-0400","abs_categories":"math.NA","metadata_id":1948698,"acm_class":null,"report_num":null,"version":3,"abstract_utf8":"Given $β>0$ and $δ>0$, the function $t^{-β}$ may be approximated for $t$ in a compact interval $[δ,T]$ by a sum of terms of the form $we^{-at}$, with parameters $w>0$ and $a>0$. One such an approximation, studied by Beylkin and Monzón, is obtained by applying the trapezoidal rule to an integral representation of $t^{-β}$, after which Prony's method is applied to reduce the number of terms in the sum with essentially no loss of accuracy. We review this method, and then describe a similar approach based on an alternative integral representation. 
The main difference is that the new approach achieves much better results before the application of Prony's method; after applying Prony's method the performance of both is much the same.","journal_ref_utf8":null,"license":{"label":"arXiv.org perpetual, non-exclusive license to distribute this article","uri":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/"},"authors_parsed":[{"last_name":"McLean","first_name":"William"}],"updated_date":null} 2 | -------------------------------------------------------------------------------- /tests/data/examples/1401.1012.json: -------------------------------------------------------------------------------- 1 | {"comments": "3 figures", "announced_date_first": "2014-01", "submitted_date_all": ["2014-01-06T03:27:44-0500"], "formats": ["pdf", "ps", "other"], "is_current": true, "author_owners": [{"last_name": "Son", "first_name": "W."}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Quantum Physics", "id": "quant-ph"}, "category": {"name": "Quantum Physics", "id": "quant-ph"}}, "doi": "10.3938/jkps.64.499", "authors_utf8": "Wonmin Son", "authors": "Wonmin Son", "secondary_classification": [], "modified_date": "2014-07-15T20:07:28-0400", "document_id": 918523, "msc_class": null, "is_withdrawn": false, "comments_utf8": "3 figures", "abstract": "According to the theory of relativity and causality, a special type of correlation beyond quantum mechanics is possible in principle under the name of {\\it non-local box}. The concept has been introduced from the principle of non-locality which satisfies relativistic causality. In this paper, we show that a correlation leading to the non-local box is possible to be derived consistently if we release the one of major axioms in quantum mechanics, {\\it Born's rule}. This allows us to obtain a theory which in one end of the spectrum agrees with the classical probability and in the other end, agrees with the theory of non-local causality. 
At the same time, we argue that the correlation lies in a space with special mathematical constraints such that a physical realization of the correlation through a probability measure is not possible in one direction of its limit and is possible in the other limit.", "title": "Consistent theory for causal non-locality beyond Born's rule", "journal_ref": "Journal of the Korean Physical Society February 2014, Volume 64, Issue 4, pp 499-503", "proxy": null, "submitter": {"is_author": true, "email": "sonwm@physics.org", "name": "W. Son", "name_utf8": "W. Son"}, "paper_id": "1401.1012", "source": {"flags": "", "format": "tex", "size_bytes": 249986}, "title_utf8": "Consistent theory for causal non-locality beyond Born's rule", "submitted_date": "2014-01-06T03:27:44-0500", "abs_categories": "quant-ph", "metadata_id": 1354687, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "According to the theory of relativity and causality, a special type of correlation beyond quantum mechanics is possible in principle under the name of {\\it non-local box}. The concept has been introduced from the principle of non-locality which satisfies relativistic causality. In this paper, we show that a correlation leading to the non-local box is possible to be derived consistently if we release the one of major axioms in quantum mechanics, {\\it Born's rule}. This allows us to obtain a theory which in one end of the spectrum agrees with the classical probability and in the other end, agrees with the theory of non-local causality. 
At the same time, we argue that the correlation lies in a space with special mathematical constraints such that a physical realization of the correlation through a probability measure is not possible in one direction of its limit and is possible in the other limit.", "journal_ref_utf8": "Journal of the Korean Physical Society February 2014, Volume 64, Issue 4, pp 499-503", "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Son", "first_name": "Wonmin"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1403.6219.json: -------------------------------------------------------------------------------- 1 | {"comments": "proof of diameter bound added in section 2, modification of the approximation scheme of cone metrics for \\lambda < 0 in section 2, results are unchanged", "announced_date_first": "2014-03", "submitted_date_all": ["2014-03-24T23:05:23-0400", "2014-07-04T11:40:06-0400"], "formats": ["pdf", "ps", "other"], "is_current": true, "author_owners": [{"last_name": "Datar", "first_name": "Ved"}], "primary_classification": {"group": {"name": "Mathematics", "id": "math"}, "archive": {"name": "Mathematics", "id": "math"}, "category": {"name": "Differential Geometry", "id": "math.DG"}}, "doi": null, "authors_utf8": "Ved V. Datar", "authors": "Ved V. 
Datar", "secondary_classification": [], "modified_date": "2014-07-06T20:10:33-0400", "document_id": 939398, "msc_class": null, "is_withdrawn": false, "comments_utf8": "proof of diameter bound added in section 2, modification of the approximation scheme of cone metrics for \u03bb< 0 in section 2, results are unchanged", "abstract": "In this note we prove convexity, in the sense of Colding-Naber, of the regular set of solutions to some complex Monge-Ampere equations with conical singularities along simple normal crossing divisors. In particular, any two points in the regular set can be joined by a smooth minimal geodesic lying entirely in the regular set. We show that as a result, the classical theorems of Myers and Bishop-Gromov extend almost verbatim to this singular setting.", "title": "On convexity of the regular set of conical Kahler-Einstein metrics", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "veddatar@math.rutgers.edu", "name": "Ved Datar", "name_utf8": "Ved Datar"}, "paper_id": "1403.6219", "source": {"flags": "1", "format": "tex", "size_bytes": 16530}, "title_utf8": "On convexity of the regular set of conical Kahler-Einstein metrics", "submitted_date": "2014-07-04T11:40:06-0400", "abs_categories": "math.DG", "metadata_id": 1431448, "acm_class": null, "report_num": null, "version": 2, "abstract_utf8": "In this note we prove convexity, in the sense of Colding-Naber, of the regular set of solutions to some complex Monge-Ampere equations with conical singularities along simple normal crossing divisors. In particular, any two points in the regular set can be joined by a smooth minimal geodesic lying entirely in the regular set. 
We show that as a result, the classical theorems of Myers and Bishop-Gromov extend almost verbatim to this singular setting.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Datar", "first_name": "Ved V."}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1403.6219v1.json: -------------------------------------------------------------------------------- 1 | {"comments": null, "announced_date_first": "2014-03", "submitted_date_all": ["2014-03-24T23:05:23-0400", "2014-07-04T11:40:06-0400"], "formats": ["pdf", "ps", "other"], "is_current": false, "author_owners": [{"last_name": "Datar", "first_name": "Ved"}], "primary_classification": {"group": {"name": "Mathematics", "id": "math"}, "archive": {"name": "Mathematics", "id": "math"}, "category": {"name": "Differential Geometry", "id": "math.DG"}}, "doi": null, "authors_utf8": "Ved V. Datar", "authors": "Ved V. Datar", "secondary_classification": [], "modified_date": "2014-03-25T20:04:37-0400", "document_id": 939398, "msc_class": null, "is_withdrawn": false, "comments_utf8": null, "abstract": "In this note we prove convexity, in the sense of Colding-Naber, of the regular set of solutions to some complex Monge-Ampere equations with conical singularities along simple normal crossing divisors. In particular, any two points in the regular set can be joined by a smooth minimal geodesic lying entirely in the regular set. 
We show that as a result, the classical theorems of Myers and Bishop-Gromov extend almost verbatim to this singular setting.", "title": "On convexity of the regular set of conical Kahler-Einstein metrics", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "veddatar@math.rutgers.edu", "name": "Ved Datar", "name_utf8": "Ved Datar"}, "paper_id": "1403.6219", "source": {"flags": "1", "format": "tex", "size_bytes": 14993}, "title_utf8": "On convexity of the regular set of conical Kahler-Einstein metrics", "submitted_date": "2014-03-24T23:05:23-0400", "abs_categories": "math.DG", "metadata_id": 1388199, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "In this note we prove convexity, in the sense of Colding-Naber, of the regular set of solutions to some complex Monge-Ampere equations with conical singularities along simple normal crossing divisors. In particular, any two points in the regular set can be joined by a smooth minimal geodesic lying entirely in the regular set. 
We show that as a result, the classical theorems of Myers and Bishop-Gromov extend almost verbatim to this singular setting.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Datar", "first_name": "Ved V."}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1509.08727.json: -------------------------------------------------------------------------------- 1 | {"comments": "19 pages, 2 figures", "announced_date_first": "2015-09", "submitted_date_all": ["2015-09-29T08:58:05-0400"], "formats": ["pdf", "ps", "other"], "is_current": true, "author_owners": [{"last_name": "Schroder", "first_name": "York"}, {"last_name": "Moeller", "first_name": "Jan"}, {"last_name": "Ghi\u015foiu", "first_name": "Ioan"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "High Energy Physics - Phenomenology", "id": "hep-ph"}, "category": {"name": "High Energy Physics - Phenomenology", "id": "hep-ph"}}, "doi": "10.1007/JHEP11(2015)121", "authors_utf8": "Ioan Ghisoiu, Jan Moller, York Schroder", "authors": "Ioan Ghisoiu, Jan Moller, York Schroder", "secondary_classification": [{"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "High Energy Physics - Lattice", "id": "hep-lat"}, "category": {"name": "High Energy Physics - Lattice", "id": "hep-lat"}}, {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Nuclear Theory", "id": "nucl-th"}, "category": {"name": "Nuclear Theory", "id": "nucl-th"}}], "modified_date": "2016-01-20T10:56:49-0500", "document_id": 1091646, "msc_class": null, "is_withdrawn": false, "comments_utf8": "19 pages, 2 figures", "abstract": "Building upon our earlier work, we compute a Debye mass of finite-temperature Yang-Mills theory to three-loop order. 
As an application, we determine a $g^7$ contribution to the thermodynamic pressure of hot QCD.", "title": "Debye screening mass of hot Yang-Mills theory to three-loop order", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "yschroeder@ubiobio.cl", "name": "York Schroder", "name_utf8": "York Schroder"}, "paper_id": "1509.08727", "source": {"flags": "", "format": "tex", "size_bytes": 180419}, "title_utf8": "Debye screening mass of hot Yang-Mills theory to three-loop order", "submitted_date": "2015-09-29T08:58:05-0400", "abs_categories": "hep-ph hep-lat nucl-th", "metadata_id": 1634116, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "Building upon our earlier work, we compute a Debye mass of finite-temperature Yang-Mills theory to three-loop order. As an application, we determine a $g^7$ contribution to the thermodynamic pressure of hot QCD.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Ghisoiu", "first_name": "Ioan"}, {"last_name": "Moller", "first_name": "Jan"}, {"last_name": "Schroder", "first_name": "York"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1511.07473.json: -------------------------------------------------------------------------------- 1 | {"comments": "This article has been administratively withdrawn because of authorship dispute and retraction by the journal", "announced_date_first": "2015-11", "submitted_date_all": ["2015-11-20T14:09:54-0500", "2016-08-11T13:56:35-0400"], "formats": ["src"], "is_current": true, "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "Instrumentation and Detectors", "id": "physics.ins-det"}}, "doi": "10.1002/cta.2157", 
"authors_utf8": "Mohsen Salehi", "authors": "Mohsen Salehi", "secondary_classification": [], "modified_date": "2016-08-11T20:09:47-0400", "document_id": 1108933, "msc_class": null, "is_withdrawn": true, "comments_utf8": "This article has been administratively withdrawn because of authorship dispute and retraction by the journal", "abstract": "In this paper, an ultra-fast frequency shift-keying (FSK) modulation technique based on switched capacitor resonators is presented. It is demonstrated that switching a reactive component such as a capacitor, in a high-Q resonator with proper switching signal can preserve the stored energy and shift it to a different frequency. Switching boundaries are found by continuity of electric charge and magnetic flux. It is shown that if switching time is synchronous with zero crossing of the voltage signal across the switched capacitor, impulsive components can be avoided and continuity of electric charge is satisfied without energy dissipation. We use this property to realize a fast binary frequency-shift keying (FSK) modulator with only a single RF source. In this technique, the modulation rate is independent of the resonator bandwidth and can be as high as the lower carrier frequency. Experimental results are presented to validate the simulations.", "title": "High-speed FSK Modulator Using Switched-capacitor Resonators", "journal_ref": null, "proxy": null, "submitter": {"email": "help@arxiv.org", "name": "arXiv Admin", "name_utf8": "arXiv Admin"}, "paper_id": "1511.07473", "source": {"flags": "1", "format": "withdrawn", "size_bytes": 42}, "title_utf8": "High-speed FSK Modulator Using Switched-capacitor Resonators", "submitted_date": "2016-08-11T13:56:35-0400", "abs_categories": "physics.ins-det", "metadata_id": 1791369, "acm_class": null, "report_num": null, "version": 2, "abstract_utf8": "In this paper, an ultra-fast frequency shift-keying (FSK) modulation technique based on switched capacitor resonators is presented. 
It is demonstrated that switching a reactive component such as a capacitor, in a high-Q resonator with proper switching signal can preserve the stored energy and shift it to a different frequency. Switching boundaries are found by continuity of electric charge and magnetic flux. It is shown that if switching time is synchronous with zero crossing of the voltage signal across the switched capacitor, impulsive components can be avoided and continuity of electric charge is satisfied without energy dissipation. We use this property to realize a fast binary frequency-shift keying (FSK) modulator with only a single RF source. In this technique, the modulation rate is independent of the resonator bandwidth and can be as high as the lower carrier frequency. Experimental results are presented to validate the simulations.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Salehi", "first_name": "Mohsen"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1511.07473v1.json: -------------------------------------------------------------------------------- 1 | {"comments": null, "announced_date_first": "2015-11", "submitted_date_all": ["2015-11-20T14:09:54-0500", "2016-08-11T13:56:35-0400"], "formats": ["pdfonly"], "is_current": false, "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "Instrumentation and Detectors", "id": "physics.ins-det"}}, "doi": "10.1002/cta.2157", "authors_utf8": "Mohsen Salehi", "authors": "Mohsen Salehi", "secondary_classification": [], "modified_date": "2016-08-11T13:53:11-0400", "document_id": 1108933, "msc_class": null, "is_withdrawn": false, "comments_utf8": null, "abstract": "In this paper, an ultra-fast frequency 
shift-keying (FSK) modulation technique based on switched capacitor resonators is presented. It is demonstrated that switching a reactive component such as a capacitor, in a high-Q resonator with proper switching signal can preserve the stored energy and shift it to a different frequency. Switching boundaries are found by continuity of electric charge and magnetic flux. It is shown that if switching time is synchronous with zero crossing of the voltage signal across the switched capacitor, impulsive components can be avoided and continuity of electric charge is satisfied without energy dissipation. We use this property to realize a fast binary frequency-shift keying (FSK) modulator with only a single RF source. In this technique, the modulation rate is independent of the resonator bandwidth and can be as high as the lower carrier frequency. Experimental results are presented to validate the simulations.", "title": "High-speed FSK Modulator Using Switched-capacitor Resonators", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "msalehi@vt.edu", "name": "Mohsen Salehi", "name_utf8": "Mohsen Salehi"}, "paper_id": "1511.07473", "source": {"flags": "1", "format": "pdf", "size_bytes": 840596}, "title_utf8": "High-speed FSK Modulator Using Switched-capacitor Resonators", "submitted_date": "2015-11-20T14:09:54-0500", "abs_categories": "physics.ins-det", "metadata_id": 1661914, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "In this paper, an ultra-fast frequency shift-keying (FSK) modulation technique based on switched capacitor resonators is presented. It is demonstrated that switching a reactive component such as a capacitor, in a high-Q resonator with proper switching signal can preserve the stored energy and shift it to a different frequency. Switching boundaries are found by continuity of electric charge and magnetic flux. 
It is shown that if switching time is synchronous with zero crossing of the voltage signal across the switched capacitor, impulsive components can be avoided and continuity of electric charge is satisfied without energy dissipation. We use this property to realize a fast binary frequency-shift keying (FSK) modulator with only a single RF source. In this technique, the modulation rate is independent of the resonator bandwidth and can be as high as the lower carrier frequency. Experimental results are presented to validate the simulations.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Salehi", "first_name": "Mohsen"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1607.05107.json: -------------------------------------------------------------------------------- 1 | {"comments": "7 pages, 1 supplementary file with figures of adsorption configurations and energies", "announced_date_first": "2016-07", "submitted_date_all": ["2016-07-18T10:43:28-0400"], "formats": ["pdf", "other"], "is_current": true, "author_owners": [{"author_id": "schroder_e_1", "orcid": "0000-0003-4995-3585", "last_name": "Schroder", "first_name": "Elsebeth"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "Computational Physics", "id": "physics.comp-ph"}}, "doi": "10.1016/j.susc.2017.06.012", "authors_utf8": "\u00d8yvind Borck and Elsebeth Schr\u00f6der", "authors": "{\\O}yvind Borck and Elsebeth Schr\\\"oder", "secondary_classification": [{"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "Chemical Physics", "id": "physics.chem-ph"}}], "modified_date": "2017-08-23T14:40:27-0400", 
"document_id": 1180938, "msc_class": null, "is_withdrawn": false, "comments_utf8": "7 pages, 1 supplementary file with figures of adsorption configurations and energies", "abstract": "We present a theory study of the physisorption of the series of methylbenzenes (toluene, xylene and mesitylene), as well as benzene, on graphene. This is relevant for the basic understanding of graphene used as a material for sensors and as an idealized model for the carbon in active carbon filters. The molecules are studied in a number of positions and orientations relative graphene, using density functional theory with the van der Waals functional vdW-DF. We focus on the vdW-DF1 and vdW-DF-cx functionals, and find that the binding energy of the molecules on graphene grows linearly with the number of methyl groups, at the rate of 0.09 eV per added methyl group.", "title": "Methylbenzenes on graphene", "journal_ref": null, "proxy": null, "submitter": {"author_id": "schroder_e_1", "is_author": true, "email": "schroder@chalmers.se", "orcid": "0000-0003-4995-3585", "name": "Elsebeth Schroder", "name_utf8": "Elsebeth Schroder"}, "paper_id": "1607.05107", "source": {"flags": "A", "format": "pdftex", "size_bytes": 1292640}, "title_utf8": "Methylbenzenes on graphene", "submitted_date": "2016-07-18T10:43:28-0400", "abs_categories": "physics.comp-ph physics.chem-ph", "metadata_id": 1779359, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "We present a theory study of the physisorption of the series of methylbenzenes (toluene, xylene and mesitylene), as well as benzene, on graphene. This is relevant for the basic understanding of graphene used as a material for sensors and as an idealized model for the carbon in active carbon filters. The molecules are studied in a number of positions and orientations relative graphene, using density functional theory with the van der Waals functional vdW-DF. 
We focus on the vdW-DF1 and vdW-DF-cx functionals, and find that the binding energy of the molecules on graphene grows linearly with the number of methyl groups, at the rate of 0.09 eV per added methyl group.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Borck", "first_name": "\u00d8yvind"}, {"last_name": "Schr\u00f6der", "first_name": "Elsebeth"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1703.09067.json: -------------------------------------------------------------------------------- 1 | {"comments": "13 pages, 4 figures. To be published in Chinese Physics C", "announced_date_first": "2017-03", "submitted_date_all": ["2017-03-24T03:40:30-0400"], "formats": ["pdf", "ps", "other"], "is_current": true, "author_owners": [{"orcid": "0000-0002-6514-940X", "last_name": "Valcarce", "first_name": "Alfredo"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Nuclear Theory", "id": "nucl-th"}, "category": {"name": "Nuclear Theory", "id": "nucl-th"}}, "doi": "10.1088/1674-1137/41/7/074102", "authors_utf8": "H. Garcilazo, A. Valcarce, and J. Vijande", "authors": "H. Garcilazo, A. Valcarce, and J. Vijande", "secondary_classification": [{"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "High Energy Physics - Phenomenology", "id": "hep-ph"}, "category": {"name": "High Energy Physics - Phenomenology", "id": "hep-ph"}}], "modified_date": "2017-05-11T20:03:24-0400", "document_id": 1260851, "msc_class": null, "is_withdrawn": false, "comments_utf8": "13 pages, 4 figures. 
To be published in Chinese Physics C", "abstract": "Using local central Yukawa-type Malfliet-Tjon interactions reproducing the low-energy parameters and phase shifs of the $nn$ system and the latest updates of the $n\\Lambda$ and $\\Lambda\\Lambda$ Nijmegen ESC08c potentials we study the possible existence of a ${}_{\\Lambda\\Lambda}^{\\,\\,\\,\\,4}n$ bound state. Our results indicate that the ${}_{\\Lambda\\Lambda}^{\\,\\,\\,\\,4}n$ is unbound, being just above threshold. We discuss the role played by the $^1S_0$ $nn$ repulsive term of the Yukawa-type Malfliet-Tjon interaction.", "title": "The ${}_{\\Lambda\\Lambda}^{\\,\\,\\,\\,4}n$ system", "journal_ref": "Chin. Phys. C 41, 074102 (2017)", "proxy": null, "submitter": {"is_author": true, "email": "valcarce@usal.es", "orcid": "0000-0002-6514-940X", "name": "Alfredo Valcarce", "name_utf8": "Alfredo Valcarce"}, "paper_id": "1703.09067", "source": {"flags": "", "format": "tex", "size_bytes": 246092}, "title_utf8": "The ${}_{\u039b\u039b}^{\\,\\,\\,\\,4}n$ system", "submitted_date": "2017-03-24T03:40:30-0400", "abs_categories": "nucl-th hep-ph", "metadata_id": 1909317, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "Using local central Yukawa-type Malfliet-Tjon interactions reproducing the low-energy parameters and phase shifs of the $nn$ system and the latest updates of the $n\u039b$ and $\u039b\u039b$ Nijmegen ESC08c potentials we study the possible existence of a ${}_{\u039b\u039b}^{\\,\\,\\,\\,4}n$ bound state. Our results indicate that the ${}_{\u039b\u039b}^{\\,\\,\\,\\,4}n$ is unbound, being just above threshold. We discuss the role played by the $^1S_0$ $nn$ repulsive term of the Yukawa-type Malfliet-Tjon interaction.", "journal_ref_utf8": "Chin. Phys. 
C 41, 074102 (2017)", "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Garcilazo", "first_name": "H."}, {"last_name": "Valcarce", "first_name": "A."}, {"last_name": "Vijande", "first_name": "J."}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1708.07156.json: -------------------------------------------------------------------------------- 1 | {"comments": null, "announced_date_first": "2017-08", "submitted_date_all": ["2017-08-23T15:12:30-0400"], "formats": ["pdf", "other"], "is_current": true, "author_owners": [{"last_name": "Jacobs", "first_name": "Gustaaf"}], "primary_classification": {"group": {"name": "Mathematics", "id": "math"}, "archive": {"name": "Mathematics", "id": "math"}, "category": {"name": "Numerical Analysis", "id": "math.NA"}}, "doi": null, "authors_utf8": "Wissink B.W and Jacobs G.B. and Ryan J.K. and Don W-S and van der Weide E.T.A", "authors": "Wissink B.W and Jacobs G.B. and Ryan J.K. and Don W-S and van der Weide E.T.A", "secondary_classification": [], "modified_date": "2017-08-24T20:00:46-0400", "document_id": 1310448, "msc_class": null, "is_withdrawn": false, "comments_utf8": null, "abstract": "A smoothness-increasing accuracy conserving filtering approach to the regularization of discontinuities is presented for single domain spectral collocation approximations of hyperbolic conservation laws. The filter is based on convolution of a polynomial kernel that approximates a delta-sequence. The kernel combines a $k^{th}$ order smoothness with an arbitrary number of ${m}$ zero moments. The zero moments ensure a $m^{th}$ order accurate approximation of the delta-sequence to the delta function. Through exact quadrature the projection error of the polynomial kernel on the spectral basis is ensured to be less than the moment error. 
A number of test cases on the advection equation, Burger's equation and Euler equations in 1D and 2D shown that the filter regularizes discontinuities while preserving high-order resolution", "title": "Shock Regularization with Smoothness-Increasing Accuracy-Conserving Dirac-Delta Polynomial Kernels", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "gjacobs@mail.sdsu.edu", "name": "Gustaaf Jacobs", "name_utf8": "Gustaaf Jacobs"}, "paper_id": "1708.07156", "source": {"flags": "", "format": "pdftex", "size_bytes": 808067}, "title_utf8": "Shock Regularization with Smoothness-Increasing Accuracy-Conserving Dirac-Delta Polynomial Kernels", "submitted_date": "2017-08-23T15:12:30-0400", "abs_categories": "math.NA", "metadata_id": 1990179, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "A smoothness-increasing accuracy conserving filtering approach to the regularization of discontinuities is presented for single domain spectral collocation approximations of hyperbolic conservation laws. The filter is based on convolution of a polynomial kernel that approximates a delta-sequence. The kernel combines a $k^{th}$ order smoothness with an arbitrary number of ${m}$ zero moments. The zero moments ensure a $m^{th}$ order accurate approximation of the delta-sequence to the delta function. Through exact quadrature the projection error of the polynomial kernel on the spectral basis is ensured to be less than the moment error. 
A number of test cases on the advection equation, Burger's equation and Euler equations in 1D and 2D shown that the filter regularizes discontinuities while preserving high-order resolution", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "W", "first_name": "Wissink B."}, {"last_name": "B.", "first_name": "Jacobs G."}, {"last_name": "K.", "first_name": "Ryan J."}, {"last_name": "W-S", "first_name": "Don"}, {"last_name": "A", "first_name": "van der Weide E. T."}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1712.04442.json: -------------------------------------------------------------------------------- 1 | {"comments": "11 pages,11 figures, 1 table", "announced_date_first": "2017-12", "submitted_date_all": ["2017-12-11T20:10:00-0500", "2017-12-26T15:05:30-0500", "2018-02-21T10:41:55-0500"], "formats": ["pdfonly"], "is_current": true, "author_owners": [{"last_name": "Grebel", "first_name": "Haim"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "General Physics", "id": "physics.gen-ph"}}, "doi": null, "authors_utf8": "Haim Grebel", "authors": "Haim Grebel", "secondary_classification": [], "modified_date": "2018-02-21T20:09:14-0500", "document_id": 1351321, "msc_class": null, "is_withdrawn": false, "comments_utf8": "11 pages,11 figures, 1 table", "abstract": "Capacitors are typically connected together in one of two configurations: either in series, or in parallel. Here, a new capacitive element is introduced: a capacitor-within-capacitor (CWC). The overall capacitance of the new structure is larger than an ordinary two-plate capacitor by at least 50% and its capacitance may be electrically controlled. 
When the 'ordinary' dielectric layers between plates were replaced by tissues soaked with ionic liquid, an increase of ca 10,000 was observed. Overall, this concept is deemed suitable for large capacity energy storage and tunable electronic circuitry.", "title": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "grebel@njit.edu", "name": "Haim Grebel", "name_utf8": "Haim Grebel"}, "paper_id": "1712.04442", "source": {"flags": "1", "format": "pdf", "size_bytes": 624721}, "title_utf8": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "submitted_date": "2018-02-21T10:41:55-0500", "abs_categories": "physics.gen-ph", "metadata_id": 2095158, "acm_class": null, "report_num": null, "version": 3, "abstract_utf8": "Capacitors are typically connected together in one of two configurations: either in series, or in parallel. Here, a new capacitive element is introduced: a capacitor-within-capacitor (CWC). The overall capacitance of the new structure is larger than an ordinary two-plate capacitor by at least 50% and its capacitance may be electrically controlled. When the 'ordinary' dielectric layers between plates were replaced by tissues soaked with ionic liquid, an increase of ca 10,000 was observed. 
Overall, this concept is deemed suitable for large capacity energy storage and tunable electronic circuitry.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Grebel", "first_name": "Haim"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1712.04442v1.json: -------------------------------------------------------------------------------- 1 | {"comments": "6 pages, 5 figures", "announced_date_first": "2017-12", "submitted_date_all": ["2017-12-11T20:10:00-0500", "2017-12-26T15:05:30-0500", "2018-02-21T10:41:55-0500"], "formats": ["pdfonly"], "is_current": false, "author_owners": [{"last_name": "Grebel", "first_name": "Haim"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "General Physics", "id": "physics.gen-ph"}}, "doi": null, "authors_utf8": "Haim Grebel", "authors": "Haim Grebel", "secondary_classification": [], "modified_date": "2017-12-13T20:00:10-0500", "document_id": 1351321, "msc_class": null, "is_withdrawn": false, "comments_utf8": "6 pages, 5 figures", "abstract": "The capacitance of a capacitor-within-capacitor structure is assessed through simulations. 
The overall capacitance of the structure may substantially increase and the overall capacitance may be electrically controlled.", "title": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "grebel@njit.edu", "name": "Haim Grebel", "name_utf8": "Haim Grebel"}, "paper_id": "1712.04442", "source": {"flags": "1", "format": "pdf", "size_bytes": 347173}, "title_utf8": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "submitted_date": "2017-12-11T20:10:00-0500", "abs_categories": "physics.gen-ph", "metadata_id": 2055884, "acm_class": null, "report_num": null, "version": 1, "abstract_utf8": "The capacitance of a capacitor-within-capacitor structure is assessed through simulations. The overall capacitance of the structure may substantially increase and the overall capacitance may be electrically controlled.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Grebel", "first_name": "Haim"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/data/examples/1712.04442v2.json: -------------------------------------------------------------------------------- 1 | {"comments": "9 pages,10 figures", "announced_date_first": "2017-12", "submitted_date_all": ["2017-12-11T20:10:00-0500", "2017-12-26T15:05:30-0500", "2018-02-21T10:41:55-0500"], "formats": ["pdfonly"], "is_current": false, "author_owners": [{"last_name": "Grebel", "first_name": "Haim"}], "primary_classification": {"group": {"name": "Physics", "id": "physics"}, "archive": {"name": "Physics", "id": "physics"}, "category": {"name": "General Physics", "id": "physics.gen-ph"}}, "doi": null, "authors_utf8": "Haim Grebel", "authors": "Haim Grebel", "secondary_classification": [], "modified_date": 
"2017-12-28T20:00:57-0500", "document_id": 1351321, "msc_class": null, "is_withdrawn": false, "comments_utf8": "9 pages,10 figures", "abstract": "Capacitors are typically connected together in one of two configurations: either in series, or in parallel. Here, a new configuration is introduced: a capacitor-within-capacitor (CWC). The overall capacitance of the new structure is larger than an ordinary two-plate capacitor and may be electrically controlled. It thus has implications to electronic circuitry and energy storage elements alike.", "title": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "journal_ref": null, "proxy": null, "submitter": {"is_author": true, "email": "grebel@njit.edu", "name": "Haim Grebel", "name_utf8": "Haim Grebel"}, "paper_id": "1712.04442", "source": {"flags": "1", "format": "pdf", "size_bytes": 624243}, "title_utf8": "Capacitor-within-Capacitor: Electrically Controlled Capacitors", "submitted_date": "2017-12-26T15:05:30-0500", "abs_categories": "physics.gen-ph", "metadata_id": 2063705, "acm_class": null, "report_num": null, "version": 2, "abstract_utf8": "Capacitors are typically connected together in one of two configurations: either in series, or in parallel. Here, a new configuration is introduced: a capacitor-within-capacitor (CWC). The overall capacitance of the new structure is larger than an ordinary two-plate capacitor and may be electrically controlled. 
It thus has implications to electronic circuitry and energy storage elements alike.", "journal_ref_utf8": null, "license": {"label": "arXiv.org perpetual, non-exclusive license to distribute this article", "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}, "authors_parsed": [{"last_name": "Grebel", "first_name": "Haim"}], "updated_date": null} -------------------------------------------------------------------------------- /tests/examples/advanced_search.feature.example: -------------------------------------------------------------------------------- 1 | Feature: Advanced Search 2 | 3 | Scenario: date range search 4 | Dates can be searched across all versions and the results show the most 5 | current version. Not sure how to write the test to reflect this. 6 | 7 | Given Specific year field is "2015" 8 | When a user performs a search 9 | Then a list of results is returned 10 | And all entries contain at least one version with a submitted date field 11 | that matches "2015" 12 | 13 | Scenario: multiple terms search success 14 | 15 | Note that the term "jqk" doesn't match any strings in our existing database 16 | and is expected to fail. This test, combined with the one below, also checks 17 | the boolean operators as well as whether two fields at once can be searched. 18 | ANDed queries are expected to return no results; ORed queries are expected to return results. 
19 | 20 | Given first term field is "schroder" 21 | And first field selected is "Author(s)" 22 | And second term field is "jqk" 23 | And second field selected is "Title" 24 | And boolean operator is "OR" 25 | When a user performs a search 26 | Then a list of results is returned 27 | And all entries contain the name "schroder" or the term "jqk" 28 | 29 | Scenario: multiple terms with no results 30 | 31 | Given first term field is "schroder" 32 | And first field selected is "Author(s)" 33 | And second term field is "jqk" 34 | And second field selected is "Title" 35 | And boolean operator is "AND" 36 | When a user performs a search 37 | Then no results are returned 38 | And a message is displayed to the user 39 | -------------------------------------------------------------------------------- /tests/examples/author_search.feature.example: -------------------------------------------------------------------------------- 1 | Feature: Author Search 2 | 3 | Users should be able to search for all or part of an author's name. 4 | This should produce a list of sortable results, including variations on 5 | possible diacritic letters. 6 | - Wildcards allowed except in first letter position 7 | - Surname required, forename optional 8 | 9 | Scenario: searching for a surname 10 | Given surname field is "schroder" 11 | When user performs a search 12 | Then a list of results is returned 13 | And all entries contain at least one author with surname "schroder" or diacritic variant "Schrøder" or "Schröder" or "Schrœder" (or others) 14 | 15 | Scenario: searching for a partial name 16 | 17 | Given forename field is "W." 18 | And surname field is "W" 19 | When a user performs a search 20 | Then a list of results is returned 21 | And all entries contain at least one author with the forename "W." 
and the surname fragment "W" 22 | -------------------------------------------------------------------------------- /tests/examples/ordering_pagination.feature.example: -------------------------------------------------------------------------------- 1 | Feature: Ordering and Pagination 2 | 3 | Users should be able to change the number of entries per page for results. 4 | Users should also be able to reorder results by relevance, ascending date, 5 | descending date. 6 | 7 | Scenario: reordering from author search 8 | Given surname field is "cat*" 9 | And search form used is "Author Search" 10 | And "Submission date (ascending)" is selected 11 | When user presses "Go" to trigger reordering 12 | Then the list of results is displayed in order of ascending submission date 13 | -------------------------------------------------------------------------------- /tests/examples/simple_search.feature.example: -------------------------------------------------------------------------------- 1 | Feature: Simple Search 2 | 3 | Users should be able to enter terms and expressions into a basic search box. 4 | Users should also be able to select from a simple list of metadata in which 5 | to search. A list of results is generated containing the results across the 6 | metadata fields selected. 7 | 8 | Scenario: simple term search across all fields 9 | Given search term is "flux capacitor" 10 | And selected field to search is "All fields" 11 | When a user performs a search 12 | Then a list of results is displayed 13 | And all entries contain a metadata field that contains "flux" 14 | And all entries contain a metadata field that contains "capacitor" 15 | 16 | Scenario: search for TeX terms 17 | This could be a scenario for any of the search forms and is a test of how 18 | and whether the TeX tokenizing works. 19 | A search for a TeX expression should match similar metadata strings. 
20 | Note, I am not sure if this is supposed to happen: 21 | This "$z$" also produces results for "z" (not wrapped in TeX) 22 | "λ" produces no results but "lambda" and "$\lambda$" do 23 | 24 | Given search entry is "$z_1$" 25 | And selected field to search is "All fields" 26 | When a user performs a search 27 | Then a list of results is displayed 28 | And all entries contain a metadata field that contains the expression "z_1" 29 | -------------------------------------------------------------------------------- /tests/integration/README.md: -------------------------------------------------------------------------------- 1 | # Integration tests 2 | 3 | Tests in this folder exercise the entire arXiv-search service against a 4 | local instance of Elasticsearch. 5 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-search/4c12f7ba5fc6db0143770b4f53c7a607315ef92f/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/stubs/docmeta.py: -------------------------------------------------------------------------------- 1 | """Stub for the docmeta service.""" 2 | import os 3 | import json 4 | from flask import Flask 5 | from flask.json import jsonify 6 | from werkzeug.exceptions import NotFound, InternalServerError 7 | 8 | from arxiv.base import Base 9 | from arxiv.base.converter import ArXivConverter 10 | from arxiv.base import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | METADATA_DIR = os.environ.get("METADATA_DIR") 15 | 16 | 17 | app = Flask("metadata") 18 | Base(app) 19 | 20 | app.url_map.converters["arxiv"] = ArXivConverter 21 | 22 | 23 | @app.route("/docmeta/<arxiv:document_id>", methods=["GET"]) 24 | def docmeta(document_id): 25 | """Retrieve document metadata.""" 26 | logger.debug(f"Get metadata for {document_id}") 27 
| logger.debug(f"Metadata base is {METADATA_DIR}") 28 | if not METADATA_DIR: 29 | raise InternalServerError("Metadata directory not set") 30 | metadata_path = os.path.join(METADATA_DIR, f"{document_id}.json") 31 | logger.debug(f"Metadata path is {metadata_path}") 32 | if not os.path.exists(metadata_path): 33 | raise NotFound("No such document") 34 | with open(metadata_path) as f: 35 | return jsonify(json.load(f)) 36 | 37 | 38 | def application(environ, start_response): 39 | """WSGI application factory.""" 40 | for key, value in environ.items(): 41 | os.environ[key] = str(value) 42 | return app(environ, start_response) 43 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | """Tests exception handling in :mod:`arxiv.base.exceptions`.""" 2 | 3 | from http import HTTPStatus 4 | from unittest import TestCase, mock 5 | 6 | from werkzeug.exceptions import InternalServerError 7 | 8 | from search.controllers import simple 9 | from search.factory import create_ui_web_app 10 | from search.services.index import IndexConnectionError, QueryError 11 | 12 | 13 | class TestExceptionHandling(TestCase): 14 | """HTTPExceptions should be handled with custom templates.""" 15 | 16 | def setUp(self): 17 | """Initialize an app and install :class:`.Base`.""" 18 | self.app = create_ui_web_app() 19 | self.client = self.app.test_client() 20 | 21 | def test_404(self): 22 | """A 404 response should be returned.""" 23 | response = self.client.get("/foo") 24 | self.assertEqual(response.status_code, HTTPStatus.NOT_FOUND) 25 | self.assertIn("text/html", response.content_type) 26 | 27 | def test_405(self): 28 | """A 405 response should be returned.""" 29 | response = self.client.post("/") 30 | self.assertEqual(response.status_code, HTTPStatus.METHOD_NOT_ALLOWED) 31 | self.assertIn("text/html", response.content_type) 32 | 33 | 
@mock.patch("search.controllers.simple.search")
34 |     def test_500(self, mock_search):
35 |         """A 500 response should be returned."""
36 |         # Raise an internal server error from the search controller.
37 |         mock_search.side_effect = InternalServerError
38 |
39 |         response = self.client.get("/")
40 |         self.assertEqual(
41 |             response.status_code, HTTPStatus.INTERNAL_SERVER_ERROR
42 |         )
43 |         self.assertIn("text/html", response.content_type)
44 |
45 |     @mock.patch(f"{simple.__name__}.SearchSession.search")
46 |     def test_index_connection_error(self, mock_search):
47 |         """When an IndexConnectionError occurs, an error page is displayed."""
48 |         mock_search.side_effect = IndexConnectionError
49 |         response = self.client.get("/?searchtype=title&query=foo")
50 |         self.assertEqual(
51 |             response.status_code, HTTPStatus.BAD_GATEWAY
52 |         )
53 |         self.assertIn("text/html", response.content_type)
54 |
55 |     @mock.patch(f"{simple.__name__}.SearchSession.search")
56 |     def test_query_error(self, mock_search):
57 |         """When a QueryError occurs, an error page is displayed."""
58 |         mock_search.side_effect = QueryError
59 |         response = self.client.get("/?searchtype=title&query=foo")
60 |         self.assertEqual(
61 |             response.status_code, HTTPStatus.BAD_REQUEST
62 |         )
63 |         self.assertIn("text/html", response.content_type)
64 |
--------------------------------------------------------------------------------
/update-docs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | SRCDOCS="$(pwd)/docs/source/_build/html"
3 | REPO=arXiv/arxiv-search
4 | echo "$SRCDOCS"
5 |
6 | cd "$(pwd)/docs" || exit 1
7 | make html
8 |
9 | cd "$SRCDOCS" || exit 1
10 | MSG="Adding gh-pages docs for $(git log -1 --pretty=short --abbrev-commit)"
11 |
12 | TMPREPO="/tmp/docs/$REPO"
13 | rm -rf "$TMPREPO"
14 | mkdir -p -m 0755 "$TMPREPO"
15 | echo "$MSG"
16 |
17 | git clone "git@github.com:$REPO.git" "$TMPREPO"
18 | cd "$TMPREPO" || exit 1
19 |
20 | ## checkout the branch if it exists, if not then create it and detach it from the history
21 | if ! git checkout gh-pages; then
22 |     git checkout --orphan gh-pages
23 |     git rm -rf .
24 |     touch .nojekyll
25 |     git add .nojekyll
26 | fi
27 | ### When the checkout in the test above succeeds we are already on gh-pages;
28 | ### gh-pages has previously one off been set to be nothing but html
29 |
30 | cp -r "$SRCDOCS"/* "$TMPREPO"
31 | git add -A
32 | git commit -m "$MSG" && git push origin gh-pages
33 |
--------------------------------------------------------------------------------
/upload_static_assets.py:
--------------------------------------------------------------------------------
1 | """Use this to upload static content to S3."""
2 |
3 | import flask_s3
4 | from search.factory import create_ui_web_app
5 |
6 | app = create_ui_web_app()
7 |
8 | flask_s3.create_all(app, filepath_filter_regex=r"(base|css|images|js|sass)")
9 |
--------------------------------------------------------------------------------
/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | http-socket = :8000
3 | chdir = /app
4 | wsgi-file = wsgi.py
5 | callable = application
6 | master = true
7 | harakiri = 3000
8 | manage-script-name = true
9 | processes = 1
10 | queue = 0
11 | threads = 1
12 | single-interpreter = true
13 | mount = /=wsgi.py
14 | logformat = "%(addr) %(addr) - %(user_id)|%(session_id) [%(rtime)] [%(uagent)] \"%(method) %(uri) %(proto)\" %(status) %(size) %(micros) %(ttfb)"
15 | buffer-size = 65535
16 | wsgi-disable-file-wrapper = true
17 |
--------------------------------------------------------------------------------
/wsgi-api.py:
--------------------------------------------------------------------------------
1 | """Web Server Gateway Interface entry-point for API."""
2 |
3 | import os
4 |
5 | __flask_app__ = None
6 |
7 |
8 | def application(environ, start_response):
9 |     """
10 |     WSGI application factory.
11 |
12 |     Copies the string-valued WSGI environ entries (except ``SERVER_NAME``)
13 |     into ``os.environ``, creates the API Flask app on the first call and
14 |     caches it in ``__flask_app__``, then passes the request on to it.
15 |     """
16 |     for key, value in environ.items():
17 |         # In some deployment scenarios (e.g. uWSGI on k8s), uWSGI will pass in
18 |         # the hostname as part of the request environ. This will usually just
19 |         # be a container ID, which is not helpful for things like building
20 |         # URLs. We want to keep ``SERVER_NAME`` explicitly configured, either
21 |         # in config.py or via an os.environ var loaded by config.py.
22 |         if key == "SERVER_NAME":
23 |             continue
24 |         # Non-string entries (e.g. ``wsgi.input``) cannot go into os.environ.
25 |         if isinstance(value, str):
26 |             os.environ[key] = value
27 |     global __flask_app__
28 |     if __flask_app__ is None:
29 |         # Presumably imported lazily so the values copied above are already
30 |         # in os.environ when the app is configured -- verify in config.py.
31 |         from search.factory import create_api_web_app
32 |         __flask_app__ = create_api_web_app()
33 |
34 |     return __flask_app__(environ, start_response)
35 |
--------------------------------------------------------------------------------
/wsgi-classic-api.py:
--------------------------------------------------------------------------------
1 | """Web Server Gateway Interface entry-point for classic API."""
2 |
3 | import os
4 | from arxiv.base import logging
5 |
6 | __flask_app__ = None
7 |
8 |
9 | def application(environ, start_response):
10 |     """
11 |     WSGI application factory.
12 |
13 |     Copies the string-valued WSGI environ entries (except ``SERVER_NAME``)
14 |     into ``os.environ``, creates the classic API Flask app on the first call
15 |     and caches it in ``__flask_app__``, then passes the request on to it.
16 |     """
17 |     for key, value in environ.items():
18 |         # In some deployment scenarios (e.g. uWSGI on k8s), uWSGI will pass in
19 |         # the hostname as part of the request environ. This will usually just
20 |         # be a container ID, which is not helpful for things like building
21 |         # URLs. We want to keep ``SERVER_NAME`` explicitly configured, either
22 |         # in config.py or via an os.environ var loaded by config.py.
23 |         if key == "SERVER_NAME":
24 |             continue
25 |         # Non-string entries (e.g. ``wsgi.input``) cannot go into os.environ.
26 |         if isinstance(value, str):
27 |             os.environ[key] = value
28 |     global __flask_app__
29 |     if __flask_app__ is None:
30 |         # Presumably imported lazily so the values copied above are already
31 |         # in os.environ when the app is configured -- verify in config.py.
32 |         from search.factory import create_classic_api_web_app
33 |         __flask_app__ = create_classic_api_web_app()
34 |
35 |     return __flask_app__(environ, start_response)
36 |
--------------------------------------------------------------------------------
/wsgi.py:
--------------------------------------------------------------------------------
1 | """Web Server Gateway Interface entry-point for UI."""
2 |
3 | import os
4 |
5 | __flask_app__ = None
6 |
7 |
8 | def application(environ, start_response):
9 |     """
10 |     WSGI application factory.
11 |
12 |     Copies the string-valued WSGI environ entries (except ``SERVER_NAME``)
13 |     into ``os.environ``, creates the UI Flask app on the first call and
14 |     caches it in ``__flask_app__``, then passes the request on to it.
15 |     """
16 |     for key, value in environ.items():
17 |         # Copy string WSGI environ to os.environ. This is to get apache
18 |         # SetEnv vars. It needs to be done before the call to
19 |         # create_web_app() due to how config is setup from os in
20 |         # search/config.py.
21 |         #
22 |         # In some deployment scenarios (e.g. uWSGI on k8s), uWSGI will pass in
23 |         # the hostname as part of the request environ. This will usually just
24 |         # be a container ID, which is not helpful for things like building
25 |         # URLs. We want to keep ``SERVER_NAME`` explicitly configured, either
26 |         # in config.py or via an os.environ var loaded by config.py.
27 |         if key == "SERVER_NAME":
28 |             continue
29 |         # Non-string entries (e.g. ``wsgi.input``) cannot go into os.environ.
30 |         if isinstance(value, str):
31 |             os.environ[key] = value
32 |
33 |     global __flask_app__
34 |     if __flask_app__ is None:
35 |         from search.factory import create_ui_web_app
36 |         __flask_app__ = create_ui_web_app()
37 |
38 |     return __flask_app__(environ, start_response)
39 |
--------------------------------------------------------------------------------