├── .cc-metadata.yml ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── IssueAndPR.yml │ └── integration-tests.yml ├── .gitignore ├── .idea └── dictionaries │ └── alden.xml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── DOCUMENTATION_GUIDELINES.md ├── LICENSE ├── README.md ├── analytics ├── Dockerfile ├── Pipfile ├── Pipfile.lock ├── README.md ├── __init__.py ├── alembic.ini ├── attribution_worker.py ├── backdate.py ├── docs │ ├── redoc.html │ └── swagger.yaml ├── event_controller.py ├── gen_daily_report.py ├── migrations │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ ├── 0cd416f5a7d2_add_attribution_events_table.py │ │ ├── 54e56668b66a_regenerate_initial_migration.py │ │ ├── 7695412f8a64_switch_to_boolean_search_rating_instead_.py │ │ └── beb6d39f2dfd_add_reporting_tables.py ├── models.py ├── report_controller.py ├── server.py ├── settings.py └── tests.py ├── cccatalog-api ├── Dockerfile ├── Pipfile ├── Pipfile.lock ├── cccatalog │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── admin.py │ │ ├── apps.py │ │ ├── controllers │ │ │ ├── __init__.py │ │ │ ├── link_controller.py │ │ │ └── search_controller.py │ │ ├── licenses.py │ │ ├── migrations │ │ │ ├── 0001_initial.py │ │ │ ├── 0002_auto_20180723_1737.py │ │ │ ├── 0003_image_view_count.py │ │ │ ├── 0004_shortenedlink.py │ │ │ ├── 0005_auto_20180803_1905.py │ │ │ ├── 0006_image_watermarked.py │ │ │ ├── 0007_auto_20180803_1909.py │ │ │ ├── 0008_imagelist_slug.py │ │ │ ├── 0009_auto_20180831_1425.py │ │ │ ├── 0010_auto_20180831_1815.py │ │ │ ├── 0011_auto_20181117_0029.py │ │ │ ├── 0012_auto_20190102_2012.py │ │ │ ├── 0013_contentprovider.py │ │ │ ├── 0014_auto_20190122_1853.py │ │ │ ├── 0015_contentprovider_notes.py │ │ │ ├── 0016_auto_20190122_1908.py │ │ │ ├── 0017_remove_contentprovider_updated_on.py │ │ │ ├── 0018_auto_20190122_1917.py │ │ │ ├── 0019_auto_20190307_1830.py │ │ │ ├── 0020_auto_20190918_1954.py │ │ │ ├── 0021_deletedimages.py │ │ │ ├── 0022_reportimage.py │ │ │ ├── 0023_auto_20200423_1526.py │ │ │ ├── 0024_auto_20200423_1601.py │ │ │ ├── 0025_auto_20200429_1401.py │ │ │ ├── 0026_imagereport_date.py │ │ │ ├── 0027_auto_20200515_2037.py │ │ │ ├── 0028_sourcelogo.py │ │ │ └── __init__.py │ │ ├── models.py │ │ ├── serializers │ │ │ ├── __init__.py │ │ │ ├── image_serializers.py │ │ │ ├── link_serializers.py │ │ │ ├── list_serializers.py │ │ │ └── oauth2_serializers.py │ │ ├── tests.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── ccrel.py │ │ │ ├── dead_link_mask.py │ │ │ ├── exceptions.py │ │ │ ├── fonts │ │ │ │ ├── SourceCodePro-Bold.ttf │ │ │ │ └── SourceSansPro-Bold.ttf │ │ │ ├── oauth2_helper.py │ │ │ ├── scheduled_tasks.py │ │ │ ├── throttle.py │ │ │ ├── validate_images.py │ │ │ └── watermark.py │ │ └── views │ │ │ ├── __init__.py │ │ │ ├── image_views.py │ │ │ ├── link_views.py │ │ │ ├── list_views.py │ │ │ └── site_views.py │ ├── custom_auto_schema.py │ ├── example_responses.py │ ├── scripts │ │ ├── api_load_testing │ │ │ ├── common_english_words.txt │ │ │ └── locustfile.py │ │ ├── migration │ │ │ └── migrate_lists.py │ │ └── thumbnail_load_test │ │ │ └── locustfile.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── pytest.ini ├── run.sh ├── terms_of_service.html └── test │ ├── README │ ├── __init__.py │ ├── api_live_integration_test.py │ ├── api_live_search_qa.py │ ├── run_test.sh │ ├── search_qa_test.py │ └── v1_integration_test.py ├── docker-compose.yml ├── ingestion_server ├── .dockerignore ├── Dockerfile ├── Dockerfile-worker ├── 
Pipfile ├── Pipfile.lock ├── README.md ├── config │ └── supervisord.conf ├── howitworks.png ├── ingestion_server │ ├── __init__.py │ ├── api.py │ ├── authority.py │ ├── categorize.py │ ├── cleanup.py │ ├── distributed_reindex_scheduler.py │ ├── elasticsearch_models.py │ ├── es_mapping.py │ ├── indexer.py │ ├── indexer_worker.py │ ├── ingest.py │ ├── qa.py │ ├── state.py │ └── tasks.py ├── publish_release.sh └── test │ ├── __init__.py │ ├── generate_integration_test_docker_compose.py │ ├── integration-test-docker-compose.yml │ ├── integration_tests.py │ ├── mock_data │ ├── mocked_images.csv │ ├── no_constraints_schema.sql │ ├── schema.sql │ └── update_mocks.sh │ └── unit_tests.py ├── initialization.PNG ├── load_sample_data.sh ├── local_api_documentation.PNG ├── localhost_request.PNG ├── sample_data ├── make_sample_pop.py ├── pop_col.csv └── sample_data.csv └── system_architecture.png /.cc-metadata.yml: -------------------------------------------------------------------------------- 1 | # Whether this GitHub repo is engineering related 2 | engineering_project: true 3 | # Name of the repository/project in English 4 | english_name: CC Catalog API 5 | # All technologies used 6 | technologies: Python, Django, Django REST Framework, Elasticsearch 7 | # Whether this repository should be featured on the CC Open Source site 8 | featured: false 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set default behavior to automatically normalize line endings 2 | * text=auto 3 | 4 | # Force all files to always use LF line endings so that if a repo is accessed 5 | # in Unix via a file share from Windows, the files will work 6 | * text eol=lf -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | ################################# 2 | # Dependabot Configuration File # 3 | ################################# 4 | 5 | # current Github-native version of Dependabot 6 | version: 2 7 | 8 | updates: 9 | # Enable version updates for Docker 10 | - package-ecosystem: 'docker' 11 | # Look for a `Dockerfile` in the `/cccatalog-api` directory 12 | directory: '/cccatalog-api' 13 | # Check for updates once a week 14 | schedule: 15 | interval: 'weekly' 16 | 17 | # Enable version updates for Python 18 | - package-ecosystem: 'pip' 19 | # Look for a `Pipfile` in the `/cccatalog-api` directory 20 | directory: '/cccatalog-api' 21 | # Check for updates once a week 22 | schedule: 23 | interval: 'weekly' 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/IssueAndPR.yml: -------------------------------------------------------------------------------- 1 | name: "Project Board Automation" 2 | 3 | on: 4 | issues: 5 | types: [ opened ] 6 | pull_request: 7 | types: [ opened ] 8 | jobs: 9 | join_issue_pr_to_project: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: "Automate adding issues to Backlog" 13 | uses: docker://takanabe/github-actions-automate-projects:v0.0.1 14 | if: github.event_name == 'issues' 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.ADMIN_GITHUB_TOKEN }} 17 | GITHUB_PROJECT_URL: https://github.com/orgs/creativecommons/projects/10 18 | GITHUB_PROJECT_COLUMN_NAME: "Pending Review" 19 | - name: "Automate adding PRs to Active Sprint" 20 | uses: docker://takanabe/github-actions-automate-projects:v0.0.1 
21 | if: github.event_name == 'pull_request' 22 | continue-on-error: true 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.ADMIN_GITHUB_TOKEN }} 25 | GITHUB_PROJECT_URL: https://github.com/orgs/creativecommons/projects/7 26 | GITHUB_PROJECT_COLUMN_NAME: "In Progress" 27 | -------------------------------------------------------------------------------- /.github/workflows/integration-tests.yml: -------------------------------------------------------------------------------- 1 | name: Automated tests 2 | on: 3 | pull_request: 4 | branches: [ master ] 5 | push: 6 | branches: [ master ] 7 | 8 | jobs: 9 | Style: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/setup-python@v2 13 | - name: Install pycodestyle 14 | run: pip install pycodestyle 15 | - name: Checkout 16 | uses: actions/checkout@v2 17 | - name: Check API style 18 | run: pycodestyle cccatalog-api/cccatalog --exclude='cccatalog-api/cccatalog/api/migrations,cccatalog-api/cccatalog/example_responses.py' --max-line-length=80 --ignore=E402,E702 19 | - name: Check ingestion-server style 20 | run: pycodestyle ingestion_server/ingestion_server --max-line-length=80 --ignore=E402 21 | Tests: 22 | timeout-minutes: 15 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v1 27 | - name: Install dependencies 28 | run: | 29 | pip install pytest pipenv 30 | sudo apt-get install libexempi3 librdkafka-dev 31 | PIPENV_PIPFILE=./cccatalog-api/Pipfile pipenv install --system --deploy --dev & 32 | - name: Start API 33 | run: docker-compose up --build -d 34 | - name: Wait for API to come up 35 | run: bash -c 'while [[ "$(curl --insecure -s -o /dev/null -w ''%{http_code}'' http://localhost:8000/healthcheck)" != "200" ]]; do sleep 10; done' 36 | - name: Ingest and index test data 37 | run: ./load_sample_data.sh 38 | - name: Wait for data to be indexed in Elasticsearch 39 | run: bash -c 'while [[ "$(curl -sb -H "Accept:application/json" http://localhost:9200/_cat/aliases/image | grep -c image-)" == "0" ]]; do sleep 5 && docker-compose logs; done' 40 | - name: Run API tests 41 | run: cd cccatalog-api && test/run_test.sh 42 | - name: Run analytics tests 43 | run: cd ./analytics && docker exec -i cccatalog-api_analytics_1 /bin/bash -c 'PYTHONPATH=. pipenv run pytest tests.py' 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | es-venv 3 | 4 | # IDE junk 5 | .idea 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | 112 | # Local .terraform directories 113 | **/.terraform/* 114 | 115 | # .tfstate files 116 | *.tfstate 117 | *.tfstate.* 118 | 119 | # Crash log files 120 | crash.log 121 | 122 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 123 | # .tfvars files are managed as part of configuration and so should be included in 124 | # version control. 125 | # 126 | # example.tfvars 127 | 128 | # Ignore crawler data files 129 | ccbot/url_dump.csv 130 | ccbot/crawl_plan.yml 131 | 132 | .DS_Store 133 | *.iml 134 | .idea 135 | -------------------------------------------------------------------------------- /.idea/dictionaries/alden.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | cccatalog 5 | daemonized 6 | elasticsearch 7 | itersize 8 | syncable 9 | syncer 10 | synchronizer 11 | 12 | 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | The Creative Commons team is committed to fostering a welcoming community. This 4 | project and all other Creative Commons open source projects are governed by our 5 | [Code of Conduct][code_of_conduct]. Please report unacceptable behavior to 6 | [conduct@creativecommons.org](mailto:conduct@creativecommons.org) per our 7 | [reporting guidelines][reporting_guide]. 8 | 9 | For a history of updates, see the [page history here][updates]. 10 | 11 | [code_of_conduct]:https://creativecommons.github.io/community/code-of-conduct/ 12 | [reporting_guide]:https://creativecommons.github.io/community/code-of-conduct/enforcement/ 13 | [updates]:https://github.com/creativecommons/creativecommons.github.io-source/commits/master/content/community/code-of-conduct/contents.lr 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to CC Open Source 2 | 3 | Thank you for your interest in contributing to CC Open Source! This document is a set of guidelines to help you contribute to this project. 4 | 5 |
6 | 7 | ## Code of Conduct 8 | 9 | By participating in this project, you are expected to uphold our [Code of Conduct](https://creativecommons.github.io/community/code-of-conduct/). 10 | 11 |
12 | 13 | ## Project Documentation 14 | 15 | Please consult the [README](./README.md) and [CODEBASE](./CODEBASE.md) files at the root of this repository. 16 | 17 |
18 | 19 | ## How to Contribute 20 | 21 | Please read the processes in our general [Contributing Code](https://creativecommons.github.io/contributing-code/) guidelines on the Creative Commons Open Source website. The page contains general instructions that should be followed when contributing to any of the Creative Commons open-source repositories. 22 | 23 |
24 | 25 | ### Bugs 26 | 27 | If you find a bug, please open an issue in this repository describing the bug. You can file a bug [here](https://github.com/creativecommons/cccatalog-api/issues/new?template=bug_report.md). You will see a bug report template with the required information you should provide. 28 | 29 | After that, don't forget to tag the issue with the "Bug" label. 30 | 31 |
32 | 33 | ### Proposing changes or new features 34 | 35 | If you have an idea for a new feature or a change to how the CC Catalog API works, please [file an issue](https://github.com/creativecommons/cccatalog-api/issues/new?template=feature_request.md) so we can discuss the possibility of that change or new feature being implemented and released in the future. This lets us come to an agreement about the proposed idea before any work is done. 36 | 37 | If you'd like to build a new feature but don't have a specific idea, please check our [public roadmap](https://docs.google.com/document/d/19yH2V5K4nzWgEXaZhkzD1egzrRayyDdxlzxZOTCm_pc/). Choose something from the pipeline of ideas and follow the same process as above. 38 | 39 |
40 | 41 | ### Pull requests 42 | 43 | Before you start writing code, make sure there is an issue open. Pull requests without a link to an existing issue won't be merged. 44 | 45 | If you want to get started contributing code to this project but don't know exactly what to work on, we compiled a good list of issues labeled as [`good first issue`](https://github.com/creativecommons/cccatalog-api/labels/good%20first%20issue) which are small in scope and not so complex to solve. There are also issues labeled as [`help wanted`](https://github.com/creativecommons/cccatalog-api/labels/help%20wanted) which can be a bit more complex but are good examples of areas where we are currently accepting help from the community. 46 | 47 | Any code modifications will have to be accompanied by the appropriate unit tests. This will be checked and verified during code review. Once the Pull Request is opened, our CI server will run the unit test suite and a code linter to verify that the code follows the coding guidelines. 48 | 49 |
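If you would like to catch linter failures before opening a pull request, the sketch below mirrors (slightly simplified) the pycodestyle flags from `.github/workflows/integration-tests.yml` using pycodestyle's Python API; running the equivalent `pycodestyle` command line directly works just as well.

```python
# A minimal sketch: run the CI's style checks locally.
# The flags mirror .github/workflows/integration-tests.yml.
import pycodestyle

style = pycodestyle.StyleGuide(max_line_length=80, ignore=['E402', 'E702'])
report = style.check_files(['cccatalog-api/cccatalog'])
print(f'Found {report.total_errors} style error(s)')
```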
50 | 51 | ## Running the tests 52 | 53 | ### How to Run API live integration tests 54 | You can check the health of a live deployment of the API by running the live integration tests. 55 | 56 | 1. Change directory to CC Catalog API 57 | ``` 58 | cd cccatalog-api 59 | ``` 60 | 61 | 2. Install all dependencies for CC Catalog API 62 | ``` 63 | pipenv install 64 | ``` 65 | 66 | 3. Launch a new shell session 67 | ``` 68 | pipenv shell 69 | ``` 70 | 71 | 4. Run API live integration test 72 | ``` 73 | ./test/run_test.sh 74 | ``` 75 | 76 |
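Before running the live suite against a local deployment, it can be useful to confirm that the API is actually up. Here is a small sketch that polls the same `/healthcheck` route the CI workflow waits on; the host and port assume a local `docker-compose` deployment, so adjust them for other environments.

```python
# Sketch: confirm the API is reachable before running live integration tests.
# Assumes a local instance on port 8000, as in the CI workflow.
import requests

resp = requests.get('http://localhost:8000/healthcheck', timeout=10)
assert resp.status_code == 200, f'API not healthy: HTTP {resp.status_code}'
print('API is up; run ./test/run_test.sh')
```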
77 | 78 | ### How to Run Ingestion Server tests 79 | You can ingest and index some dummy data using the Ingestion Server API. 80 | 81 | 1. Change directory to ingestion server 82 | ``` 83 | cd ingestion_server 84 | ``` 85 | 86 | 2. Install all dependencies for Ingestion Server API 87 | ``` 88 | pipenv install 89 | ``` 90 | 91 | 3. Launch a new shell session 92 | ``` 93 | pipenv shell 94 | ``` 95 | 96 | 4. Run the integration tests 97 | ``` 98 | python3 test/integration_tests.py 99 | ``` 100 | 101 |
102 | 103 | ## Questions or Thoughts? 104 | 105 | Talk to us on [our developer mailing list or Slack community](https://creativecommons.github.io/community/). 106 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | cccatalog-api contributors (sorted alphabetically by last name) 2 | ============================================ 3 | 4 | * **[Liza Daley](https://github.com/lizadaly)** 5 | * Built CC Search prototype, bits of which live on in this repository to this day 6 | * **[Alden Page](https://github.com/aldenstpage)** 7 | * Author and maintainer of current implementation 8 | * **[Paulo Rosário](https://github.com/paulofilip3)** 9 | * Contributed to solution for consistent link rot filtering without impacting result count, improved test suite 10 | * **[Krystle Salazar](https://github.com/krysal)** 11 | * Implemented image takedown endpoint 12 | * **[Habeeb Shopeju](https://github.com/HAKSOAT)** 13 | * Fixed issue with error handling 14 | * **[Vignesh Ram Somnath](https://github.com/VIGS25)** 15 | * Implemented exclusion of known dead links from the search index 16 | 17 | -------------------------------------------------------------------------------- /DOCUMENTATION_GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Documentation Guidelines 2 | 3 | Interested in improving our documentation? Here’s what you need to know before making any changes to the documentation. 4 | 5 |
6 | 7 | ## Introduction 8 | 9 | CC Catalog API uses [drf-yasg](https://github.com/axnsan12/drf-yasg), which is a tool that generates real Swagger/OpenAPI 2.0 specifications from a Django Rest Framework API. 10 | 11 |
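If you have not worked with drf-yasg before, the following sketch shows the general shape of the integration. The `Info` fields and URL pattern here are illustrative only; the project's actual wiring in `cccatalog/urls.py` may differ in detail.

```python
# Illustrative drf-yasg wiring; metadata and route names are examples,
# not necessarily this project's exact configuration.
from django.urls import re_path
from drf_yasg import openapi
from drf_yasg.views import get_schema_view
from rest_framework import permissions

schema_view = get_schema_view(
    openapi.Info(
        title='CC Catalog API',
        default_version='v1',
        description='API documentation generated by drf-yasg.',
    ),
    public=True,
    permission_classes=(permissions.AllowAny,),
)

urlpatterns = [
    # Interactive Swagger UI rendered from @swagger_auto_schema metadata.
    re_path(r'^swagger/$', schema_view.with_ui('swagger', cache_timeout=0)),
]
```

The decorators documented in the cheat sheet below attach per-endpoint metadata that a schema view like this one then renders.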
12 | 13 | ## How to Start Contributing 14 | 15 | - Run the server locally by following this [link](https://github.com/creativecommons/cccatalog-api#running-the-server-locally) 16 | - Update the documentation 17 | - Make sure the updates pass the automated tests in this [file](https://github.com/creativecommons/cccatalog-api/blob/master/.github/workflows/integration-tests.yml) 18 | - Commit and push 19 | - Create a pull request by following [GitHub Repo Guidelines](https://opensource.creativecommons.org/contributing-code/github-repo-guidelines/) 20 | 21 |
22 | 23 | ## Documentation Styles 24 | 25 | - All documentation must be written in American English with no contractions. 26 | - Descriptions must be written using simple yet concise explanations. 27 | - Code examples are preferred over videos and screenshots. 28 | 29 |
30 | 31 | ## Cheat Sheet for drf-yasg 32 | This is a quick syntax guide with examples of how to add or update the documentation for API endpoints. 33 | 34 |
35 | 36 | ### Operation ID 37 | The name of API endpoint. 38 | 39 | **Example** 40 | ``` 41 | @swagger_auto_schema(operation_id='image_stats') 42 | ``` 43 | 44 |
45 | 46 | ### Operation Description 47 | The description for API endpoint. 48 | 49 | **Example** 50 | ``` 51 | image_stats_description = \ 52 | """ 53 | image_stats is an API endpoint to get a list of all content providers 54 | and their respective number of images in the Creative Commons catalog. 55 | 56 | You can use this endpoint to get details about content providers 57 | such as `source_name`, `image_count`, `display_name`, and `source_url`. 58 | 59 | You can refer to Bash's Request Samples for example on how to use 60 | this endpoint. 61 | """ # noqa 62 | 63 | @swagger_auto_schema(operation_id='image_stats', 64 | operation_description=image_stats_description) 65 | ``` 66 | 67 |
68 | 69 | ### Responses 70 | The response received after submitting an API request. The current API documentation includes response schemas and response samples based on their response codes. 71 | 72 | **Example** 73 | ``` 74 | image_stats_200_example = { 75 | "application/json": { 76 | "source_name": "flickr", 77 | "image_count": 465809213, 78 | "display_name": "Flickr", 79 | "source_url": "https://www.flickr.com" 80 | } 81 | } 82 | 83 | image_stats_response = { 84 | "200": openapi.Response( 85 | description="OK", 86 | examples=image_stats_200_example, 87 | schema=AboutImageResponse(many=True) 88 | ) 89 | } 90 | 91 | @swagger_auto_schema(operation_id='image_stats', 92 | operation_description=image_stats_description, 93 | responses=image_stats_response) 94 | ``` 95 | 96 |
97 | 98 | ### Request Body 99 | The data sent to the server when submitting an API request. 100 | 101 | **Example** 102 | ``` 103 | register_api_oauth2_request = openapi.Schema( 104 | type=openapi.TYPE_OBJECT, 105 | required=['name', 'description', 'email'], 106 | properties={ 107 | 'name': openapi.Schema( 108 | title="Name", 109 | type=openapi.TYPE_STRING, 110 | min_length=1, 111 | max_length=150, 112 | unique=True, 113 | description="A unique human-readable name for your application " 114 | "or project requiring access to the CC Catalog API." 115 | ), 116 | 'description': openapi.Schema( 117 | title="Description", 118 | type=openapi.TYPE_STRING, 119 | min_length=1, 120 | max_length=10000, 121 | description="A description of what you are trying to achieve " 122 | "with your project using the API. Please provide " 123 | "as much detail as possible!" 124 | ), 125 | 'email': openapi.Schema( 126 | title="Email", 127 | type=openapi.TYPE_STRING, 128 | min_length=1, 129 | max_length=254, 130 | format=openapi.FORMAT_EMAIL, 131 | description="A valid email that we can reach you at if we " 132 | "have any questions about your use case or " 133 | "data consumption." 134 | ) 135 | }, 136 | example={ 137 | "name": "My amazing project", 138 | "description": "To access CC Catalog API", 139 | "email": "cccatalog-api@creativecommons.org" 140 | } 141 | ) 142 | 143 | @swagger_auto_schema(operation_id='register_api_oauth2', 144 | operation_description=register_api_oauth2_description, 145 | request_body=register_api_oauth2_request, 146 | responses=register_api_oauth2_response) 147 | ``` 148 | 149 |
150 | 151 | ### Code Examples 152 | Code examples on how to use the API endpoints. The current API documentation provides code examples in Bash. 153 | 154 | **Example** 155 | ``` 156 | image_stats_bash = \ 157 | """ 158 | # Get a list of content providers and their image count 159 | curl -H "Authorization: Bearer DLBYIcfnKfolaXKcmMC8RIDCavc2hW" http://api.creativecommons.engineering/v1/sources 160 | """ # noqa 161 | 162 | @swagger_auto_schema(operation_id='image_stats', 163 | operation_description=image_stats_description, 164 | responses=image_stats_response, 165 | code_examples=[ 166 | { 167 | 'lang': 'Bash', 168 | 'source': image_stats_bash 169 | } 170 | ]) 171 | ``` 172 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Creative Commons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /analytics/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ENV PYTHONBUFFERED 1 4 | ENV PYTHONPATH . 5 | WORKDIR /analytics 6 | 7 | # Install Python dependency management tools 8 | RUN pip install --upgrade pip \ 9 | && pip install --upgrade setuptools \ 10 | && pip install --upgrade pipenv 11 | 12 | # Copy the Pipenv files into the container 13 | COPY . /analytics/ 14 | 15 | RUN pipenv install 16 | EXPOSE 8090 17 | ENTRYPOINT pipenv run gunicorn -b '0.0.0.0:8090' server:api 18 | -------------------------------------------------------------------------------- /analytics/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | falcon = "*" 10 | sqlalchemy = "*" 11 | psycopg2 = "*" 12 | alembic = "*" 13 | gunicorn = "*" 14 | requests = "*" 15 | pytest = "*" 16 | falcon-cors = "*" 17 | confluent-kafka = "*" 18 | -------------------------------------------------------------------------------- /analytics/README.md: -------------------------------------------------------------------------------- 1 | # CC Search Analytics 2 | 3 | ## Purpose 4 | 5 | The `analytics` server collects information about anonymous usage of CC Search. 
6 | We intend to use this information to generate statistics about the quality of 7 | search results; the API may be extended in the future to produce usage data 8 | reports. 9 | 10 | To minimize risks to privacy, data is only connected to an anonymous session 11 | UUID, which changes every time a user visits CC Search. No other 12 | identifying information is collected for analytical purposes. We intend to 13 | consume this raw data to produce aggregated reports, after which the raw 14 | data (along with session UUIDs) will be promptly deleted. 15 | 16 | ## Running the server 17 | 18 | The analytics server is automatically started by `docker-compose` in the parent 19 | directory. Before analytics endpoints can be called, the database needs to 20 | be set up with `../load_sample_data.sh`. 21 | 22 | To run the `analytics` container by itself: 23 | 24 | ``` 25 | cd ../ 26 | docker-compose up db analytics 27 | # Set up the database. 28 | cd analytics 29 | alembic upgrade head 30 | ``` 31 | 32 | ## Generating new database migrations 33 | After updating `models.py`, you will need to produce new database migrations. 34 | 35 | `alembic revision --autogenerate -m "A message concisely explaining the purpose of your new migration"` 36 | 37 | ## Running the tests 38 | 39 | ``` 40 | pipenv install 41 | pipenv run pytest tests.py 42 | ``` 43 | 44 | ## Documentation 45 | 46 | After starting the server, you can view the documentation by visiting the 47 | root path (e.g. localhost:8090/). You may have to tweak `docs/redoc.html` for 48 | this to work on your local machine. 49 | 50 | Alternatively, you can view the production version of the documentation at 51 | `https://api.creativecommons.engineering/analytics`. 52 | 53 | ## Contributing / Code Structure 54 | 55 | Pull requests are welcome. Please make sure to update the unit tests and 56 | OpenAPI documentation (`docs/swagger.yaml`) where appropriate. 57 | 58 | `analytics` uses a model-view-controller pattern. It is intended to be simple 59 | and idiomatic Python. You shouldn't need to know much else besides that to get 60 | started. 61 | 62 | Key technologies to familiarize yourself with include: 63 | * [Falcon](https://falcon.readthedocs.io/en/stable/), a backend API web framework. 64 | * [SQLAlchemy](https://www.sqlalchemy.org/), a database ORM. 65 | * [Alembic](https://alembic.sqlalchemy.org/en/latest/), a lightweight database migration tool for SQLAlchemy. 66 | * [pipenv](https://docs.pipenv.org/en/latest/) for packaging. 67 | * [Docker](https://www.docker.com/) for containerization. 68 | * [OpenAPI](https://www.openapis.org/) (AKA Swagger) for human- and machine-readable documentation. 69 | -------------------------------------------------------------------------------- /analytics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/analytics/__init__.py -------------------------------------------------------------------------------- /analytics/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = migrations 6 | 7 | # template used to generate migration files 8 | # file_template = %%(rev)s_%%(slug)s 9 | 10 | # timezone to use when rendering the date 11 | # within the migration file as well as the filename.
12 | # string value is passed to dateutil.tz.gettz() 13 | # leave blank for localtime 14 | # timezone = 15 | 16 | # max length of characters to apply to the 17 | # "slug" field 18 | # truncate_slug_length = 40 19 | 20 | # set to 'true' to run the environment during 21 | # the 'revision' command, regardless of autogenerate 22 | # revision_environment = false 23 | 24 | # set to 'true' to allow .pyc and .pyo files without 25 | # a source .py file to be detected as revisions in the 26 | # versions/ directory 27 | # sourceless = false 28 | 29 | # version location specification; this defaults 30 | # to migrations/versions. When using multiple version 31 | # directories, initial revisions must be specified with --version-path 32 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions 33 | 34 | # the output encoding used when revision files 35 | # are written from script.py.mako 36 | # output_encoding = utf-8 37 | 38 | # Logging configuration 39 | [loggers] 40 | keys = root,sqlalchemy,alembic 41 | 42 | [handlers] 43 | keys = console 44 | 45 | [formatters] 46 | keys = generic 47 | 48 | [logger_root] 49 | level = WARN 50 | handlers = console 51 | qualname = 52 | 53 | [logger_sqlalchemy] 54 | level = WARN 55 | handlers = 56 | qualname = sqlalchemy.engine 57 | 58 | [logger_alembic] 59 | level = INFO 60 | handlers = 61 | qualname = alembic 62 | 63 | [handler_console] 64 | class = StreamHandler 65 | args = (sys.stderr,) 66 | level = NOTSET 67 | formatter = generic 68 | 69 | [formatter_generic] 70 | format = %(levelname)-5.5s [%(name)s] %(message)s 71 | datefmt = %H:%M:%S 72 | -------------------------------------------------------------------------------- /analytics/attribution_worker.py: -------------------------------------------------------------------------------- 1 | import settings 2 | import json 3 | import logging as log 4 | import urllib.parse as urlparse 5 | from urllib.parse import parse_qs 6 | from uuid import UUID 7 | from models import AttributionReferrerEvent 8 | from sqlalchemy import create_engine 9 | from sqlalchemy.orm import sessionmaker 10 | from confluent_kafka import Consumer 11 | 12 | 13 | def parse_identifier(resource): 14 | identifier = None 15 | parsed_url = urlparse.urlparse(resource) 16 | query = parsed_url.query 17 | if query: 18 | try: 19 | query_parsed = parse_qs(query) 20 | image_id = query_parsed['image_id'][0] 21 | identifier = str(UUID(image_id)) 22 | except (KeyError, ValueError, TypeError): 23 | identifier = None 24 | return identifier 25 | 26 | 27 | def parse_message(msg): 28 | if msg is None: 29 | return None 30 | try: 31 | decoded = json.loads(msg) 32 | decoded = json.loads(scrub_malformed(decoded['message'])) 33 | resource = decoded['request'].split(' ')[1] 34 | _id = parse_identifier(resource) 35 | parsed = { 36 | 'http_referer': decoded['http_referer'], 37 | 'resource': decoded['request'].split(' ')[1], 38 | 'identifier': _id 39 | } 40 | except (json.JSONDecodeError, KeyError): 41 | log.warning(f'Failed to parse {msg}. 
Reason: ', exc_info=True) 42 | parsed = None 43 | return parsed 44 | 45 | 46 | def save_message(validated_msg: dict, session): 47 | event = AttributionReferrerEvent( 48 | image_uuid=validated_msg['identifier'], 49 | full_referer=validated_msg['http_referer'], 50 | referer_domain=urlparse.urlparse(validated_msg['http_referer']).netloc, 51 | resource=validated_msg['resource'] 52 | ) 53 | session.add(event) 54 | session.commit() 55 | 56 | 57 | def scrub_malformed(_json: str): 58 | """ Remove some invalid JSON that NGINX sometimes spits out """ 59 | return _json.replace('\"upstream_response_time\":,', '') 60 | 61 | 62 | def is_valid(parsed_msg: dict): 63 | """ 64 | We are only interested in attribution image logs for images that are 65 | embedded in domains not owned by Creative Commons. We also want to make 66 | sure that we're only tracking hits on embedded content. 67 | """ 68 | if parsed_msg is None: 69 | return False 70 | try: 71 | referer = parsed_msg['http_referer'] 72 | resource = parsed_msg['resource'] 73 | valid = 'creativecommons.org' not in referer and '.svg' in resource 74 | except KeyError: 75 | valid = False 76 | return valid 77 | 78 | 79 | def listen(consumer, database): 80 | saved = 0 81 | ignored = 0 82 | timeout = 30 83 | while True: 84 | msg = consumer.poll(timeout=timeout) 85 | if msg: 86 | parsed_msg = parse_message(str(msg.value(), 'utf-8')) 87 | if is_valid(parsed_msg): 88 | save_message(parsed_msg, database) 89 | saved += 1 90 | else: 91 | ignored += 1 92 | else: 93 | log.info(f'No message received in {timeout}s') 94 | if (saved + ignored) % 100 == 0: 95 | log.info(f'Saved {saved} attribution events, ignored {ignored}') 96 | 97 | 98 | if __name__ == '__main__': 99 | log.basicConfig( 100 | filename=settings.ATTRIBUTION_LOGFILE, 101 | format='%(asctime)s %(message)s', 102 | level=log.INFO 103 | ) 104 | consumer_settings = { 105 | 'bootstrap.servers': settings.KAFKA_HOSTS, 106 | 'group.id': 'attribution_streamer', 107 | 'auto.offset.reset': 'earliest' 108 | } 109 | c = Consumer(consumer_settings) 110 | c.subscribe([settings.KAFKA_TOPIC_NAME]) 111 | engine = create_engine(settings.DATABASE_CONNECTION) 112 | session_maker = sessionmaker(bind=engine) 113 | session = session_maker() 114 | listen(c, session) 115 | -------------------------------------------------------------------------------- /analytics/backdate.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import settings 3 | from sqlalchemy import create_engine 4 | from sqlalchemy.orm import sessionmaker 5 | from analytics.report_controller import ( 6 | generate_usage_report, generate_source_usage_report, 7 | generate_referrer_usage_report, generate_top_searches, 8 | generate_top_result_clicks 9 | ) 10 | """ 11 | A one-off script for generating analytics reports back to September 2019, when 12 | we first started collecting analytics data.
13 | """ 14 | 15 | 16 | engine = create_engine(settings.DATABASE_CONNECTION) 17 | session_maker = sessionmaker(bind=engine) 18 | session = session_maker() 19 | backdate_limit = datetime.datetime(year=2019, month=9, day=10) 20 | current_end_date = datetime.datetime.utcnow() 21 | while current_end_date > backdate_limit: 22 | start_date = current_end_date - datetime.timedelta(days=1) 23 | 24 | generate_usage_report(session, start_date, current_end_date) 25 | generate_source_usage_report(session, start_date, current_end_date) 26 | generate_referrer_usage_report(session, start_date, current_end_date) 27 | generate_top_searches(session, start_date, current_end_date) 28 | generate_top_result_clicks(session, start_date, current_end_date) 29 | 30 | current_end_date -= datetime.timedelta(days=1) 31 | print(f'Generated backdated reports for {current_end_date}') 32 | -------------------------------------------------------------------------------- /analytics/docs/redoc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ReDoc 5 | 6 | 7 | 8 | 9 | 10 | 13 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /analytics/docs/swagger.yaml: -------------------------------------------------------------------------------- 1 | swagger: "2.0" 2 | info: 3 | description: "An API for registering anonymous usage data events in CC Search, which we intend to use to improve the quality of the search results." 4 | version: "1.0.0" 5 | title: "CC Search Usage Data API" 6 | termsOfService: "https://api.creativecommons.engineering/terms_of_service.html" 7 | contact: 8 | email: "alden@creativecommons.org" 9 | license: 10 | name: "MIT License" 11 | url: "https://github.com/creativecommons/cccatalog-api/blob/master/LICENSE" 12 | host: "api.creativecommons.engineering" 13 | basePath: "/analytics" 14 | tags: 15 | - name: "Register events" 16 | description: "Send events to the analytics server." 17 | schemes: 18 | - "https" 19 | paths: 20 | /search_event: 21 | post: 22 | tags: 23 | - "Register events" 24 | summary: "Register a search query event." 25 | description: "" 26 | operationId: "addSearch" 27 | consumes: 28 | - "application/json" 29 | produces: 30 | - "application/json" 31 | parameters: 32 | - in: "body" 33 | name: "body" 34 | description: "The user's search query and unique session UUID." 35 | required: true 36 | schema: 37 | $ref: "#/definitions/CreateSearchEvent" 38 | 39 | responses: 40 | 400: 41 | description: "Invalid input" 42 | /search_rating_event: 43 | post: 44 | tags: 45 | - "Register events" 46 | summary: "Submit a user's rating of a search." 47 | description: "" 48 | operationId: "addSearchRating" 49 | consumes: 50 | - "application/json" 51 | produces: 52 | - "application/json" 53 | parameters: 54 | - in: "body" 55 | name: "body" 56 | required: true 57 | schema: 58 | $ref: "#/definitions/CreateSearchRatingEvent" 59 | 60 | responses: 61 | 201: 62 | description: "Created" 63 | 400: 64 | description: "Invalid input" 65 | /result_click_event: 66 | post: 67 | tags: 68 | - "Register events" 69 | summary: "Submit an event indicating which result was clicked for a given search query." 
70 | description: "" 71 | operationId: "addResultClick" 72 | consumes: 73 | - "application/json" 74 | produces: 75 | - "application/json" 76 | parameters: 77 | - in: "body" 78 | name: "body" 79 | required: true 80 | schema: 81 | $ref: "#/definitions/CreateResultClickEvent" 82 | responses: 83 | 201: 84 | description: "Created" 85 | 400: 86 | description: "Invalid input" 87 | /detail_page_event: 88 | post: 89 | tags: 90 | - "Register events" 91 | summary: "Record events occurring on detail pages, such as sharing an image to social media or clicking through to its source." 92 | description: "" 93 | operationId: "addDetailPageEvent" 94 | consumes: 95 | - "application/json" 96 | produces: 97 | - "application/json" 98 | parameters: 99 | - in: "body" 100 | name: "body" 101 | required: true 102 | schema: 103 | $ref: "#/definitions/CreateDetailPageEvent" 104 | responses: 105 | 201: 106 | description: "Created" 107 | 400: 108 | description: "Invalid input" 109 | 110 | definitions: 111 | CreateSearchEvent: 112 | type: "object" 113 | required: 114 | - query 115 | - session_uuid 116 | properties: 117 | query: 118 | type: "string" 119 | session_uuid: 120 | type: "string" 121 | example: "12345678-1234-1234-1234-1234567890ab" 122 | 123 | CreateSearchRatingEvent: 124 | type: "object" 125 | required: 126 | - query 127 | - relevant 128 | properties: 129 | query: 130 | type: "string" 131 | description: "A unique identifier labeling an anonymous user's session." 132 | relevant: 133 | type: "boolean" 134 | example: true 135 | 136 | CreateResultClickEvent: 137 | type: "object" 138 | required: 139 | - query 140 | - session_uuid 141 | - result_uuid 142 | - result_rank 143 | properties: 144 | query: 145 | type: "string" 146 | result_rank: 147 | type: "integer" 148 | example: 2 149 | description: "The position of the result in the search results grid, e.g. 0 for the first result, or 22 for the 21st result." 150 | result_uuid: 151 | type: "string" 152 | example: "12345678-1234-1234-1234-1234567890ab" 153 | description: "The unique identifier for the result that was clicked." 154 | session_uuid: 155 | type: "string" 156 | example: "12345678-1234-1234-1234-1234567890ab" 157 | description: "A unique identifier labeling an anonymous user's session." 158 | 159 | CreateDetailPageEvent: 160 | type: "object" 161 | required: 162 | - event_type 163 | - result_uuid 164 | properties: 165 | event_type: 166 | type: "string" 167 | description: > 168 | Supported event types: 169 | * `ATTRIBUTION_CLICKED` - The user generated an attribution string for this result. 170 | * `REUSE_SURVEY` - The user took a reuse survey. 171 | * `SOURCE_CLICKED` - The user visited the source page of the work. 172 | * `CREATOR_CLICKED` - The user visited the creator of the work's page. 173 | * `SHARED_SOCIAL` - The user shared a link to the work on social media. 174 | example: "ATTRIBUTION_CLICKED" 175 | enum: 176 | - ATTRIBUTION_CLICKED 177 | - REUSE_SURVEY 178 | - SOURCE_CLICKED 179 | - CREATOR_CLICKED 180 | - SHARED_SOCIAL 181 | result_uuid: 182 | type: "string" 183 | example: "12345678-1234-1234-1234-1234567890ab" 184 | description: "The unique identifier for the detail page associated with the event." 
185 | 186 | externalDocs: 187 | description: "The Creative Commons search API" 188 | url: "https://api.creativecommons.engineering" 189 | -------------------------------------------------------------------------------- /analytics/event_controller.py: -------------------------------------------------------------------------------- 1 | from models import SearchEvent, SearchRatingEvent, ResultClickedEvent, \ 2 | DetailPageEvent, DetailPageEvents 3 | from sqlalchemy import create_engine 4 | from sqlalchemy.orm import sessionmaker 5 | from settings import DATABASE_CONNECTION 6 | 7 | class EventController: 8 | def __init__(self): 9 | self.engine = create_engine(DATABASE_CONNECTION) 10 | 11 | def _persist(self, _object): 12 | Session = sessionmaker(bind=self.engine) 13 | session = Session() 14 | session.add(_object) 15 | session.commit() 16 | 17 | def create_search(self, session_uuid, query): 18 | search = SearchEvent( 19 | session_uuid=session_uuid, 20 | query=query 21 | ) 22 | self._persist(search) 23 | 24 | def create_search_rating(self, query, relevant): 25 | if type(relevant) != bool: 26 | raise ValueError('Invalid rating; must be a boolean.') 27 | search_rating = SearchRatingEvent( 28 | query=query, 29 | relevant=relevant 30 | ) 31 | self._persist(search_rating) 32 | 33 | def create_result_click(self, session_uuid, result_uuid, query, rank): 34 | result_click = ResultClickedEvent( 35 | session_uuid=session_uuid, 36 | result_uuid=result_uuid, 37 | query=query, 38 | result_rank=rank 39 | ) 40 | self._persist(result_click) 41 | 42 | def create_detail_event(self, event, result_uuid): 43 | _event = DetailPageEvents[event] 44 | detail_event = DetailPageEvent( 45 | event_type=_event, 46 | result_uuid=result_uuid 47 | ) 48 | self._persist(detail_event) 49 | 50 | def list_valid_detail_events(self): 51 | return [k.name for k in DetailPageEvents] 52 | -------------------------------------------------------------------------------- /analytics/gen_daily_report.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import settings 3 | import logging as log 4 | from sqlalchemy import create_engine 5 | from sqlalchemy.orm import sessionmaker 6 | from analytics.report_controller import ( 7 | generate_usage_report, generate_source_usage_report, 8 | generate_referrer_usage_report, generate_top_searches, 9 | generate_top_result_clicks 10 | ) 11 | 12 | engine = create_engine(settings.DATABASE_CONNECTION) 13 | session_maker = sessionmaker(bind=engine) 14 | session = session_maker() 15 | end_date = datetime.datetime.utcnow() 16 | start_date = end_date - datetime.timedelta(days=1) 17 | 18 | generate_usage_report(session, start_date, end_date) 19 | generate_source_usage_report(session, start_date, end_date) 20 | generate_referrer_usage_report(session, start_date, end_date) 21 | generate_top_searches(session, start_date, end_date) 22 | generate_top_result_clicks(session, start_date, end_date) 23 | 24 | log.info(f'Generated analytics reports for {end_date}') 25 | -------------------------------------------------------------------------------- /analytics/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. 
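Since the event endpoints documented in `docs/swagger.yaml` above are plain JSON over HTTP, exercising one from Python is straightforward. Here is a hedged client sketch for `/search_event`: the payload fields follow the `CreateSearchEvent` definition, and the URL combines the spec's `host` and `basePath` (point it at a local instance such as `http://localhost:8090` during development).

```python
# Sketch: record a search event, per the CreateSearchEvent schema in
# docs/swagger.yaml. The query value here is only an example.
import uuid
import requests

payload = {
    'query': 'sunset',                  # the user's search query
    'session_uuid': str(uuid.uuid4()),  # anonymous per-visit session id
}
resp = requests.post(
    'https://api.creativecommons.engineering/analytics/search_event',
    json=payload,
)
print(resp.status_code)  # the Falcon server returns 201 Created on success
```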
-------------------------------------------------------------------------------- /analytics/migrations/env.py: -------------------------------------------------------------------------------- 1 | 2 | from logging.config import fileConfig 3 | 4 | from sqlalchemy import engine_from_config 5 | from sqlalchemy import pool 6 | 7 | from alembic import context 8 | import inspect 9 | from settings import DATABASE_CONNECTION 10 | import models 11 | from models import * 12 | # this is the Alembic Config object, which provides 13 | # access to the values within the .ini file in use. 14 | config = context.config 15 | config.set_main_option('sqlalchemy.url', DATABASE_CONNECTION) 16 | # Interpret the config file for Python logging. 17 | # This line sets up loggers basically. 18 | fileConfig(config.config_file_name) 19 | 20 | # add your model's MetaData object here 21 | # for 'autogenerate' support 22 | # from myapp import mymodel 23 | # target_metadata = mymodel.Base.metadata 24 | target_metadata = Base.metadata 25 | 26 | # other values from the config, defined by the needs of env.py, 27 | # can be acquired: 28 | # my_important_option = config.get_main_option("my_important_option") 29 | # ... etc. 30 | 31 | 32 | def include_object(object, name, type_, reflected, compare_to): 33 | """ 34 | Tells Alembic whether it owns an object. This can be used to exclude 35 | objects from autogenerated migrations. 36 | """ 37 | valid_names = set() 38 | for name, obj in inspect.getmembers(models): 39 | if inspect.isclass(obj): 40 | if hasattr(obj, '__tablename__'): 41 | valid_names.add(str(obj.__tablename__)) 42 | if type_ == "table": 43 | if str(object) == "image": 44 | return False 45 | elif str(object) in valid_names: 46 | return True 47 | else: 48 | return False 49 | else: 50 | return True 51 | 52 | 53 | def run_migrations_offline(): 54 | """Run migrations in 'offline' mode. 55 | 56 | This configures the context with just a URL 57 | and not an Engine, though an Engine is acceptable 58 | here as well. By skipping the Engine creation 59 | we don't even need a DBAPI to be available. 60 | 61 | Calls to context.execute() here emit the given string to the 62 | script output. 63 | 64 | """ 65 | url = DATABASE_CONNECTION 66 | context.configure( 67 | url=url, target_metadata=target_metadata, literal_binds=True, 68 | include_object=include_object 69 | ) 70 | 71 | with context.begin_transaction(): 72 | context.run_migrations() 73 | 74 | 75 | def run_migrations_online(): 76 | """Run migrations in 'online' mode. 77 | 78 | In this scenario we need to create an Engine 79 | and associate a connection with the context. 
80 | 81 | """ 82 | connectable = engine_from_config( 83 | config.get_section(config.config_ini_section), 84 | prefix="sqlalchemy.", 85 | poolclass=pool.NullPool, 86 | ) 87 | 88 | with connectable.connect() as connection: 89 | context.configure( 90 | connection=connection, target_metadata=target_metadata, 91 | include_object=include_object 92 | ) 93 | 94 | with context.begin_transaction(): 95 | context.run_migrations() 96 | 97 | 98 | if context.is_offline_mode(): 99 | run_migrations_offline() 100 | else: 101 | run_migrations_online() 102 | -------------------------------------------------------------------------------- /analytics/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /analytics/migrations/versions/0cd416f5a7d2_add_attribution_events_table.py: -------------------------------------------------------------------------------- 1 | """Add attribution events table 2 | 3 | Revision ID: 0cd416f5a7d2 4 | Revises: 7695412f8a64 5 | Create Date: 2020-09-11 15:43:24.507088 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '0cd416f5a7d2' 14 | down_revision = '7695412f8a64' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('attribution_referrer_event', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True), 24 | sa.Column('image_uuid', postgresql.UUID(), nullable=True), 25 | sa.Column('full_referer', sa.String(), nullable=True), 26 | sa.Column('referer_domain', sa.String(), nullable=True), 27 | sa.Column('resource', sa.String(), nullable=True), 28 | sa.PrimaryKeyConstraint('id') 29 | ) 30 | op.create_index(op.f('ix_attribution_referrer_event_image_uuid'), 'attribution_referrer_event', ['image_uuid'], unique=False) 31 | op.create_index(op.f('ix_attribution_referrer_event_referer_domain'), 'attribution_referrer_event', ['referer_domain'], unique=False) 32 | op.create_index(op.f('ix_attribution_referrer_event_resource'), 'attribution_referrer_event', ['resource'], unique=False) 33 | op.create_index(op.f('ix_attribution_referrer_event_timestamp'), 'attribution_referrer_event', ['timestamp'], unique=False) 34 | # ### end Alembic commands ### 35 | 36 | 37 | def downgrade(): 38 | # ### commands auto generated by Alembic - please adjust! 
### 39 | op.drop_index(op.f('ix_attribution_referrer_event_timestamp'), table_name='attribution_referrer_event') 40 | op.drop_index(op.f('ix_attribution_referrer_event_resource'), table_name='attribution_referrer_event') 41 | op.drop_index(op.f('ix_attribution_referrer_event_referer_domain'), table_name='attribution_referrer_event') 42 | op.drop_index(op.f('ix_attribution_referrer_event_image_uuid'), table_name='attribution_referrer_event') 43 | op.drop_table('attribution_referrer_event') 44 | # ### end Alembic commands ### 45 | -------------------------------------------------------------------------------- /analytics/migrations/versions/54e56668b66a_regenerate_initial_migration.py: -------------------------------------------------------------------------------- 1 | """Regenerate initial migration 2 | 3 | Revision ID: 54e56668b66a 4 | Revises: 5 | Create Date: 2019-11-07 13:57:47.146441 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '54e56668b66a' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('detail_page_event', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True), 24 | sa.Column('result_uuid', postgresql.UUID(), nullable=True), 25 | sa.Column('event_type', sa.Enum('ATTRIBUTION_CLICKED', 'REUSE_SURVEY', 'SOURCE_CLICKED', 'CREATOR_CLICKED', 'SHARED_SOCIAL', name='detailpageevents'), nullable=True), 26 | sa.PrimaryKeyConstraint('id') 27 | ) 28 | op.create_index(op.f('ix_detail_page_event_event_type'), 'detail_page_event', ['event_type'], unique=False) 29 | op.create_index(op.f('ix_detail_page_event_result_uuid'), 'detail_page_event', ['result_uuid'], unique=False) 30 | op.create_index(op.f('ix_detail_page_event_timestamp'), 'detail_page_event', ['timestamp'], unique=False) 31 | op.create_table('result_clicked_event', 32 | sa.Column('id', sa.Integer(), nullable=False), 33 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True), 34 | sa.Column('session_uuid', postgresql.UUID(), nullable=True), 35 | sa.Column('result_uuid', postgresql.UUID(), nullable=True), 36 | sa.Column('query', sa.String(), nullable=True), 37 | sa.Column('result_rank', sa.Integer(), nullable=True), 38 | sa.PrimaryKeyConstraint('id') 39 | ) 40 | op.create_index(op.f('ix_result_clicked_event_query'), 'result_clicked_event', ['query'], unique=False) 41 | op.create_index(op.f('ix_result_clicked_event_result_uuid'), 'result_clicked_event', ['result_uuid'], unique=False) 42 | op.create_index(op.f('ix_result_clicked_event_session_uuid'), 'result_clicked_event', ['session_uuid'], unique=False) 43 | op.create_index(op.f('ix_result_clicked_event_timestamp'), 'result_clicked_event', ['timestamp'], unique=False) 44 | op.create_table('search_event', 45 | sa.Column('id', sa.Integer(), nullable=False), 46 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True), 47 | sa.Column('query', sa.String(), nullable=True), 48 | sa.Column('session_uuid', postgresql.UUID(), nullable=True), 49 | sa.PrimaryKeyConstraint('id') 50 | ) 51 | op.create_index(op.f('ix_search_event_query'), 'search_event', ['query'], unique=False) 52 | op.create_index(op.f('ix_search_event_session_uuid'), 'search_event', ['session_uuid'], 
unique=False) 53 | op.create_index(op.f('ix_search_event_timestamp'), 'search_event', ['timestamp'], unique=False) 54 | op.create_table('search_rating_event', 55 | sa.Column('id', sa.Integer(), nullable=False), 56 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True), 57 | sa.Column('query', sa.String(), nullable=True), 58 | sa.Column('rating', sa.Integer(), nullable=True), 59 | sa.PrimaryKeyConstraint('id') 60 | ) 61 | op.create_index(op.f('ix_search_rating_event_query'), 'search_rating_event', ['query'], unique=False) 62 | op.create_index(op.f('ix_search_rating_event_timestamp'), 'search_rating_event', ['timestamp'], unique=False) 63 | # ### end Alembic commands ### 64 | 65 | 66 | def downgrade(): 67 | # ### commands auto generated by Alembic - please adjust! ### 68 | op.drop_index(op.f('ix_search_rating_event_timestamp'), table_name='search_rating_event') 69 | op.drop_index(op.f('ix_search_rating_event_query'), table_name='search_rating_event') 70 | op.drop_table('search_rating_event') 71 | op.drop_index(op.f('ix_search_event_timestamp'), table_name='search_event') 72 | op.drop_index(op.f('ix_search_event_session_uuid'), table_name='search_event') 73 | op.drop_index(op.f('ix_search_event_query'), table_name='search_event') 74 | op.drop_table('search_event') 75 | op.drop_index(op.f('ix_result_clicked_event_timestamp'), table_name='result_clicked_event') 76 | op.drop_index(op.f('ix_result_clicked_event_session_uuid'), table_name='result_clicked_event') 77 | op.drop_index(op.f('ix_result_clicked_event_result_uuid'), table_name='result_clicked_event') 78 | op.drop_index(op.f('ix_result_clicked_event_query'), table_name='result_clicked_event') 79 | op.drop_table('result_clicked_event') 80 | op.drop_index(op.f('ix_detail_page_event_timestamp'), table_name='detail_page_event') 81 | op.drop_index(op.f('ix_detail_page_event_result_uuid'), table_name='detail_page_event') 82 | op.drop_index(op.f('ix_detail_page_event_event_type'), table_name='detail_page_event') 83 | op.drop_table('detail_page_event') 84 | # ### end Alembic commands ### 85 | -------------------------------------------------------------------------------- /analytics/migrations/versions/7695412f8a64_switch_to_boolean_search_rating_instead_.py: -------------------------------------------------------------------------------- 1 | """Switch to boolean search rating instead of 1-5 star rating 2 | 3 | Revision ID: 7695412f8a64 4 | Revises: 54e56668b66a 5 | Create Date: 2019-11-07 14:13:50.764789 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '7695412f8a64' 14 | down_revision = '54e56668b66a' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('search_rating_event', sa.Column('relevant', sa.Boolean(), nullable=True)) 22 | op.create_index(op.f('ix_search_rating_event_relevant'), 'search_rating_event', ['relevant'], unique=False) 23 | op.drop_column('search_rating_event', 'rating') 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade(): 28 | # ### commands auto generated by Alembic - please adjust! 
### 29 | op.add_column('search_rating_event', sa.Column('rating', sa.INTEGER(), autoincrement=False, nullable=True)) 30 | op.drop_index(op.f('ix_search_rating_event_relevant'), table_name='search_rating_event') 31 | op.drop_column('search_rating_event', 'relevant') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /analytics/models.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from sqlalchemy import Integer, Column, Enum, String, DateTime, Boolean, Float 3 | from sqlalchemy.dialects.postgresql import UUID 4 | from sqlalchemy.sql import func 5 | from sqlalchemy.ext.declarative import declarative_base 6 | 7 | Base = declarative_base() 8 | 9 | class Image(Base): 10 | __tablename__ = "image" 11 | # Managed by Django ORM; partially duplicated here so we can join 12 | # analytics and image data together. This is excluded from migrations. 13 | id = Column(Integer, primary_key=True) 14 | identifier = Column(UUID) 15 | source = Column(String) 16 | provider = Column(String) 17 | title = Column(String) 18 | 19 | 20 | class EventMixin(object): 21 | id = Column(Integer, primary_key=True) 22 | timestamp = Column(DateTime, server_default=func.now(), index=True) 23 | 24 | 25 | class ReportMixin(object): 26 | id = Column(Integer, primary_key=True) 27 | start_time = Column(DateTime, index=True) 28 | end_time = Column(DateTime, index=True) 29 | 30 | 31 | class SearchEvent(Base, EventMixin): 32 | """ 33 | Store searches linked to a session UUID. 34 | """ 35 | __tablename__ = "search_event" 36 | 37 | query = Column(String, index=True) 38 | session_uuid = Column(UUID, index=True) 39 | 40 | 41 | class SearchRatingEvent(Base, EventMixin): 42 | """ 43 | Users can provide feedback about the quality of search results. 44 | """ 45 | __tablename__= "search_rating_event" 46 | 47 | query = Column(String, index=True) 48 | relevant = Column(Boolean, index=True) 49 | 50 | 51 | class ResultClickedEvent(Base, EventMixin): 52 | """ 53 | Link result clicks to search sessions. 54 | """ 55 | __tablename__ = "result_clicked_event" 56 | 57 | session_uuid = Column(UUID, index=True) 58 | result_uuid = Column(UUID, index=True) 59 | query = Column(String, index=True) 60 | result_rank = Column(Integer) 61 | 62 | 63 | class DetailPageEvents(enum.Enum): 64 | ATTRIBUTION_CLICKED = enum.auto() 65 | REUSE_SURVEY = enum.auto() 66 | SOURCE_CLICKED = enum.auto() 67 | CREATOR_CLICKED = enum.auto() 68 | SHARED_SOCIAL = enum.auto() 69 | 70 | 71 | class DetailPageEvent(Base, EventMixin): 72 | """ 73 | Events that happen on result pages, such as clicking an attribution button 74 | or sharing the result on social media. 75 | """ 76 | __tablename__ = "detail_page_event" 77 | 78 | result_uuid = Column(UUID, index=True) 79 | event_type = Column(Enum(DetailPageEvents), index=True) 80 | 81 | 82 | class AttributionReferrerEvent(Base, EventMixin): 83 | """ 84 | Triggered by a user's browser loading one of our static assets on a non-CC 85 | site. By parsing server logs, we can determine which work was embedded and 86 | on which domain it appeared. 87 | """ 88 | __tablename__ = "attribution_referrer_event" 89 | 90 | image_uuid = Column(UUID, index=True) 91 | full_referer = Column(String) 92 | referer_domain = Column(String, index=True) 93 | # The path to the embedded asset on our server. 
ex: /static/img/cc-by.svg 94 | resource = Column(String, index=True) 95 | 96 | # Reports 97 | 98 | 99 | class UsageReport(Base, ReportMixin): 100 | """ Tracks statistics for the last 24 hours """ 101 | __tablename__ = "usage_reports" 102 | results_clicked = Column(Integer) 103 | attribution_buttonclicks = Column(Integer) 104 | survey_responses = Column(Integer) 105 | source_clicked = Column(Integer) 106 | creator_clicked = Column(Integer) 107 | shared_social = Column(Integer) 108 | sessions = Column(Integer) 109 | searches = Column(Integer) 110 | attribution_referer_hits = Column(Integer) 111 | avg_rating = Column(Float) 112 | avg_searches_per_session = Column(Float) 113 | 114 | 115 | class SourceUsageReport(Base, ReportMixin): 116 | __tablename__ = "source_report" 117 | 118 | source_id = Column(String, index=True) 119 | result_clicks = Column(Integer, index=True) 120 | 121 | 122 | class AttributionRefererReport(Base, ReportMixin): 123 | __tablename__ = "attribution_referer_report" 124 | 125 | domain = Column(String, index=True) 126 | hits = Column(Integer, index=True) 127 | 128 | 129 | class TopSearchesReport(Base, ReportMixin): 130 | __tablename__ = "top_searches" 131 | term = Column(String, index=True) 132 | hits = Column(Integer, index=True) 133 | 134 | 135 | class TopResultsReport(Base, ReportMixin): 136 | __tablename__ = "top_results" 137 | result_uuid = Column(UUID, index=True) 138 | hits = Column(Integer, index=True) 139 | source = Column(String, index=True) 140 | title = Column(String, index=True) -------------------------------------------------------------------------------- /analytics/server.py: -------------------------------------------------------------------------------- 1 | import falcon 2 | from falcon_cors import CORS 3 | from event_controller import EventController 4 | 5 | event_controller = EventController() 6 | 7 | class SearchEventResource: 8 | def on_post(self, req, resp): 9 | j = req.media 10 | event_controller.create_search( 11 | query=j['query'], 12 | session_uuid=j['session_uuid'] 13 | ) 14 | resp.status = falcon.HTTP_201 15 | 16 | 17 | class SearchRatingEventResource: 18 | def on_post(self, req, resp): 19 | j = req.media 20 | try: 21 | event_controller.create_search_rating( 22 | query=j['query'], 23 | relevant=j['relevant'] 24 | ) 25 | resp.status = falcon.HTTP_201 26 | except ValueError: 27 | resp.body = '{"message": "Rating must be True or False"}' 28 | resp.status = falcon.HTTP_400 29 | 30 | 31 | class ResultClickEventResource: 32 | def on_post(self, req, resp): 33 | j = req.media 34 | event_controller.create_result_click( 35 | session_uuid=j['session_uuid'], 36 | result_uuid=j['result_uuid'], 37 | query=j['query'], 38 | rank=j['result_rank'] 39 | ) 40 | resp.status = falcon.HTTP_201 41 | 42 | 43 | class DetailEventResource: 44 | def on_post(self, req, resp): 45 | j = req.media 46 | try: 47 | event_controller.create_detail_event( 48 | event=j['event_type'], 49 | result_uuid=j['result_uuid'] 50 | ) 51 | resp.status = falcon.HTTP_201 52 | except KeyError: 53 | valid_events = event_controller.list_valid_detail_events() 54 | resp.body = \ 55 | '{{"message": "Invalid event_type. 
Valid types: {}"}}' \ 56 | .format(valid_events) 57 | resp.status = falcon.HTTP_400 58 | 59 | 60 | class RedocResource: 61 | def on_get(self, req, resp): 62 | resp.status = falcon.HTTP_200 63 | resp.content_type = 'text/html' 64 | with open('docs/redoc.html', 'r') as f: 65 | resp.body = f.read() 66 | 67 | 68 | class OpenAPISpecResource: 69 | def on_get(self, req, resp): 70 | resp.status = falcon.HTTP_200 71 | resp.content_type = 'text/html' 72 | with open('docs/swagger.yaml', 'r') as f: 73 | resp.body = f.read() 74 | 75 | origins = [ 76 | 'https://ccsearch.creativecommons.org', 77 | 'https://ccsearch-dev.creativecommons.org', 78 | 'https://search.creativecommons.org' 79 | ] 80 | cors = CORS( 81 | allow_origins_list=origins, 82 | allow_all_methods=True, 83 | allow_all_headers=True 84 | ) 85 | api = falcon.API(middleware=[cors.middleware]) 86 | api.add_route('/', RedocResource()) 87 | api.add_route('/swagger.yaml', OpenAPISpecResource()) 88 | api.add_route('/search_event', SearchEventResource()) 89 | api.add_route('/search_rating_event', SearchRatingEventResource()) 90 | api.add_route('/result_click_event', ResultClickEventResource()) 91 | api.add_route('/detail_page_event', DetailEventResource()) 92 | -------------------------------------------------------------------------------- /analytics/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATABASE_CONNECTION = os.getenv( 4 | 'DATABASE_CONN', 'postgres+psycopg2://deploy:deploy@localhost/openledger' 5 | ) 6 | 7 | # Attribution events stream configuration 8 | KAFKA_HOSTS = os.getenv('KAFKA_HOSTS', 'kafka:9092') 9 | KAFKA_TOPIC_NAME = os.getenv('KAFKA_TOPIC', 'attribution_events_dev') 10 | ATTRIBUTION_LOGFILE = os.getenv('LOGFILE', '/var/log/attribution_worker.log') 11 | -------------------------------------------------------------------------------- /cccatalog-api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | 3 | ENV PYTHONUNBUFFERED 1 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y libexempi3 \ 7 | && mkdir /cccatalog-api \ 8 | && mkdir -p /var/log/cccatalog-api/cccatalog-api.log 9 | 10 | ADD cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf /usr/share/fonts/truetype/SourceSansPro-Bold.ttf 11 | 12 | WORKDIR /cccatalog-api 13 | 14 | # Install Python dependency management tools 15 | RUN pip install --upgrade pip \ 16 | && pip install --upgrade setuptools \ 17 | && pip install --upgrade pipenv 18 | 19 | # Copy the Pipenv files into the container 20 | COPY Pipfile /cccatalog-api/ 21 | COPY Pipfile.lock /cccatalog-api/ 22 | 23 | # Install the dependencies system-wide 24 | # TODO: Use build args to avoid installing dev dependencies in production 25 | RUN pipenv install --deploy --system --dev 26 | 27 | ENTRYPOINT ["./run.sh"] 28 | -------------------------------------------------------------------------------- /cccatalog-api/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | remote-pdb = "*" 8 | ipython = "*" 9 | pipdeptree = "*" 10 | pycodestyle = "*" 11 | 12 | [packages] 13 | psycopg2-binary = "*" 14 | redlock-py = "*" 15 | hvac = "*" 16 | PyJWT = "*" 17 | python3-openid = "*" 18 | wsgi-basic-auth = "*" 19 | grequests = "*" 20 | requests-oauthlib = "*" 21 | aws-requests-auth = "*" 22 | Django = "==2.2.13" 23 | Pillow = "*" 24 | django-cors-headers
= "*" 25 | django-uuslug = "*" 26 | django-sslserver = "*" 27 | django-oauth-toolkit = "==1.1.2" 28 | django-braces = "*" 29 | django-redis = "*" 30 | pytest-django = ">=3.5" 31 | djangorestframework = "*" 32 | drf-yasg = "*" 33 | elasticsearch-dsl = "==7.2.1" 34 | piexif = "*" 35 | python-xmp-toolkit = "*" 36 | deepdiff = "*" 37 | djangorestframework-xml = "*" 38 | gevent = "*" 39 | django-storages = "*" 40 | boto3 = "*" 41 | 42 | [packages.future] 43 | version = "*" 44 | 45 | [packages.ipaddress] 46 | version = "*" 47 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from cccatalog.api.models import ( 3 | ImageReport, MatureImage, DeletedImage, ContentProvider, SourceLogo, PENDING 4 | ) 5 | 6 | 7 | @admin.register(ImageReport) 8 | class ImageReportAdmin(admin.ModelAdmin): 9 | list_display = ( 10 | 'reason', 'status', 'image_url', 'description', 'created_at' 11 | ) 12 | list_filter = ('status', 'reason') 13 | list_display_links = ('status',) 14 | search_fields = ('description', 'identifier') 15 | actions = None 16 | 17 | def get_readonly_fields(self, request, obj=None): 18 | if obj is None: 19 | return [] 20 | always_readonly = [ 21 | 'reason', 'image_url', 'description', 'identifier', 'created_at' 22 | ] 23 | if obj.status == PENDING: 24 | return always_readonly 25 | else: 26 | status_readonly = ['status'] 27 | status_readonly.extend(always_readonly) 28 | return status_readonly 29 | 30 | 31 | @admin.register(MatureImage) 32 | class MatureImageAdmin(admin.ModelAdmin): 33 | search_fields = ('identifier',) 34 | 35 | 36 | @admin.register(DeletedImage) 37 | class DeletedImage(admin.ModelAdmin): 38 | search_fields = ('identifier',) 39 | 40 | 41 | class InlineImage(admin.TabularInline): 42 | model = SourceLogo 43 | 44 | 45 | @admin.register(ContentProvider) 46 | class ProviderAdmin(admin.ModelAdmin): 47 | list_display = ('provider_name', 'provider_identifier') 48 | search_fields = ('provider_name', 'provider_identifier') 49 | exclude = ('notes', 'created_on') 50 | inlines = [InlineImage] 51 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class ApiConfig(AppConfig): 5 | name = 'api' 6 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/controllers/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/controllers/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/controllers/link_controller.py: -------------------------------------------------------------------------------- 1 | # All possible letters that can appear in a shortened URL path 2 | URL_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' 3 | # Inverted index of the alphabet 4 | ALPHABET_INDEX = {c: idx for idx, c in enumerate(URL_ALPHABET)} 5 | 6 | 7 | def get_next_shortened_path(last_url): 8 | """ 9 | Produce a short URL. Each URL is guaranteed to be the shortest possible 10 | path available. 11 | :param last_url: The last allocated URL. 12 | :return: A short URL path, such as '9abx' 13 | """ 14 | def get_next_char(c): 15 | c_idx = ALPHABET_INDEX[c] 16 | next_char_idx = (c_idx + 1) % len(URL_ALPHABET) 17 | return URL_ALPHABET[next_char_idx] 18 | 19 | if last_url is None: 20 | return URL_ALPHABET[0] 21 | 22 | last_character = last_url[-1] 23 | next_character = get_next_char(last_character) 24 | 25 | temp_path = last_url 26 | if next_character == URL_ALPHABET[0]: 27 | # Iterate backwards to carry the last digit. 28 | carry = True 29 | idx = len(temp_path) - 1 30 | while idx >= 0 and carry: 31 | c = temp_path[idx] 32 | if c == URL_ALPHABET[-1]: 33 | # Wrap this digit around to the first character and keep 34 | # carrying into the digit to the left. 35 | temp_path = \ 36 | temp_path[:idx] + URL_ALPHABET[0] + temp_path[idx + 1:] 37 | if idx == 0: 38 | # Overflowed; add a new digit 39 | temp_path = temp_path + URL_ALPHABET[0] 40 | else: 41 | carry = False 42 | temp_path = \ 43 | temp_path[:idx] + get_next_char(c) + temp_path[idx + 1:] 44 | idx -= 1 45 | next_path = temp_path 46 | else: 47 | # Increment the last digit. 48 | next_path = temp_path[:-1] + next_character 49 | return next_path 50 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/licenses.py: -------------------------------------------------------------------------------- 1 | LICENSES = ( 2 | ("BY", "Attribution"), 3 | ("BY-NC", "Attribution NonCommercial"), 4 | ("BY-ND", "Attribution NoDerivatives"), 5 | ("BY-SA", "Attribution ShareAlike"), 6 | ("BY-NC-ND", "Attribution NonCommercial NoDerivatives"), 7 | ("BY-NC-SA", "Attribution NonCommercial ShareAlike"), 8 | ("PDM", "Public Domain Mark"), 9 | ("CC0", "Public Domain Dedication"), 10 | ) 11 | 12 | LICENSE_GROUPS = { 13 | # All open licenses 14 | "all": {'BY', 'BY-NC', 'BY-ND', 'BY-SA', 'BY-NC-ND', 'BY-NC-SA', 'PDM', 15 | 'CC0'}, 16 | # All CC licenses 17 | "all-cc": {'BY', 'BY-NC', 'BY-ND', 'BY-SA', 'BY-NC-ND', 'BY-NC-SA', 'CC0'}, 18 | # All licenses allowing commercial use 19 | "commercial": {'BY', 'BY-SA', 'BY-ND', 'CC0', 'PDM'}, 20 | # All licenses allowing modifications 21 | "modification": {'BY', 'BY-SA', 'BY-NC', 'BY-NC-SA', 'CC0', 'PDM'}, 22 | } 23 | 24 | ATTRIBUTION = \ 25 | "{title} {creator}is licensed under CC-{_license} {version}. To view a " \ 26 | "copy of this license, visit {license_url}."
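A hypothetical rendering of the ATTRIBUTION template above (every value here is invented for illustration). Note that the template reads `{creator}is` with no space before "is": the creator fragment appears to be expected to supply its own trailing space, e.g. 'by Jane Doe '.

from cccatalog.api.licenses import ATTRIBUTION

print(ATTRIBUTION.format(
    title='"Winter Forest"',        # hypothetical work title
    creator='by Jane Doe ',         # note the trailing space
    _license='BY-SA',
    version='4.0',
    license_url='https://creativecommons.org/licenses/by-sa/4.0/',
))
# "Winter Forest" by Jane Doe is licensed under CC-BY-SA 4.0. To view a
# copy of this license, visit https://creativecommons.org/licenses/by-sa/4.0/.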
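Stepping back to link_controller.py above: a minimal sketch of how get_next_shortened_path walks the base-62 path space, assuming the carry logic shown there. The expected values follow from the alphabet ordering a-z, A-Z, 0-9.

from cccatalog.api.controllers.link_controller import get_next_shortened_path

assert get_next_shortened_path(None) == 'a'   # first path ever allocated
assert get_next_shortened_path('a') == 'b'    # simple increment
assert get_next_shortened_path('Z') == '0'    # uppercase rolls into digits
assert get_next_shortened_path('9') == 'aa'   # overflow grows the path
assert get_next_shortened_path('a9') == 'ba'  # carry into the preceding digit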
27 | 28 | 29 | def get_license_url(_license, version, meta_data=None): 30 | license_overridden = meta_data and 'license_url' in meta_data 31 | if license_overridden and meta_data['license_url'] is not None: 32 | return meta_data['license_url'] 33 | elif _license.lower() == 'pdm': 34 | return 'https://creativecommons.org/publicdomain/mark/1.0/' 35 | else: 36 | return f'https://creativecommons.org/licenses/{_license}/{version}/' 37 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0002_auto_20180723_1737.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-07-23 17:37 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='imagelist', 15 | name='id', 16 | field=models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0003_image_view_count.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-07-26 19:53 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0002_auto_20180723_1737'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='image', 15 | name='view_count', 16 | field=models.IntegerField(default=0), 17 | ), 18 | migrations.RunSQL('ALTER TABLE image ALTER view_count SET DEFAULT 0') 19 | ] 20 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0004_shortenedlink.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-01 17:46 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0003_image_view_count'), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='ShortenedLink', 15 | fields=[ 16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 17 | ('updated_on', models.DateTimeField(auto_now=True)), 18 | ('shortened_path', models.CharField(db_index=True, help_text='The path to the shortened URL, e.g. tc3n834. 
The resulting URL will be shares.cc/tc3n834.', max_length=10, unique=True)), 19 | ('full_url', models.URLField(max_length=1000, unique=True)), 20 | ('created_on', models.DateTimeField(auto_now_add=True, db_index=True)), 21 | ], 22 | options={ 23 | 'abstract': False, 24 | }, 25 | ), 26 | ] 27 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0005_auto_20180803_1905.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-03 19:05 2 | 3 | import django.contrib.postgres.fields.jsonb 4 | from django.db import migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0004_shortenedlink'), 11 | ] 12 | 13 | operations = [ 14 | migrations.RemoveField( 15 | model_name='image', 16 | name='tags', 17 | ), 18 | migrations.AddField( 19 | model_name='image', 20 | name='tags', 21 | field=django.contrib.postgres.fields.jsonb.JSONField(default=None), 22 | preserve_default=False, 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0006_image_watermarked.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-03 19:08 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0005_auto_20180803_1905'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='image', 15 | name='watermarked', 16 | field=models.BooleanField(default=None), 17 | preserve_default=False, 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0007_auto_20180803_1909.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-03 19:09 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0006_image_watermarked'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='image', 15 | name='foreign_identifier', 16 | field=models.CharField(blank=True, db_index=True, max_length=1000, null=True, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0008_imagelist_slug.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-07 17:37 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0007_auto_20180803_1909'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='imagelist', 15 | name='slug', 16 | field=models.CharField(default=None, help_text='A unique identifier used to make a friendly URL for external downstream API consumers.', max_length=200, unique=True), 17 | preserve_default=False, 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0009_auto_20180831_1425.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-31 14:25 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class 
Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0008_imagelist_slug'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='imagelist', 15 | name='auth', 16 | field=models.CharField(default='fdsadfwetyhegaerg', help_text='A randomly generated string assigned upon list creation. Used to authenticate updates and deletions.', max_length=64), 17 | preserve_default=False, 18 | ), 19 | migrations.AlterField( 20 | model_name='imagelist', 21 | name='slug', 22 | field=models.CharField(help_text='A unique identifier used to make a friendly URL for downstream API consumers.', max_length=200, unique=True), 23 | ), 24 | migrations.AlterField( 25 | model_name='shortenedlink', 26 | name='full_url', 27 | field=models.URLField(db_index=True, max_length=1000, unique=True), 28 | ), 29 | ] 30 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0010_auto_20180831_1815.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.5 on 2018-08-31 18:15 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0009_auto_20180831_1425'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='imagelist', 15 | name='slug', 16 | field=models.CharField(db_index=True, help_text='A unique identifier used to make a friendly URL for downstream API consumers.', max_length=200, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0011_auto_20181117_0029.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2018-11-17 00:29 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0010_auto_20180831_1815'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='image', 15 | name='perceptual_hash', 16 | ), 17 | migrations.AlterField( 18 | model_name='image', 19 | name='identifier', 20 | field=models.UUIDField(db_index=True, unique=True), 21 | ), 22 | migrations.AlterField( 23 | model_name='imagelist', 24 | name='images', 25 | field=models.ManyToManyField(help_text='A list of identifier keys corresponding to images.', related_name='lists', to='api.Image'), 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0012_auto_20190102_2012.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-02 20:12 2 | 3 | import django.contrib.postgres.fields.jsonb 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0011_auto_20181117_0029'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterUniqueTogether( 15 | name='usertags', 16 | unique_together=set(), 17 | ), 18 | migrations.RemoveField( 19 | model_name='usertags', 20 | name='image', 21 | ), 22 | migrations.RemoveField( 23 | model_name='usertags', 24 | name='tag', 25 | ), 26 | migrations.RemoveField( 27 | model_name='usertags', 28 | name='user', 29 | ), 30 | migrations.AlterField( 31 | model_name='image', 32 | name='tags', 33 | field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True), 34 | ), 35 | 
migrations.AlterField( 36 | model_name='image', 37 | name='watermarked', 38 | field=models.NullBooleanField(), 39 | ), 40 | migrations.DeleteModel( 41 | name='UserTags', 42 | ), 43 | ] 44 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0013_contentprovider.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 18:51 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0012_auto_20190102_2012'), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='ContentProvider', 15 | fields=[ 16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 17 | ('created_on', models.DateTimeField(auto_now_add=True)), 18 | ('updated_on', models.DateTimeField(auto_now=True)), 19 | ('provider_identifier', models.CharField(max_length=50)), 20 | ('provider_name', models.CharField(max_length=250)), 21 | ('domain_name', models.CharField(max_length=500)), 22 | ('filter_content', models.BooleanField(default=False)), 23 | ], 24 | options={ 25 | 'db_table': 'content_provider', 26 | }, 27 | ), 28 | ] 29 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0014_auto_20190122_1853.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 18:53 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0013_contentprovider'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='contentprovider', 15 | name='provider_name', 16 | field=models.CharField(max_length=250, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0015_contentprovider_notes.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 19:04 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0014_auto_20190122_1853'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='contentprovider', 15 | name='notes', 16 | field=models.TextField(default=''), 17 | preserve_default=False, 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0016_auto_20190122_1908.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 19:08 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0015_contentprovider_notes'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='contentprovider', 15 | name='created_on', 16 | field=models.DateTimeField(), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0017_remove_contentprovider_updated_on.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 19:16 2 | 3 | from django.db import migrations 4 | 5 | 
6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0016_auto_20190122_1908'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='contentprovider', 15 | name='updated_on', 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0018_auto_20190122_1917.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.8 on 2019-01-22 19:17 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0017_remove_contentprovider_updated_on'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='contentprovider', 15 | name='notes', 16 | field=models.TextField(null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0019_auto_20190307_1830.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.0.13 on 2019-03-07 18:30 2 | 3 | from django.conf import settings 4 | from django.db import migrations, models 5 | import django.db.models.deletion 6 | import oauth2_provider.generators 7 | import oauth2_provider.validators 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | dependencies = [ 13 | migrations.swappable_dependency(settings.AUTH_USER_MODEL), 14 | ('api', '0018_auto_20190122_1917'), 15 | ] 16 | 17 | operations = [ 18 | migrations.CreateModel( 19 | name='OAuth2Registration', 20 | fields=[ 21 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 22 | ('name', models.CharField(help_text='A unique human-readable name for your application or project requiring access to the CC Catalog API.', max_length=150, unique=True)), 23 | ('description', models.CharField(help_text='A description of what you are trying to achieve with your project using the API. 
Please provide as much detail as possible!', max_length=10000)), 24 | ('email', models.EmailField(help_text='A valid email that we can reach you at if we have any questions about your use case or data consumption.', max_length=254)), 25 | ], 26 | ), 27 | migrations.CreateModel( 28 | name='ThrottledApplication', 29 | fields=[ 30 | ('id', models.BigAutoField(primary_key=True, serialize=False)), 31 | ('client_id', models.CharField(db_index=True, default=oauth2_provider.generators.generate_client_id, max_length=100, unique=True)), 32 | ('redirect_uris', models.TextField(blank=True, help_text='Allowed URIs list, space separated', validators=[oauth2_provider.validators.validate_uris])), 33 | ('client_type', models.CharField(choices=[('confidential', 'Confidential'), ('public', 'Public')], max_length=32)), 34 | ('authorization_grant_type', models.CharField(choices=[('authorization-code', 'Authorization code'), ('implicit', 'Implicit'), ('password', 'Resource owner password-based'), ('client-credentials', 'Client credentials')], max_length=32)), 35 | ('client_secret', models.CharField(blank=True, db_index=True, default=oauth2_provider.generators.generate_client_secret, max_length=255)), 36 | ('name', models.CharField(blank=True, max_length=255)), 37 | ('skip_authorization', models.BooleanField(default=False)), 38 | ('created', models.DateTimeField(auto_now_add=True)), 39 | ('updated', models.DateTimeField(auto_now=True)), 40 | ('rate_limit_model', models.CharField(choices=[('standard', 'standard'), ('enhanced', 'enhanced')], default='standard', max_length=20)), 41 | ('user', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='api_throttledapplication', to=settings.AUTH_USER_MODEL)), 42 | ], 43 | options={ 44 | 'abstract': False, 45 | }, 46 | ), 47 | migrations.AlterField( 48 | model_name='image', 49 | name='foreign_identifier', 50 | field=models.CharField(blank=True, db_index=True, help_text='The identifier provided by the upstream source.', max_length=1000, null=True, unique=True), 51 | ), 52 | migrations.AlterField( 53 | model_name='image', 54 | name='foreign_landing_url', 55 | field=models.CharField(blank=True, help_text='The landing page of the work.', max_length=1000, null=True), 56 | ), 57 | migrations.AlterField( 58 | model_name='image', 59 | name='identifier', 60 | field=models.UUIDField(db_index=True, help_text='A unique identifier that we assign on ingestion.', unique=True), 61 | ), 62 | migrations.AlterField( 63 | model_name='image', 64 | name='provider', 65 | field=models.CharField(blank=True, db_index=True, help_text='The content provider, e.g. Flickr, 500px...', max_length=80, null=True), 66 | ), 67 | migrations.AlterField( 68 | model_name='image', 69 | name='source', 70 | field=models.CharField(blank=True, db_index=True, help_text='The source of the data, meaning a particular dataset. 
Source and provider can be different: the Google Open Images dataset is source=openimages., but provider=Flickr.', max_length=80, null=True), 71 | ), 72 | migrations.AlterField( 73 | model_name='image', 74 | name='thumbnail', 75 | field=models.URLField(blank=True, help_text='The thumbnail for the image, if any.', max_length=1000, null=True), 76 | ), 77 | migrations.AlterField( 78 | model_name='image', 79 | name='url', 80 | field=models.URLField(help_text='The actual URL to the image.', max_length=1000, unique=True), 81 | ), 82 | ] 83 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0020_auto_20190918_1954.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.4 on 2019-09-18 19:54 2 | 3 | from django.conf import settings 4 | from django.db import migrations, models 5 | import django.db.models.deletion 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('api', '0019_auto_20190307_1830'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AddField( 16 | model_name='throttledapplication', 17 | name='verified', 18 | field=models.BooleanField(default=False), 19 | ), 20 | migrations.AlterField( 21 | model_name='image', 22 | name='identifier', 23 | field=models.UUIDField(db_index=True, help_text='Our unique identifier for a CC work.', unique=True), 24 | ), 25 | migrations.CreateModel( 26 | name='OAuth2Verification', 27 | fields=[ 28 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 29 | ('email', models.EmailField(max_length=254)), 30 | ('code', models.CharField(db_index=True, max_length=256)), 31 | ('associated_application', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.OAUTH2_PROVIDER_APPLICATION_MODEL)), 32 | ], 33 | ), 34 | ] 35 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0021_deletedimages.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.4 on 2020-01-16 18:56 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0020_auto_20190918_1954'), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='DeletedImages', 15 | fields=[ 16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 17 | ('created_on', models.DateTimeField(auto_now_add=True)), 18 | ('updated_on', models.DateTimeField(auto_now=True)), 19 | ('deleted_id', models.UUIDField(db_index=True, help_text='The identifier of the deleted image.', unique=True)), 20 | ('deleting_user', models.CharField(help_text='The user that deleted the image.', max_length=50)), 21 | ], 22 | options={ 23 | 'abstract': False, 24 | }, 25 | ), 26 | ] 27 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0022_reportimage.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-12 19:54 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0021_deletedimages'), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='ImageReport', 15 | fields=[ 16 | ('id', models.AutoField(auto_created=True, 
primary_key=True, serialize=False, verbose_name='ID')), 17 | ('identifier', models.UUIDField()), 18 | ('reason', models.CharField(choices=[('mature', 'mature'), ('dmca', 'dmca'), ('other', 'other')], max_length=10)), 19 | ('description', models.TextField(max_length=500)), 20 | ], 21 | options={ 22 | 'db_table': 'nsfw_reports', 23 | }, 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0023_auto_20200423_1526.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-23 15:26 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0022_reportimage'), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='MatureImages', 15 | fields=[ 16 | ('identifier', models.UUIDField(primary_key=True, serialize=False, unique=True)), 17 | ('created_on', models.DateTimeField(auto_now_add=True)), 18 | ], 19 | ), 20 | migrations.RemoveField( 21 | model_name='deletedimages', 22 | name='deleted_id', 23 | ), 24 | migrations.RemoveField( 25 | model_name='deletedimages', 26 | name='deleting_user', 27 | ), 28 | migrations.RemoveField( 29 | model_name='deletedimages', 30 | name='id', 31 | ), 32 | migrations.AddField( 33 | model_name='deletedimages', 34 | name='identifier', 35 | field=models.UUIDField(default='c9341bce-6e8b-4d6a-b098-29f5ca1253ac', help_text='The identifier of the deleted image.', primary_key=True, serialize=False, unique=True), 36 | preserve_default=False, 37 | ), 38 | migrations.AddField( 39 | model_name='imagereport', 40 | name='status', 41 | field=models.CharField(choices=[('pending', 'pending'), ('confirmed', 'confirmed'), ('rejected', 'rejected')], default='pending', max_length=20), 42 | ), 43 | migrations.AlterField( 44 | model_name='imagereport', 45 | name='description', 46 | field=models.TextField(blank=True, max_length=500, null=True), 47 | ), 48 | migrations.AlterField( 49 | model_name='imagereport', 50 | name='reason', 51 | field=models.CharField(choices=[('mature', 'mature'), ('dmca', 'dmca'), ('other', 'other')], max_length=20), 52 | ), 53 | ] 54 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0024_auto_20200423_1601.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-23 16:01 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0023_auto_20200423_1526'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameModel( 14 | old_name='DeletedImages', 15 | new_name='DeletedImage', 16 | ), 17 | migrations.RenameModel( 18 | old_name='MatureImages', 19 | new_name='MatureImage', 20 | ), 21 | migrations.AlterField( 22 | model_name='imagereport', 23 | name='status', 24 | field=models.CharField(choices=[('pending_review', 'pending_review'), ('mature_filter', 'mature_filter'), ('deindex', 'deindex'), ('do_nothing', 'do_nothing')], default='pending', max_length=20), 25 | ), 26 | ] 27 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0025_auto_20200429_1401.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-29 14:01 2 | 3 | from django.db import migrations, models 4 | 
5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0024_auto_20200423_1601'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='imagereport', 15 | name='status', 16 | field=models.CharField(choices=[('pending_review', 'pending_review'), ('mature_filtered', 'mature_filtered'), ('deindexed', 'deindexed'), ('no_action', 'no_action')], default='pending_review', max_length=20), 17 | ), 18 | migrations.DeleteModel( 19 | name='ImageTags', 20 | ), 21 | ] 22 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0026_imagereport_date.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-05-15 17:44 2 | 3 | from django.db import migrations, models 4 | import django.utils.timezone 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0025_auto_20200429_1401'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AddField( 15 | model_name='imagereport', 16 | name='date', 17 | field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now), 18 | preserve_default=False, 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0027_auto_20200515_2037.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-05-15 20:37 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0026_imagereport_date'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='imagereport', 15 | old_name='date', 16 | new_name='created_at', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/0028_sourcelogo.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.13 on 2020-06-30 19:36 2 | 3 | from django.db import migrations, models 4 | import django.db.models.deletion 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0027_auto_20200515_2037'), 11 | ] 12 | 13 | operations = [ 14 | migrations.CreateModel( 15 | name='SourceLogo', 16 | fields=[ 17 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 18 | ('image', models.ImageField(upload_to='')), 19 | ('source', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='api.ContentProvider')), 20 | ], 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/migrations/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/serializers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/serializers/__init__.py -------------------------------------------------------------------------------- 
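Migrations 0022 through 0027 above collectively define the image report moderation model. As a minimal sketch of the resulting lifecycle, assuming a Django shell against these models (the UUID below is a placeholder borrowed from migration 0023, and all field values are invented):

from cccatalog.api.models import ImageReport

report = ImageReport.objects.create(
    identifier='c9341bce-6e8b-4d6a-b098-29f5ca1253ac',  # reported image UUID
    reason='mature',        # choices: 'mature', 'dmca', 'other'
    description='User-supplied explanation of the report.',
)
# New reports default to the 'pending_review' status (migration 0025).
assert report.status == 'pending_review'

# A moderator then resolves the report to one of the terminal statuses:
# 'mature_filtered', 'deindexed', or 'no_action'.
report.status = 'mature_filtered'
report.save()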
/cccatalog-api/cccatalog/api/serializers/link_serializers.py: -------------------------------------------------------------------------------- 1 | import redlock 2 | import os 3 | import logging as log 4 | from rest_framework.serializers import ModelSerializer, Serializer, URLField, \ 5 | ValidationError 6 | from cccatalog.api.controllers.link_controller import get_next_shortened_path 7 | from cccatalog.api.models import ShortenedLink 8 | from cccatalog import settings 9 | from urllib.parse import urlparse 10 | from rest_framework import serializers 11 | 12 | # Create a lock inside of Redis to ensure that multiple server workers don't 13 | # try to create the same shortened URL. 14 | __parsed_redis_url = urlparse(settings.CACHES['locks']['LOCATION']) 15 | __host, __port = __parsed_redis_url.netloc.split(':') 16 | __db_num = __parsed_redis_url.path[1] if __parsed_redis_url.path else None 17 | __password = os.environ.get("REDIS_PASSWORD") 18 | # Clients make a single attempt to acquire the lock (1 second retry delay). 19 | url_lock = redlock.Redlock( 20 | [{"host": __host, "port": __port, "db": __db_num, "password": __password}], 21 | retry_count=1, retry_delay=1000 22 | ) 23 | 24 | 25 | class ShortenedLinkResponseSerializer(Serializer): 26 | shortened_url = URLField( 27 | help_text="A shortened link on the `shares.cc` domain." 28 | ) 29 | 30 | 31 | class ShortenedLinkSerializer(ModelSerializer): 32 | """ 33 | A single shortened URL, mapping a shortened path at shares.cc to a full 34 | URL elsewhere on the CC Catalog platform. 35 | """ 36 | full_url = serializers.URLField( 37 | max_length=1000, 38 | help_text="The URL to shorten. Only URLs on the CC Catalog domain will" 39 | " be accepted. Valid domains: `{}`. " 40 | "Valid paths: `{}`".format(settings.SHORT_URL_WHITELIST, 41 | settings.SHORT_URL_PATH_WHITELIST) 42 | ) 43 | 44 | class Meta: 45 | model = ShortenedLink 46 | fields = ('full_url',) 47 | 48 | def validate_full_url(self, value): 49 | parsed_url = urlparse(value) 50 | url = '{url.netloc}'.format(url=parsed_url) 51 | path = '{url.path}'.format(url=parsed_url) 52 | if url not in settings.SHORT_URL_WHITELIST: 53 | raise ValidationError( 54 | "You can only create a short URL to items inside of the CC " 55 | "Catalog. Pointing to other domains is not allowed." 56 | ) 57 | 58 | found_allowed_path = False 59 | for allowed_path in settings.SHORT_URL_PATH_WHITELIST: 60 | if path.startswith(allowed_path): 61 | found_allowed_path = True 62 | 63 | if not found_allowed_path: 64 | raise ValidationError( 65 | "Illegal path. Valid paths must start with {}".format( 66 | str(settings.SHORT_URL_PATH_WHITELIST) 67 | ) 68 | ) 69 | 70 | return value 71 | 72 | def save(self): 73 | two_seconds_ms = 1000 * 2 74 | lock = url_lock.lock('unique_url_lock', ttl=two_seconds_ms) 75 | shortened_path = None 76 | if lock: 77 | try: 78 | last_url = str( 79 | ShortenedLink 80 | .objects 81 | .latest(field_name='created_on') 82 | .shortened_path 83 | ) 84 | except ShortenedLink.DoesNotExist: 85 | # No URLs exist. Create the first one.
86 | last_url = None 87 | 88 | shortened_path = get_next_shortened_path(last_url) 89 | full_url = self.validated_data['full_url'] 90 | shortened_link_instance = ShortenedLink( 91 | shortened_path=shortened_path, 92 | full_url=full_url 93 | ) 94 | shortened_link_instance.save() 95 | url_lock.unlock(lock) 96 | return shortened_path 97 | else: 98 | log.error('Failed to acquire URL lock.') 99 | return shortened_path 100 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/serializers/list_serializers.py: -------------------------------------------------------------------------------- 1 | from rest_framework import serializers 2 | from cccatalog.api.models import ImageList, Image 3 | from cccatalog.api.serializers.image_serializers import ImageDetailSerializer 4 | import secrets 5 | 6 | 7 | class ImageListBaseSerializer(serializers.ModelSerializer): 8 | images = serializers.SlugRelatedField( 9 | many=True, 10 | queryset=Image.objects.all(), 11 | slug_field='identifier', 12 | help_text='A list of unique IDs.' 13 | ) 14 | 15 | class Meta: 16 | fields = ('images',) 17 | 18 | def validate_images(self, image_keys): 19 | if len(image_keys) > 500: 20 | raise serializers.ValidationError( 21 | "Only up to 500 images can be added to a list." 22 | ) 23 | return image_keys 24 | 25 | 26 | class ImageListCreateSerializer(ImageListBaseSerializer): 27 | """ 28 | Responsible for parsing POST JSON body and persisting to the database. 29 | """ 30 | lookup_field = 'id' 31 | id = serializers.ReadOnlyField() 32 | auth = serializers.ReadOnlyField() 33 | 34 | class Meta: 35 | model = ImageList 36 | fields = ('id', 'title', 'images', 'auth') 37 | 38 | def save(self): 39 | title = self.validated_data['title'] 40 | images = self.validated_data['images'] 41 | auth = secrets.token_urlsafe(48) 42 | image_list = ImageList(title=title, auth=auth) 43 | image_list.save() 44 | image_list.images.add(*images) 45 | 46 | return image_list 47 | 48 | 49 | class ImageListResponseSerializer(serializers.Serializer): 50 | """ 51 | Return a list of fully resolved images. 52 | """ 53 | lookup_field = 'slug' 54 | id = serializers.ReadOnlyField() 55 | title = serializers.CharField() 56 | images = ImageDetailSerializer(many=True) 57 | 58 | 59 | class ImageListUpdateSerializer(ImageListBaseSerializer): 60 | lookup_field = 'id' 61 | 62 | class Meta: 63 | model = ImageList 64 | fields = ('images',) 65 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/serializers/oauth2_serializers.py: -------------------------------------------------------------------------------- 1 | from rest_framework import serializers 2 | from cccatalog.api.models import OAuth2Registration 3 | from oauth2_provider.models import Application 4 | 5 | 6 | class OAuth2RegistrationSerializer(serializers.ModelSerializer): 7 | class Meta: 8 | model = OAuth2Registration 9 | fields = ('name', 'description', 'email') 10 | 11 | 12 | class OAuth2RegistrationSuccessful(serializers.ModelSerializer): 13 | name = serializers.CharField( 14 | help_text="A unique human-readable name for your application " 15 | "or project requiring access to the CC Catalog API." 16 | ) 17 | client_id = serializers.CharField( 18 | help_text="A publicly exposed string used by CC Catalog API " 19 | "to identify the application." 20 | ) 21 | client_secret = serializers.CharField( 22 | help_text="A private string that authenticates the identity " 23 | "of the application to the CC Catalog API." 
24 | ) 25 | 26 | class Meta: 27 | model = Application 28 | fields = ('name', 'client_id', 'client_secret') 29 | 30 | 31 | class OAuth2KeyInfo(serializers.Serializer): 32 | requests_this_minute = serializers.IntegerField( 33 | help_text="The number of requests your key has performed in the last " 34 | "minute.", 35 | allow_null=True 36 | ) 37 | requests_today = serializers.IntegerField( 38 | help_text="The number of requests your key has performed in the last " 39 | "day.", 40 | allow_null=True 41 | ) 42 | rate_limit_model = serializers.CharField( 43 | help_text="The type of rate limit applied to your key. Can be " 44 | "'standard' or 'enhanced'; enhanced users enjoy higher rate " 45 | "limits than their standard key counterparts. Contact " 46 | "Creative Commons if you need a higher rate limit." 47 | ) 48 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/ccrel.py: -------------------------------------------------------------------------------- 1 | from libxmp.consts import XMP_NS_CC, XMP_NS_XMP_Rights, XMP_NS_XMP 2 | import libxmp 3 | import io 4 | import os 5 | import uuid 6 | 7 | """ 8 | Tools for embedding Creative Commons Rights Expression Language (ccREL) data 9 | into files using Extensible Metadata Platform (XMP). 10 | 11 | This implementation is specifically for embedding ccREL inside of images, but it 12 | could be extended to handle other types of content. 13 | 14 | For more information, see the ccREL W3 standard [0]. 15 | [0] https://www.w3.org/Submission/ccREL/ 16 | """ 17 | 18 | 19 | def embed_xmp_bytes(image: io.BytesIO, work_properties): 20 | """ 21 | Given a file-like `io.BytesIO` object, embed ccREL metadata inside of it. 22 | For our purposes, we assume that the file is an image. 23 | 24 | :param image: A BytesIO representation of an image. 25 | :param work_properties: A dictionary with keys 'license_url' and 26 | 'attribution'; 'creator' and 'work_landing_page' are optional (but highly 27 | recommended). 28 | :return: An `io.BytesIO` copy of the image with the XMP metadata embedded. 29 | """ 30 | 31 | # libxmp only works with actual file locations on the disk. To work around 32 | # this limitation, rather than embedding the metadata directly into the 33 | # `io.BytesIO` object, we have to use a temporary file and then convert it 34 | # back. 35 | # https://github.com/python-xmp-toolkit/python-xmp-toolkit/issues/46 36 | filename = '/tmp/{}'.format(uuid.uuid4()) 37 | with open(filename, 'w+b') as xmp_temp: 38 | xmp_temp.write(image.getvalue()) 39 | xmp_temp.flush() 40 | xmpfile = libxmp.XMPFiles(file_path=xmp_temp.name, open_forupdate=True) 41 | 42 | # Set CC rights.
43 | xmp = xmpfile.get_xmp() 44 | xmp.register_namespace(XMP_NS_CC, 'cc') 45 | xmp.set_property(XMP_NS_CC, 'license', work_properties['license_url']) 46 | if 'creator' in work_properties: 47 | if not xmp.does_property_exist(XMP_NS_CC, 'attributionName'): 48 | xmp.set_property( 49 | XMP_NS_CC, 'attributionName', work_properties['creator'] 50 | ) 51 | if 'work_landing_page' in work_properties: 52 | if not xmp.does_property_exist(XMP_NS_CC, 'attributionURL'): 53 | xmp.set_property( 54 | XMP_NS_CC, 55 | 'attributionURL', 56 | work_properties['work_landing_page'] 57 | ) 58 | xmp.register_namespace(XMP_NS_XMP, 'xmp') 59 | if 'identifier' in work_properties: 60 | if not xmp.does_property_exist(XMP_NS_XMP, 'Identifier'): 61 | xmp.set_property( 62 | XMP_NS_XMP, 63 | 'Identifier', 64 | work_properties['identifier'] 65 | ) 66 | # Set generic XMP rights. 67 | xmp.register_namespace(XMP_NS_XMP_Rights, 'xmpRights') 68 | if not xmp.does_property_exist(XMP_NS_XMP_Rights, 'Marked'): 69 | xmp.set_property_bool(XMP_NS_XMP_Rights, 'Marked', True) 70 | if not xmp.does_property_exist(XMP_NS_XMP_Rights, 'UsageTerms'): 71 | usage = work_properties['attribution'] 72 | xmp.set_property(XMP_NS_XMP_Rights, 'UsageTerms', usage) 73 | xmpfile.put_xmp(xmp) 74 | xmpfile.close_file() 75 | 76 | with open(filename, 'r+b') as xmpfile: 77 | file_with_xmp = io.BytesIO(xmpfile.read()) 78 | os.remove(filename) 79 | return file_with_xmp 80 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/dead_link_mask.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import List 3 | from django_redis import get_redis_connection 4 | from deepdiff import DeepHash 5 | from elasticsearch_dsl import Search 6 | 7 | # 3 hours (in seconds) 8 | DEAD_LINK_MASK_TTL = 60 * 60 * 3 9 | 10 | 11 | def get_query_hash(s: Search) -> str: 12 | """ 13 | Generates a deterministic Murmur3 or SHA256 hash from the serialized Search 14 | object using DeepHash so that two Search objects with the same content will 15 | produce the same hash. 16 | 17 | :param s: Search object to be serialized and hashed. 18 | :return: Serialized Search object hash. 19 | """ 20 | serialized_search_obj = s.to_dict() 21 | serialized_search_obj.pop('from', None) 22 | serialized_search_obj.pop('size', None) 23 | deep_hash = DeepHash(serialized_search_obj)[serialized_search_obj] 24 | return deep_hash 25 | 26 | 27 | def get_query_mask(query_hash: str) -> List[int]: 28 | """ 29 | Fetches an existing query mask for a given query hash 30 | or returns an empty one. 31 | 32 | :param query_hash: Unique value for a particular query. 33 | :return: Boolean mask as a list of integers (0 or 1). 34 | """ 35 | redis = get_redis_connection("default") 36 | key = f'{query_hash}:dead_link_mask' 37 | return list(map(int, redis.lrange(key, 0, -1))) 38 | 39 | 40 | def save_query_mask(query_hash: str, mask: List): 41 | """ 42 | Saves a query mask to redis. 43 | 44 | :param mask: Boolean mask as a list of integers (0 or 1). 45 | :param query_hash: Unique value to be used as key.
46 | """ 47 | redis_pipe = get_redis_connection("default").pipeline() 48 | key = f'{query_hash}:dead_link_mask' 49 | 50 | redis_pipe.delete(key) 51 | redis_pipe.rpush(key, *mask) 52 | redis_pipe.expire(key, DEAD_LINK_MASK_TTL) 53 | redis_pipe.execute() 54 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | from rest_framework import status 2 | from rest_framework.response import Response 3 | """ 4 | Override the presentation of ValidationErrors, which are deeply nested and 5 | difficult to parse. 6 | 7 | Note that error 500 pages are not handled here; they are generated by the 8 | production web server configuration, and not reproducible locally. 9 | """ 10 | 11 | 12 | def parse_value_errors(errors): 13 | fields = ['q'] 14 | messages = [errors.args[0].info['error']['root_cause'][0]['reason']] 15 | return fields, messages 16 | 17 | 18 | def parse_non_value_errors(errors): 19 | fields = [f for f in errors] 20 | messages = [] 21 | for _field in errors: 22 | error = errors[_field] 23 | for e in error: 24 | messages.append(e) 25 | 26 | # Don't return "non field errors" in deprecation exceptions. There is no 27 | # other way to recover the affected fields other than parsing the error. 28 | if fields == ['non_field_errors']: 29 | split_error = list(messages) 30 | field_idx = ' '.join(messages).index('Parameter') + 1 31 | fields = [split_error[field_idx].replace("'", '')][0] 32 | 33 | return fields, messages 34 | 35 | 36 | def input_error_response(errors): 37 | if isinstance(errors, ValueError): 38 | fields, messages = parse_value_errors(errors) 39 | else: 40 | fields, messages = parse_non_value_errors(errors) 41 | 42 | detail = "Invalid input given for fields." 43 | for i, _ in enumerate(fields): 44 | detail += f" '{fields[i]}' -> {messages[i]}" 45 | 46 | return Response( 47 | status=status.HTTP_400_BAD_REQUEST, 48 | data={ 49 | 'error': 'InputError', 50 | 'detail': detail, 51 | 'fields': fields 52 | } 53 | ) 54 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/fonts/SourceCodePro-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/fonts/SourceCodePro-Bold.ttf -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/oauth2_helper.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import logging 3 | from oauth2_provider.models import AccessToken 4 | from cccatalog.api.models import ThrottledApplication 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | def get_token_info(token: str): 10 | """ 11 | Recover an OAuth2 application client ID and rate limit model from an access 12 | token. 13 | 14 | :param token: An OAuth2 access token. 
15 | :return: If the token is valid, return the client ID associated with the 16 | token, rate limit model, and email verification status as a tuple; else 17 | return (None, None, None). 18 | """ 19 | try: 20 | token = AccessToken.objects.get(token=token) 21 | except AccessToken.DoesNotExist: 22 | return None, None, None 23 | if token.expires >= dt.datetime.now(token.expires.tzinfo): 24 | try: 25 | application = ThrottledApplication.objects.get(accesstoken=token) 26 | client_id = str(application.client_id) 27 | rate_limit_model = application.rate_limit_model 28 | verified = application.verified 29 | except ThrottledApplication.DoesNotExist: 30 | log.warning( 31 | 'Failed to find application associated with access token.' 32 | ) 33 | client_id = None 34 | rate_limit_model = None 35 | verified = None 36 | return client_id, rate_limit_model, verified 37 | else: 38 | log.warning('Rejected expired access token.') 39 | return None, None, None 40 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/scheduled_tasks.py: -------------------------------------------------------------------------------- 1 | from django_cron import CronJobBase, Schedule 2 | from django_redis import get_redis_connection 3 | from django.core.exceptions import ObjectDoesNotExist 4 | from cccatalog.api.models import Image 5 | import logging as log 6 | import time 7 | """ 8 | Cron-like tasks run at a set interval. `python3 manage.py runcrons` will 9 | execute any scheduled tasks. This is intended to run on all instances of the 10 | server. 11 | 12 | Even though there may be multiple instances of the server running, a job is 13 | guaranteed to execute only once. Jobs are not run unless they can acquire a 14 | lock inside the cache (shared by all instances of cccatalog-api). 15 | """ 16 | model_name_to_instance = { 17 | 'Image': Image 18 | } 19 | 20 | 21 | class SaveCachedTrafficStats(CronJobBase): 22 | """ 23 | Traffic statistics (view count, API usage) are stored in Redis for fast 24 | updates and retrieval. In order to ensure durability of statistics and 25 | minimize cache memory requirements, they are intermittently replicated to 26 | the database in small batches and subsequently evicted from the cache if 27 | they exceed a certain age. Recently updated view data is replicated but not 28 | evicted. 29 | 30 | After traffic statistics have been stored in the database, they are 31 | replicated to Elasticsearch by es-syncer and used to compute trending views.
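View counts are kept in the 'traffic_stats' Redis connection under '<model>:<id>' keys, with last-access timestamps tracked in the 'model-last-accessed' sorted set; `do` and `_save_views_to_db` below consume exactly those structures.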
32 | """ 33 | RUN_EVERY_MINS = 20 34 | schedule = Schedule(run_every_mins=RUN_EVERY_MINS) 35 | # Number of failures before notification is sent 36 | MIN_NUM_FAILURES = 5 37 | code = 'cccatalog.api.utils.scheduled_tasks.SaveCachedTrafficStats' 38 | 39 | def do(self): 40 | log.info('Starting view count persistence job') 41 | redis = get_redis_connection('traffic_stats') 42 | one_day_ago = time.time() - 60 * 60 * 24 43 | last_save_time = time.time() - (self.RUN_EVERY_MINS * 60) 44 | old_view_data = redis.zrangebyscore( 45 | 'model-last-accessed', '-inf', one_day_ago 46 | ) 47 | recent_view_data = redis.zrangebyscore( 48 | 'model-last-accessed', last_save_time, 'inf' 49 | ) 50 | self._save_views_to_db(old_view_data, evict_from_cache=True) 51 | redis.zremrangebyscore('model-last-accessed', '-inf', one_day_ago) 52 | self._save_views_to_db(recent_view_data) 53 | log.info('Saved cached traffic stats') 54 | 55 | @staticmethod 56 | def _save_views_to_db(view_keys, evict_from_cache=False): 57 | if not view_keys: 58 | return 59 | redis = get_redis_connection('traffic_stats') 60 | view_keys = [x.decode('utf-8') for x in view_keys] 61 | for obj in view_keys: 62 | model_name, model_id = obj.split(':') 63 | if model_name in model_name_to_instance: 64 | model = model_name_to_instance[model_name] 65 | try: 66 | instance = model.objects.get(id=model_id) 67 | instance.view_count = redis.get(obj) 68 | instance.save(update_fields=['view_count']) 69 | except ObjectDoesNotExist: 70 | log.warning('Tried to save views of non-existent instance.') 71 | else: 72 | log.warning( 73 | 'Tried to persist views of non-existent model ' + model_name 74 | ) 75 | if evict_from_cache: 76 | redis.delete(*view_keys) 77 | log.info('Saved ' + str(view_keys)) 78 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/throttle.py: -------------------------------------------------------------------------------- 1 | from rest_framework.throttling import SimpleRateThrottle 2 | import logging 3 | from cccatalog.api.utils.oauth2_helper import get_token_info 4 | from django_redis import get_redis_connection 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | def _from_internal_network(ip): 10 | redis = get_redis_connection('default') 11 | return redis.sismember('ip-whitelist', ip) 12 | 13 | 14 | class AnonRateThrottle(SimpleRateThrottle): 15 | """ 16 | Limits the rate of API calls that may be made by a anonymous users. 17 | 18 | The IP address of the request will be used as the unique cache key. 19 | """ 20 | scope = 'anon' 21 | 22 | def get_cache_key(self, request, view): 23 | if _from_internal_network(self.get_ident(request)): 24 | return None 25 | # Do not throttle requests with a valid access token. 
26 | if request.auth: 27 | client_id, _, verified = get_token_info(str(request.auth)) 28 | if client_id and verified: 29 | return None 30 | 31 | return self.cache_format % { 32 | 'scope': self.scope, 33 | 'ident': self.get_ident(request) 34 | } 35 | 36 | 37 | class PostRequestThrottler(AnonRateThrottle): 38 | rate = '30/day' 39 | 40 | 41 | class BurstRateThrottle(AnonRateThrottle): 42 | scope = 'anon_burst' 43 | 44 | 45 | class SustainedRateThrottle(AnonRateThrottle): 46 | scope = 'anon_sustained' 47 | 48 | 49 | class TenPerDay(AnonRateThrottle): 50 | rate = '10/day' 51 | 52 | 53 | class OneThousandPerMinute(AnonRateThrottle): 54 | rate = '1000/min' 55 | 56 | 57 | class OnePerSecond(AnonRateThrottle): 58 | rate = '1/second' 59 | 60 | 61 | class OAuth2IdThrottleRate(SimpleRateThrottle): 62 | """ 63 | Limits the rate of API calls that may be made by a given user's OAuth2 64 | client ID. Can be configured to apply to either standard or enhanced 65 | API keys. 66 | """ 67 | scope = 'oauth2_client_credentials' 68 | applies_to_rate_limit_model = 'standard' 69 | 70 | def get_cache_key(self, request, view): 71 | if _from_internal_network(self.get_ident(request)): 72 | return None 73 | # Find the client ID associated with the access token. 74 | auth = str(request.auth) 75 | client_id, rate_limit_model, verified = get_token_info(auth) 76 | if client_id and rate_limit_model == self.applies_to_rate_limit_model: 77 | ident = client_id 78 | else: 79 | # Don't throttle invalid tokens; leave that to the anonymous throttlers. 80 | # Tokens with a different rate limit model have their own classes. 81 | return None 82 | 83 | return self.cache_format % { 84 | 'scope': self.scope, 85 | 'ident': ident 86 | } 87 | 88 | 89 | class OAuth2IdThrottleSustainedRate(OAuth2IdThrottleRate): 90 | applies_to_rate_limit_model = 'standard' 91 | scope = 'oauth2_client_credentials_sustained' 92 | 93 | 94 | class OAuth2IdThrottleBurstRate(OAuth2IdThrottleRate): 95 | applies_to_rate_limit_model = 'standard' 96 | scope = 'oauth2_client_credentials_burst' 97 | 98 | 99 | class EnhancedOAuth2IdThrottleSustainedRate(OAuth2IdThrottleRate): 100 | applies_to_rate_limit_model = 'enhanced' 101 | scope = 'enhanced_oauth2_client_credentials_sustained' 102 | 103 | 104 | class EnhancedOAuth2IdThrottleBurstRate(OAuth2IdThrottleRate): 105 | applies_to_rate_limit_model = 'enhanced' 106 | scope = 'enhanced_oauth2_client_credentials_burst' 107 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/utils/validate_images.py: -------------------------------------------------------------------------------- 1 | import time 2 | import grequests 3 | import logging 4 | from django_redis import get_redis_connection 5 | from cccatalog.api.utils.dead_link_mask import get_query_mask, save_query_mask 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | def validate_images(query_hash, start_slice, results, image_urls): 11 | """ 12 | Make sure images exist before we display them. Treat redirects as broken 13 | links since 99% of the time the redirect leads to a generic "not found" 14 | placeholder. 15 | 16 | Results are cached in redis and shared amongst all API servers in the 17 | cluster. 18 | """ 19 | if not image_urls: 20 | return 21 | start_time = time.time() 22 | # Pull matching images from the cache.
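# Statuses are cached under 'valid:<url>' keys as integer HTTP status codes
# (-1 for timeouts); mget yields None for URLs that have never been checked.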
23 | redis = get_redis_connection("default") 24 | cache_prefix = 'valid:' 25 | cached_statuses = redis.mget([cache_prefix + url for url in image_urls]) 26 | cached_statuses = [ 27 | int(b.decode('utf-8')) 28 | if b is not None else None for b in cached_statuses 29 | ] 30 | # Anything that isn't in the cache needs to be validated via HEAD request. 31 | to_verify = {} 32 | for idx, url in enumerate(image_urls): 33 | if cached_statuses[idx] is None: 34 | to_verify[url] = idx 35 | reqs = ( 36 | grequests.head(u, allow_redirects=False, timeout=2, verify=False) 37 | for u in to_verify.keys() 38 | ) 39 | verified = grequests.map(reqs, exception_handler=_validation_failure) 40 | # Cache newly verified image statuses. 41 | to_cache = {} 42 | for idx, url in enumerate(to_verify.keys()): 43 | cache_key = cache_prefix + url 44 | if verified[idx]: 45 | status = verified[idx].status_code 46 | else: 47 | # Response didn't arrive in time. Try again later. 48 | status = -1 49 | to_cache[cache_key] = status 50 | 51 | thirty_minutes = 60 * 30 52 | twenty_four_hours_seconds = 60 * 60 * 24 53 | pipe = redis.pipeline() 54 | if len(to_cache) > 0: 55 | pipe.mset(to_cache) 56 | for key, status in to_cache.items(): 57 | # Cache successful links for a day, and broken links for 120 days. 58 | if status == 200: 59 | pipe.expire(key, twenty_four_hours_seconds) 60 | elif status == -1: 61 | # Content provider failed to respond; try again in a short interval 62 | pipe.expire(key, thirty_minutes) 63 | else: 64 | pipe.expire(key, twenty_four_hours_seconds * 120) 65 | pipe.execute() 66 | 67 | # Merge newly verified results with cached statuses 68 | for idx, url in enumerate(to_verify): 69 | cache_idx = to_verify[url] 70 | if verified[idx] is not None: 71 | cached_statuses[cache_idx] = verified[idx].status_code 72 | else: 73 | cached_statuses[cache_idx] = -1 74 | 75 | # Create a new dead link mask 76 | new_mask = [1] * len(results) 77 | # Delete broken images from the search results response. 78 | for idx, _ in enumerate(cached_statuses): 79 | del_idx = len(cached_statuses) - idx - 1 80 | status = cached_statuses[del_idx] 81 | if status == 429 or status == 403: 82 | log.warning( 83 | 'Image validation failed due to rate limiting or blocking. ' 84 | 'Affected URL: {}'.format(image_urls[del_idx]) 85 | ) 86 | elif status != 200: 87 | log.info( 88 | 'Deleting broken image with ID {} from results.' 89 | .format(results[del_idx]['identifier']) 90 | ) 91 | del results[del_idx] 92 | new_mask[del_idx] = 0 93 | 94 | # Merge and cache the new mask 95 | mask = get_query_mask(query_hash) 96 | if mask: 97 | new_mask = mask[:start_slice] + new_mask 98 | save_query_mask(query_hash, new_mask) 99 | 100 | end_time = time.time() 101 | log.info('Validated images in {}s'.format(end_time - start_time)) 102 | 103 | 104 | def _validation_failure(request, exception): 105 | log.warning('Failed to validate image!
Reason: {}'.format(exception)) 106 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/views/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/views/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/api/views/link_views.py: -------------------------------------------------------------------------------- 1 | from django.http import HttpResponsePermanentRedirect 2 | from cccatalog.api.models import ShortenedLink 3 | from rest_framework.generics import GenericAPIView 4 | from rest_framework.views import APIView 5 | from rest_framework.decorators import throttle_classes 6 | from cccatalog.api.utils.throttle import PostRequestThrottler 7 | from cccatalog.api.serializers.link_serializers import ShortenedLinkSerializer 8 | 9 | from cccatalog import settings 10 | from rest_framework.response import Response 11 | from rest_framework import serializers 12 | from drf_yasg.utils import swagger_auto_schema 13 | 14 | 15 | class _LinkCreatedResponse(serializers.Serializer): 16 | shortened_url = serializers.URLField() 17 | 18 | 19 | class CreateShortenedLink(GenericAPIView): 20 | serializer_class = ShortenedLinkSerializer 21 | swagger_schema = None 22 | 23 | @throttle_classes([PostRequestThrottler]) 24 | def post(self, request, format=None): 25 | """ Create a shortened URL. Only domains within the CC Catalog platform 26 | will be accepted. The `full_url` must be a whitelisted endpoint.""" 27 | full_url = request.data['full_url'] 28 | serialized = ShortenedLinkSerializer(data={'full_url': full_url}) 29 | if not serialized.is_valid(): 30 | return Response( 31 | status=400, 32 | data=serialized.errors 33 | ) 34 | 35 | try: 36 | existing_path = ShortenedLink \ 37 | .objects \ 38 | .get(full_url=full_url) \ 39 | .shortened_path 40 | shortened_url = settings.ROOT_SHORTENING_URL + '/' + existing_path 41 | except ShortenedLink.DoesNotExist: 42 | shortened_path = serialized.save() 43 | shortened_url = settings.ROOT_SHORTENING_URL + '/' + shortened_path 44 | 45 | return Response( 46 | status=200, 47 | data={ 48 | 'shortened_url': shortened_url 49 | } 50 | ) 51 | 52 | 53 | class ResolveShortenedLink(APIView): 54 | swagger_schema = None 55 | 56 | def get(self, request, path, format=None): 57 | """ 58 | Given a shortened URL path, such as 'zb3k0', resolve the full URL 59 | and redirect the caller.
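Responds with a 301 permanent redirect on success, or 404 if the path is not registered.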
60 | """ 61 | try: 62 | link_instance = ShortenedLink.objects.get(shortened_path=path) 63 | except ShortenedLink.DoesNotExist: 64 | return Response( 65 | status=404, 66 | data='Not Found' 67 | ) 68 | full_url = link_instance.full_url 69 | return HttpResponsePermanentRedirect(full_url) 70 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/custom_auto_schema.py: -------------------------------------------------------------------------------- 1 | from drf_yasg import openapi 2 | from drf_yasg.utils import ( 3 | filter_none, force_real_str, force_serializer_instance, get_consumes, 4 | get_produces, guess_response_status, merge_params, no_body, 5 | param_list_to_odict 6 | ) 7 | from drf_yasg.inspectors import SwaggerAutoSchema 8 | 9 | 10 | class CustomAutoSchema(SwaggerAutoSchema): 11 | 12 | def get_operation(self, operation_keys=None): 13 | operation_keys = operation_keys or self.operation_keys 14 | 15 | consumes = self.get_consumes() 16 | produces = self.get_produces() 17 | 18 | body = self.get_request_body_parameters(consumes) 19 | query = self.get_query_parameters() 20 | parameters = body + query 21 | parameters = filter_none(parameters) 22 | parameters = self.add_manual_parameters(parameters) 23 | 24 | operation_id = self.get_operation_id(operation_keys) 25 | summary, description = self.get_summary_and_description() 26 | security = self.get_security() 27 | assert security is None or isinstance(security, list), \ 28 | "security must be a list of security requirement objects" 29 | deprecated = self.is_deprecated() 30 | tags = self.get_tags(operation_keys) 31 | 32 | responses = self.get_responses() 33 | 34 | return openapi.Operation( 35 | operation_id=operation_id, 36 | description=force_real_str(description), 37 | summary=force_real_str(summary), 38 | responses=responses, 39 | parameters=parameters, 40 | consumes=consumes, 41 | produces=produces, 42 | tags=tags, 43 | security=security, 44 | deprecated=deprecated, 45 | **{'x-code-samples': self.overrides.get('code_examples')} 46 | ) 47 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/scripts/api_load_testing/locustfile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import uuid 4 | from locust import HttpLocust, TaskSet, task 5 | 6 | 7 | class BrowseResults(TaskSet): 8 | @task(30) 9 | def view_image(self): 10 | if self.parent.results: 11 | image_id = random.choice(self.parent.results)['id'] 12 | self.client.get("/image/{}".format(image_id), name="/image/[id]") 13 | 14 | @task(10) 15 | def favorite_images(self): 16 | pass 17 | if self.parent.results: 18 | list_length = random.choice([2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 9]) 19 | selected_images = self.parent.results[0:list_length] 20 | ids = [image['id'] for image in selected_images] 21 | self.client.post("/list", 22 | {"title": "Load test" + str(ids), "images": ids}) 23 | 24 | @task(10) 25 | def shorten_link(self): 26 | _unique = str(uuid.uuid4()) 27 | image_link = "http://api-dev.creativecommons.engineering/list/{}"\ 28 | .format(_unique) 29 | self.client.post("/link", {"full_url": image_link}) 30 | 31 | 32 | class UserBehavior(TaskSet): 33 | tasks = {BrowseResults: 8} 34 | 35 | def __init__(self, parent): 36 | self.results = None 37 | self.query = None 38 | with open("./common_english_words.txt", "r") as f: 39 | self.common_words = f.read().splitlines() 40 | super().__init__(parent) 41 | 42 | @task(1000) 43 
| def search(self): 44 | query_length = random.choice([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 5]) 45 | query = [random.choice(self.common_words) for _ in range(query_length)] 46 | query = ','.join(query) 47 | self.query = query 48 | response = self.client.get( 49 | "/image/search?q={}".format(query), 50 | name="/image/search?q=[keywords]" 51 | ) 52 | self.results = json.loads(response.content.decode("utf-8"))['results'] 53 | 54 | 55 | class SearchUser(HttpLocust): 56 | task_set = UserBehavior 57 | min_wait = 3000 58 | max_wait = 9000 59 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/scripts/migration/migrate_lists.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import requests 3 | import json 4 | import logging as log 5 | """ 6 | Tools for migrating legacy lists from CC Search Beta to the CC Catalog platform. 7 | """ 8 | 9 | 10 | def import_lists_to_catalog(parsed_lists): 11 | success = 0 12 | errors = [] 13 | for _list in parsed_lists: 14 | _list = parsed_lists[_list] 15 | payload = { 16 | 'title': _list['title'], 17 | 'images': _list['images'] 18 | } 19 | response = requests.post( 20 | 'http://api.creativecommons.engineering/list', 21 | data=payload 22 | ) 23 | if 300 > response.status_code >= 200: 24 | json_response = json.loads(response.text) 25 | new_url = json_response['url'] 26 | success += 1 27 | print(_list['email'], new_url, _list['title'], sep='||') 28 | else: 29 | # A handful of lists from the legacy application are empty, which 30 | # isn't accepted in the new API. Skip over them and log it. 31 | errors.append((_list['title'], response.text)) 32 | continue 33 | log.info('Migrated {} lists successfully'.format(success)) 34 | if errors: 35 | log.error("The following errors occurred:") 36 | for error in errors: 37 | log.error(error) 38 | 39 | 40 | if __name__ == '__main__': 41 | with open('csvs/prod/lists.csv', 'r') as lists, \ 42 | open('csvs/prod/list_images.csv', 'r') as list_images, \ 43 | open('csvs/prod/users.csv', 'r') as users: 44 | lists = csv.DictReader(lists) 45 | list_images = csv.DictReader(list_images) 46 | users = csv.DictReader(users) 47 | 48 | # Compile all of the data required to migrate the lists and find the 49 | # emails of their owners. 50 | users_dict = {row['id']: row['email'] for row in users} 51 | lists_dict = {} 52 | for row in lists: 53 | if row['owner_id'] == '': 54 | continue 55 | lists_dict[row['id']] = { 56 | 'email': users_dict[row['owner_id']], 57 | 'title': row['title'], 58 | 'images': [] 59 | } 60 | for row in list_images: 61 | if row['list_id'] in lists_dict: 62 | lists_dict[row['list_id']]['images'].append(row['image_id']) 63 | 64 | # Use the API to migrate the lists. 65 | import_lists_to_catalog(lists_dict) 66 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/scripts/thumbnail_load_test/locustfile.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import gevent.queue 3 | import gevent.pool 4 | import grequests 5 | import statistics 6 | import json 7 | import datetime 8 | from locust import HttpLocust, TaskSet, task, between 9 | from collections import defaultdict 10 | """ 11 | Swarm the API server with async requests for thumbnails. Requires `url_dump.csv` 12 | in the same directory as the script. It is intentionally omitted from source 13 | control. 
14 | 15 | The format of the csv is: 16 | url,provider 17 | https://example.com,exampleprovider 18 | http://secondexample.com,secondprovider 19 | . . . 20 | 21 | To prepare the server for testing: 22 | - Ensure that the hardware allocation matches production. 23 | - Disable referer origin limiting in the imageproxy server. 24 | - Empty the S3 thumbnail cache bucket. 25 | 26 | To run the test: 27 | `locust` 28 | Open the web interface and start a test with the desired number of workers. 29 | Watch the console for updates on the progress of the test and the number of 30 | successful vs failed thumbnails. 31 | 32 | Optionally rerun the test after the cache has been warmed up. 33 | """ 34 | PROXY_URL = "https://api-dev.creativecommons.engineering/t/600/" 35 | 36 | url_queue = gevent.queue.Queue() 37 | provider_counts = defaultdict(int) 38 | url_provider = {} 39 | thumb_statuses = defaultdict(int) 40 | statuses_by_provider = {} 41 | response_times = [] 42 | 43 | 44 | with open('url_dump.csv') as urls_csv: 45 | reader = csv.reader(urls_csv) 46 | for row in reader: 47 | if row[0] == 'url': 48 | continue 49 | url = row[0] 50 | provider = row[1] 51 | url_queue.put((url, provider)) 52 | url_provider[url] = provider 53 | provider_counts[provider] += 1 54 | 55 | 56 | def print_current_stats(): 57 | """ 58 | Re-compute and print current thumbnail statistics. 59 | """ 60 | mean_response_time = statistics.mean(response_times) 61 | failed = 0 62 | successful = 0 63 | for status in thumb_statuses: 64 | num_statuses = thumb_statuses[status] 65 | if status >= 300 and status != 404: 66 | failed += num_statuses 67 | else: 68 | successful += num_statuses 69 | 70 | out = { 71 | 'timestamp': str(datetime.datetime.now()), 72 | 'mean_response_time': mean_response_time, 73 | 'successful': successful, 74 | 'failed': failed, 75 | 'statuses': thumb_statuses, 76 | 'provider_statuses': statuses_by_provider 77 | } 78 | print(json.dumps(out)) 79 | 80 | 81 | def record_stats(responses, providers): 82 | for idx, resp in enumerate(responses): 83 | response_times.append(resp.elapsed.total_seconds()) 84 | thumb_statuses[resp.status_code] += 1 85 | provider = providers[idx] 86 | if provider not in statuses_by_provider: 87 | statuses_by_provider[provider] = defaultdict(int) 88 | statuses_by_provider[provider][resp.status_code] += 1 89 | 90 | 91 | class ThumbTask(TaskSet): 92 | @task 93 | def load_thumbs(self): 94 | reqs = [] 95 | providers = [] 96 | for _ in range(20): 97 | base_url, provider = url_queue.get() 98 | providers.append(provider) 99 | proxied_url = f'{PROXY_URL}{base_url}' 100 | reqs.append(grequests.get(proxied_url)) 101 | thumb_responses = grequests.map(reqs) 102 | record_stats(thumb_responses, providers) 103 | print_current_stats() 104 | 105 | 106 | class ThumbLocust(HttpLocust): 107 | """ 108 | Load a page's worth of thumbnails every 3 to 6 seconds. 109 | """ 110 | wait_time = between(3, 6) 111 | task_set = ThumbTask 112 | -------------------------------------------------------------------------------- /cccatalog-api/cccatalog/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for cccatalog project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.0/howto/deployment/wsgi/ 8 | """ 9 | from gevent import monkey; monkey.patch_all() 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | from wsgi_basic_auth import BasicAuth 14 | 15 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cccatalog.settings") 16 | 17 | application = get_wsgi_application() 18 | application = BasicAuth(application) 19 | -------------------------------------------------------------------------------- /cccatalog-api/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from gevent import monkey 5 | monkey.patch_all() 6 | 7 | if __name__ == "__main__": 8 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cccatalog.settings") 9 | try: 10 | from django.core.management import execute_from_command_line 11 | except ImportError as exc: 12 | raise ImportError( 13 | "Couldn't import Django. Are you sure it's installed and " 14 | "available on your PYTHONPATH environment variable? Did you " 15 | "forget to activate a virtual environment?" 16 | ) from exc 17 | execute_from_command_line(sys.argv) 18 | -------------------------------------------------------------------------------- /cccatalog-api/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | DJANGO_SETTINGS_MODULE = cccatalog.settings 3 | -------------------------------------------------------------------------------- /cccatalog-api/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | while [[ "$(curl --insecure -s -o /dev/null -w '%{http_code}' http://es:9200/)" != "200" ]] 6 | do 7 | echo "Waiting for Elasticsearch connection..." 8 | sleep 2 9 | done 10 | 11 | exec "$@" 12 | -------------------------------------------------------------------------------- /cccatalog-api/test/README: -------------------------------------------------------------------------------- 1 | 1. Set environment variable INTEGRATION_TEST_URL to the instance you would like to test. Defaults to localhost. 2 | 2. Run `pytest -s` 3 | -------------------------------------------------------------------------------- /cccatalog-api/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/test/__init__.py -------------------------------------------------------------------------------- /cccatalog-api/test/api_live_search_qa.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | """ 5 | Tests to run against a live instance of CC Search with a significant (10M+) 6 | number of records. Quality of search rankings can be affected by the number of 7 | documents in the search index, so toy examples with five or six documents 8 | do not accurately model relevance at scale. 
9 | """ 10 | 11 | API_URL = 'https://api-dev.creativecommons.engineering' 12 | 13 | 14 | def _phrase_in_tags(tags, term): 15 | for tag in tags: 16 | if 'name' in tag: 17 | if tag['name'] == term: 18 | return True 19 | return False 20 | 21 | 22 | def _phrase_in_title(title, term): 23 | return term in title 24 | 25 | 26 | def test_phrase_relevance(): 27 | """ 28 | If I search for "home office", the top results ought to have the phrase 29 | 'home office' in the tags or title. 30 | """ 31 | search_term = 'home office' 32 | response = requests.get( 33 | API_URL + '/image/search?q={}'.format(search_term), 34 | verify=False 35 | ) 36 | assert response.status_code == 200 37 | parsed = json.loads(response.text) 38 | first_result = parsed['results'][0] 39 | assert ( 40 | _phrase_in_tags(first_result['tags'], search_term) or 41 | _phrase_in_title(first_result['title'], search_term) 42 | ) 43 | -------------------------------------------------------------------------------- /cccatalog-api/test/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Local environments don't have valid certificates; suppress this warning. 3 | export PYTHONWARNINGS="ignore:Unverified HTTPS request" 4 | export INTEGRATION_TEST_URL="http://localhost:8000" 5 | DJANGO_SETTINGS_MODULE='cccatalog.settings' PYTHONPATH=. DJANGO_SECRET_KEY='ny#b__$f6ry4wy8oxre97&-68u_0lk3gw(z=d40_dxey3zw0v1' DJANGO_DATABASE_NAME='openledger' DJANGO_DATABASE_USER='deploy' DJANGO_DATABASE_PASSWORD='deploy' DJANGO_DATABASE_HOST='localhost' REDIS_HOST='localhost' pytest -s --disable-pytest-warnings test/v1_integration_test.py 6 | succeeded=$? 7 | if [ $succeeded != 0 ]; then 8 | echo 'Tests failed. Full system logs: ' 9 | docker-compose logs 10 | fi 11 | exit $succeeded 12 | -------------------------------------------------------------------------------- /cccatalog-api/test/search_qa_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pprint 3 | import json 4 | import pytest 5 | from enum import Enum 6 | from .api_live_integration_test import API_URL 7 | 8 | """ 9 | Perform some basic tests to ensure that search rankings work as anticipated. 
10 | """ 11 | 12 | 13 | class QAScores(Enum): 14 | TARGET = 1 15 | LESS_RELEVANT = 2 16 | NOT_RELEVANT = 3 17 | 18 | 19 | @pytest.mark.skip(reason="This test is nondeterministic") 20 | def test_phrase_relevance(): 21 | res = requests.get( 22 | "{}/image/search?q=home office&filter_dead=false&qa=true" 23 | .format(API_URL) 24 | ) 25 | parsed = json.loads(res.text) 26 | pprint.pprint(parsed) 27 | assert int(parsed['results'][0]['id']) == QAScores.TARGET.value 28 | assert int(parsed['results'][1]['id']) < QAScores.NOT_RELEVANT.value 29 | assert int(parsed['results'][-1]['id']) != QAScores.NOT_RELEVANT.value 30 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | db: 4 | image: postgres:10.3-alpine 5 | ports: 6 | - "5432:5432" 7 | environment: 8 | POSTGRES_DB: "openledger" 9 | POSTGRES_USER: "deploy" 10 | POSTGRES_PASSWORD: "deploy" 11 | POSTGRES_HOST: "0.0.0.0" 12 | healthcheck: 13 | test: "pg_isready -U deploy -d openledger" 14 | 15 | thumbs: 16 | image: willnorris/imageproxy 17 | ports: 18 | - "8222:8222" 19 | command: ["-addr", "0.0.0.0:8222"] 20 | 21 | upstream_db: 22 | image: postgres:10.3-alpine 23 | ports: 24 | - "5433:5432" 25 | environment: 26 | POSTGRES_DB: "openledger" 27 | POSTGRES_USER: "deploy" 28 | POSTGRES_PASSWORD: "deploy" 29 | POSTGRES_HOST: "0.0.0.0" 30 | healthcheck: 31 | test: "pg_isready -U deploy -d openledger" 32 | 33 | es: 34 | image: docker.elastic.co/elasticsearch/elasticsearch:7.1.0 35 | ports: 36 | - "9200:9200" 37 | environment: 38 | # disable XPack 39 | # https://www.elastic.co/guide/en/elasticsearch/reference/5.3/docker.html#_security_note 40 | - xpack.security.enabled=false 41 | - discovery.type=single-node 42 | healthcheck: 43 | test: ["CMD-SHELL", "curl -si -XGET 'localhost:9200/_cluster/health?pretty' | grep -qE 'yellow|green'"] 44 | interval: 10s 45 | timeout: 60s 46 | retries: 10 47 | ulimits: 48 | nofile: 49 | soft: 65536 50 | hard: 65536 51 | 52 | web: 53 | build: ./cccatalog-api/ 54 | image: cccatalog_api 55 | command: python manage.py runserver 0.0.0.0:8000 56 | container_name: cccatalog-api_web_1 57 | volumes: 58 | - ./cccatalog-api:/cccatalog-api 59 | ports: 60 | - "8000:8000" 61 | - "4444:4444" 62 | depends_on: 63 | - db 64 | - es 65 | environment: 66 | - DJANGO_DATABASE_NAME=openledger 67 | - DJANGO_DATABASE_USER=deploy 68 | - DJANGO_DATABASE_PASSWORD=deploy 69 | - DJANGO_DATABASE_HOST=db 70 | - UPSTREAM_DATABASE_HOST=upstream_db 71 | - PYTHONUNBUFFERED=0 72 | - DJANGO_DEBUG_ENABLED=True 73 | - ELASTICSEARCH_URL=es 74 | - ELASTICSEARCH_PORT=9200 75 | - DISABLE_GLOBAL_THROTTLING=True 76 | - ROOT_SHORTENING_URL=localhost:8000 77 | - THUMBNAIL_PROXY_URL=http://thumbs:8222 78 | - DJANGO_SECRET_KEY=ny#b__$$f6ry4wy8oxre97&-68u_0lk3gw(z=d40_dxey3zw0v1 79 | - AWS_SECRET_ACCESS_KEY 80 | - AWS_ACCESS_KEY_ID 81 | stdin_open: true 82 | tty: true 83 | 84 | cache: 85 | image: redis:4.0.10 86 | container_name: cccatalog-api_cache_1 87 | ports: 88 | - "6379:6379" 89 | 90 | ingestion-server: 91 | build: ./ingestion_server/ 92 | command: bash -c 'sleep 20 && supervisord -c config/supervisord.conf' 93 | ports: 94 | - "8001:8001" 95 | depends_on: 96 | - db 97 | - es 98 | - indexer-worker 99 | volumes: 100 | - ./ingestion_server:/ingestion-server 101 | environment: 102 | PYTHONUNBUFFERED: "0" 103 | ELASTICSEARCH_URL: 'es' 104 | ELASTICSEARCH_PORT: "9200" 105 | DATABASE_HOST: 'db' 106 | DATABASE_USER: 
'deploy' 107 | DATABASE_PASSWORD: 'deploy' 108 | DATABASE_NAME: 'openledger' 109 | DATABASE_PORT: '5432' 110 | UPSTREAM_DB_HOST: 'upstream_db' 111 | UPSTREAM_DB_PORT: 5432 112 | DB_BUFFER_SIZE: '100000' 113 | COPY_TABLES: 'image' 114 | SYNCER_POLL_INTERVAL: '60' 115 | stdin_open: true 116 | tty: true 117 | 118 | indexer-worker: 119 | build: 120 | context: ./ingestion_server/ 121 | dockerfile: Dockerfile-worker 122 | container_name: indexer-worker 123 | ports: 124 | - "8002:8002" 125 | depends_on: 126 | - db 127 | - es 128 | volumes: 129 | - ./ingestion_server:/ingestion-server 130 | environment: 131 | PYTHONUNBUFFERED: "0" 132 | ELASTICSEARCH_URL: 'es' 133 | ELASTICSEARCH_PORT: "9200" 134 | DATABASE_HOST: 'db' 135 | DATABASE_USER: 'deploy' 136 | DATABASE_PASSWORD: 'deploy' 137 | DATABASE_NAME: 'openledger' 138 | DATABASE_PORT: '5432' 139 | UPSTREAM_DB_HOST: 'upstream_db' 140 | UPSTREAM_DB_PORT: 5432 141 | DB_BUFFER_SIZE: '100000' 142 | COPY_TABLES: 'image' 143 | SYNCER_POLL_INTERVAL: '60' 144 | stdin_open: true 145 | tty: true 146 | 147 | analytics: 148 | build: ./analytics/ 149 | image: analytics 150 | container_name: cccatalog-api_analytics_1 151 | ports: 152 | - "8090:8090" 153 | environment: 154 | DATABASE_CONN: 'postgres+psycopg2://deploy:deploy@db/openledger' 155 | -------------------------------------------------------------------------------- /ingestion_server/.dockerignore: -------------------------------------------------------------------------------- 1 | venv 2 | venv2 3 | es-venv 4 | -------------------------------------------------------------------------------- /ingestion_server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ENV PYTHONUNBUFFERED 1 4 | 5 | RUN groupadd --system supervisord && useradd --system --gid supervisord supervisord 6 | 7 | RUN apt-get update \ 8 | && apt-get install -y supervisor \ 9 | && mkdir -p /var/log/supervisord/ \ 10 | && chown -R supervisord:supervisord /var/log/supervisord 11 | 12 | # Install Python dependency management tools 13 | RUN pip install --upgrade pip \ 14 | && pip install --upgrade setuptools \ 15 | && pip install --upgrade pipenv 16 | 17 | # Copy all files into the container 18 | COPY . /ingestion_server/ 19 | WORKDIR /ingestion_server 20 | RUN chown -R supervisord:supervisord /ingestion_server 21 | ENV PYTHONPATH=$PYTHONPATH:/ingestion_server/ 22 | 23 | # Install the dependencies system-wide 24 | # TODO: Use build args to avoid installing dev dependencies in production 25 | RUN pipenv install --deploy --system --dev 26 | USER supervisord 27 | EXPOSE 8001 28 | CMD ["supervisord", "-c", "/ingestion_server/config/supervisord.conf"] 29 | -------------------------------------------------------------------------------- /ingestion_server/Dockerfile-worker: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ENV PYTHONUNBUFFERED 1 4 | 5 | # Install Python dependency management tools 6 | RUN pip install --upgrade pip \ 7 | && pip install --upgrade setuptools \ 8 | && pip install --upgrade pipenv 9 | 10 | # Copy all files into the container 11 | COPY .
/ingestion_server/ 12 | WORKDIR /ingestion_server 13 | ENV PYTHONPATH=$PYTHONPATH:/ingestion_server/ 14 | 15 | RUN pipenv install --deploy --system --dev 16 | EXPOSE 8002 17 | CMD gunicorn indexer_worker:api -b 0.0.0.0:8002 --reload --access-logfile '-' --error-logfile '-' --chdir ./ingestion_server/ 18 | -------------------------------------------------------------------------------- /ingestion_server/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | remote-pdb = "*" 8 | ipython = "*" 9 | pipdeptree = "*" 10 | pycodestyle = "*" 11 | 12 | [packages] 13 | aws-requests-auth = "*" 14 | bottle = "*" 15 | elasticsearch-dsl = "==7.0.0" 16 | falcon = "*" 17 | gunicorn = "*" 18 | psycopg2-binary = "*" 19 | PyYAML = "*" 20 | boto3 = "*" 21 | filelock = "*" 22 | pytest = "*" 23 | tldextract = "*" 24 | -------------------------------------------------------------------------------- /ingestion_server/README.md: -------------------------------------------------------------------------------- 1 | # Ingestion Server 2 | 3 | ## Introduction 4 | Ingestion Server is a small private API for copying data from an upstream source and loading it into the CC Catalog API. This is a two-step process: 5 | 1. The data is copied from the upstream CC Catalog database and into the downstream API database. 6 | 2. Data from the downstream API database gets indexed in Elasticsearch. 7 | 8 | For example, let's say that I want to download and index all new images. 9 | `http POST ingestion.private:8001/task <<<'{"model": "image", "action": "INGEST_UPSTREAM"}'` 10 | 11 | Performance is dependent on the size of the target Elasticsearch cluster, database throughput, and bandwidth available to the ingestion server. The primary bottleneck is indexing to Elasticsearch. 12 | 13 | ## How Indexing Works 14 | ![How indexing works](https://github.com/creativecommons/cccatalog-api/blob/master/ingestion_server/howitworks.png) 15 | 16 | ## Safety and security considerations 17 | The server has been designed to fail gracefully in the event of network interruptions, full disks, etc. If a task fails to complete successfully, the whole process is rolled back with zero impact to production. 18 | 19 | The server is designed to be run in a private network only. You must not expose the private Ingestion Server API to the public internet. 20 | 21 | ## Running the tests 22 | This runs a simulated environment in Docker containers and ensures that ingestion is working properly. 23 | ``` 24 | python3 -m venv venv 25 | source venv/bin/activate 26 | python test/integration_tests.py 27 | ``` 28 | Set `ENABLE_DETAILED_LOGS` to `True` if more information is needed about the failing test. 29 | 30 | ## Configuration 31 | All configuration is performed through environment variables. 32 | 33 | #### Required 34 | * **COPY_TABLES**: A comma-separated list of database tables that should be replicated to Elasticsearch. **Example**: image,text 35 | 36 | * ELASTICSEARCH_URL 37 | * ELASTICSEARCH_PORT 38 | * DATABASE_HOST 39 | * DATABASE_USER 40 | * DATABASE_PASSWORD 41 | * DATABASE_NAME 42 | * DATABASE_PORT 43 | 44 | #### Optional 45 | * **DB_BUFFER_SIZE**: The number of rows to load from the database at once while replicating. **Default**: 100000 46 | 47 | To access a cluster on AWS, define these additional environment variables.
48 | * AWS_ACCESS_KEY_ID 49 | * AWS_SECRET_ACCESS_KEY 50 | * AWS_REGION 51 | 52 | ## Mapping database tables to Elasticsearch 53 | In order to synchronize a given table to Elasticsearch, the following requirements must be met: 54 | * The database table must have an autoincrementing integer primary key named `id`. 55 | * A SyncableDocType must be defined in `ingestion_server/elasticsearch_models.py`. The SyncableDocType must implement the function `database_row_to_elasticsearch_model`. 56 | * The table name must be mapped to the corresponding Elasticsearch SyncableDocType in the `database_table_to_elasticsearch_model` map. 57 | 58 | Example from `ingestion_server/elasticsearch_models.py`: 59 | ``` 60 | class Image(SyncableDocType): 61 | title = Text(analyzer="english") 62 | identifier = Text(index="not_analyzed") 63 | creator = Text() 64 | creator_url = Text(index="not_analyzed") 65 | tags = Text(multi=True) 66 | created_on = Date() 67 | url = Text(index="not_analyzed") 68 | thumbnail = Text(index="not_analyzed") 69 | provider = Text(index="not_analyzed") 70 | source = Text(index="not_analyzed") 71 | license = Text(index="not_analyzed") 72 | license_version = Text(index="not_analyzed") 73 | foreign_landing_url = Text(index="not_analyzed") 74 | meta_data = Nested() 75 | 76 | class Meta: 77 | index = 'image' 78 | 79 | @staticmethod 80 | def database_row_to_elasticsearch_doc(row, schema): 81 | return Image( 82 | pg_id=row[schema['id']], 83 | title=row[schema['title']], 84 | identifier=row[schema['identifier']], 85 | creator=row[schema['creator']], 86 | creator_url=row[schema['creator_url']], 87 | tags=row[schema['tags_list']], 88 | created_on=row[schema['created_on']], 89 | url=row[schema['url']], 90 | thumbnail=row[schema['thumbnail']], 91 | provider=row[schema['provider']], 92 | source=row[schema['source']], 93 | license=row[schema['license']], 94 | license_version=row[schema['license_version']], 95 | foreign_landing_url=row[schema['foreign_landing_url']], 96 | meta_data=row[schema['meta_data']], 97 | ) 98 | 99 | 100 | # Table name -> Elasticsearch model 101 | database_table_to_elasticsearch_model = { 102 | 'image': Image 103 | } 104 | ``` 105 | -------------------------------------------------------------------------------- /ingestion_server/config/supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | logfile=/var/log/supervisord/supervisord.log 3 | childlogdir=/var/log/supervisord/ 4 | logfile_maxbytes=50MB 5 | logfile_backups=5 6 | loglevel=info 7 | pidfile=/tmp/supervisord.pid 8 | nodaemon=true 9 | 10 | [program:ingestion-server] 11 | directory=/ingestion_server 12 | command=/bin/bash -c 'gunicorn api:api -b 0.0.0.0:8001 --reload --chdir ./ingestion_server/ --timeout 120' 13 | user=supervisord 14 | autostart=true 15 | autorestart=true 16 | startretries=9999999999 17 | stdout_logfile=/dev/stdout 18 | stdout_logfile_maxbytes=0 19 | stderr_logfile=/dev/stdout 20 | stderr_logfile_maxbytes=0 21 | -------------------------------------------------------------------------------- /ingestion_server/howitworks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/howitworks.png -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/ingestion_server/__init__.py -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/api.py: -------------------------------------------------------------------------------- 1 | import falcon 2 | import logging 3 | import sys 4 | import json 5 | import uuid 6 | import time 7 | from urllib.parse import urlparse 8 | from multiprocessing import Value, Process 9 | from ingestion_server.tasks import TaskTracker, Task, TaskTypes 10 | from ingestion_server.state import worker_finished, clear_state 11 | import ingestion_server.indexer as indexer 12 | 13 | """ 14 | A small RPC API server for scheduling ingestion of upstream data and 15 | Elasticsearch indexing tasks. 16 | """ 17 | 18 | 19 | MODEL = 'model' 20 | ACTION = 'action' 21 | CALLBACK_URL = 'callback_url' 22 | SINCE_DATE = 'since_date' 23 | 24 | 25 | class TaskResource: 26 | def __init__(self, tracker: TaskTracker): 27 | self.tracker = tracker 28 | 29 | @staticmethod 30 | def _get_base_url(req): 31 | parsed = urlparse(req.url) 32 | return parsed.scheme + '://' + parsed.netloc 33 | 34 | @staticmethod 35 | def _validate_create_task(request): 36 | """ 37 | Validate an index creation task. 38 | :return: None if valid else a string containing an error message. 39 | """ 40 | if request == b'': 41 | return "Expected JSON request body but found nothing." 42 | request = json.loads(request.decode('utf-8')) 43 | if MODEL not in request: 44 | return "No model supplied in request body." 45 | if ACTION not in request: 46 | return "No action supplied in request body." 47 | if request[ACTION] not in [x.name for x in TaskTypes]: 48 | return "Invalid action." 49 | if request[ACTION] in TaskTypes and SINCE_DATE not in request: 50 | return "Received UPDATE request but no since_date." 51 | 52 | return None 53 | 54 | def on_post(self, req, resp): 55 | """ Create a task. """ 56 | raw_body = req.stream.read() 57 | request_error = self._validate_create_task(raw_body) 58 | if request_error: 59 | logging.warning( 60 | 'Invalid request made. Reason: {}'.format(request_error) 61 | ) 62 | resp.status = falcon.HTTP_400 63 | resp.media = { 64 | 'message': request_error 65 | } 66 | return 67 | body = json.loads(raw_body.decode('utf-8')) 68 | model = body[MODEL] 69 | action = body[ACTION] 70 | callback_url = None 71 | if CALLBACK_URL in body: 72 | callback_url = body[CALLBACK_URL] 73 | since_date = body[SINCE_DATE] if SINCE_DATE in body else None 74 | task_id = str(uuid.uuid4()) 75 | # Inject shared memory 76 | progress = Value('d', 0.0) 77 | finish_time = Value('d', 0.0) 78 | task = Task( 79 | model=model, 80 | task_type=TaskTypes[action], 81 | since_date=since_date, 82 | progress=progress, 83 | task_id=task_id, 84 | finish_time=finish_time, 85 | callback_url=callback_url 86 | ) 87 | task.start() 88 | task_id = self.tracker \ 89 | .add_task(task, task_id, action, progress, finish_time) 90 | base_url = self._get_base_url(req) 91 | status_url = base_url + '/task/{}'.format(task_id) 92 | # Give the task a moment to start so we can detect immediate failure. 93 | # TODO: Use IPC to detect if the job launched successfully instead 94 | # of giving it 100ms to crash. This is prone to race conditions. 
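# Note: if the child dies after this grace period, the status endpoint
# below still surfaces the failure: `active` flips to False while
# `percent_completed` stays below 100, which TaskStatus reports as an error.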
95 | time.sleep(0.1) 96 | if task.is_alive(): 97 | resp.status = falcon.HTTP_202 98 | resp.media = { 99 | 'message': 'Successfully scheduled task', 100 | 'task_id': task_id, 101 | 'status_check': status_url 102 | } 103 | return 104 | else: 105 | resp.status = falcon.HTTP_500 106 | resp.media = { 107 | 'message': 'Failed to schedule task due to an internal server ' 108 | 'error. Check scheduler logs.' 109 | } 110 | return 111 | 112 | def on_get(self, req, resp): 113 | """ List all indexing tasks. """ 114 | resp.media = self.tracker.list_task_statuses() 115 | 116 | 117 | class TaskStatus: 118 | def __init__(self, tracker: TaskTracker): 119 | self.tracker = tracker 120 | 121 | def on_get(self, req, resp, task_id): 122 | """ Check the status of a single task.""" 123 | task = self.tracker.id_task[task_id] 124 | active = task.is_alive() 125 | 126 | percent_completed = self.tracker.id_progress[task_id].value 127 | resp.media = { 128 | 'active': active, 129 | 'percent_completed': percent_completed, 130 | 'error': percent_completed < 100 and not active 131 | } 132 | 133 | 134 | class WorkerFinishedResource: 135 | """ 136 | For notifying ingestion server that an indexing worker has finished its 137 | task. 138 | """ 139 | def on_post(self, req, resp): 140 | target_index = worker_finished(str(req.remote_addr)) 141 | if target_index: 142 | logging.info( 143 | 'All indexer workers finished! Attempting to promote index ' 144 | f'{target_index}' 145 | ) 146 | f = indexer.TableIndexer.go_live 147 | p = Process(target=f, args=(target_index, 'image')) 148 | p.start() 149 | 150 | 151 | class StateResource: 152 | def on_delete(self, req, resp): 153 | """ 154 | Forget about the last scheduled indexing job. 155 | """ 156 | clear_state() 157 | 158 | 159 | def create_api(log=True): 160 | """ Create an instance of the Falcon API server. """ 161 | if log: 162 | root = logging.getLogger() 163 | root.setLevel(logging.DEBUG) 164 | handler = logging.StreamHandler(sys.stdout) 165 | handler.setLevel(logging.INFO) 166 | formatter = logging.Formatter( 167 | '%(asctime)s %(levelname)s %(filename)s:%(lineno)d - %(message)s' 168 | ) 169 | handler.setFormatter(formatter) 170 | root.addHandler(handler) 171 | 172 | _api = falcon.API() 173 | task_tracker = TaskTracker() 174 | task_resource = TaskResource(task_tracker) 175 | get_task_status = TaskStatus(task_tracker) 176 | _api.add_route('/task', task_resource) 177 | _api.add_route('/task/{task_id}', get_task_status) 178 | _api.add_route('/worker_finished', WorkerFinishedResource()) 179 | _api.add_route('/state', StateResource()) 180 | 181 | return _api 182 | 183 | 184 | api = create_api() 185 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/authority.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | """ 3 | Authority is a ranking from 0 to 100 (with 0 being least authoritative) 4 | indicating the pedigree of an image. Some examples of things that could impact 5 | authority: 6 | - The reputation of the website that posted an image 7 | - The popularity of the uploader on a social media site in terms of number of 8 | followers 9 | - Whether the uploader has uploaded images that have previously been flagged for 10 | copyright infringement. 11 | - etc 12 | 13 | The authority can be set from the catalog layer through the meta_data field 14 | or through the ingestion layer. 
As of now, we are only factoring in the 15 | reputation of the website, via a static hand-picked list based on experience 16 | and search result quality, with the intention of adding more sophisticated and 17 | tailored measures of authority later on. 18 | 19 | Also note that this is just one factor in rankings, and the magnitude of the 20 | boost can be adjusted at search-time. 21 | """ 22 | 23 | 24 | class AuthorityTypes(Enum): 25 | CURATED = auto() 26 | SOCIAL_MEDIA = auto() 27 | DEFAULT = auto() 28 | 29 | 30 | # We want to boost curated collections where each image has been vetted for 31 | # cultural significance. 32 | boost = { 33 | AuthorityTypes.CURATED: 90, 34 | AuthorityTypes.SOCIAL_MEDIA: 80, 35 | AuthorityTypes.DEFAULT: 85 36 | } 37 | authority_types = { 38 | 'flickr': AuthorityTypes.SOCIAL_MEDIA, 39 | 'behance': AuthorityTypes.SOCIAL_MEDIA, 40 | 'thingiverse': AuthorityTypes.SOCIAL_MEDIA, 41 | 'sketchfab': AuthorityTypes.SOCIAL_MEDIA, 42 | 'deviantart': AuthorityTypes.SOCIAL_MEDIA, 43 | 'thorvaldsensmuseum': AuthorityTypes.CURATED, 44 | 'svgsilh': AuthorityTypes.CURATED, 45 | 'smithsonian': AuthorityTypes.CURATED, 46 | 'rijksmuseum': AuthorityTypes.CURATED, 47 | 'museumsvictoria': AuthorityTypes.CURATED, 48 | 'met': AuthorityTypes.CURATED, 49 | 'mccordsmuseum': AuthorityTypes.CURATED, 50 | 'digitaltmuseum': AuthorityTypes.CURATED, 51 | 'clevelandmuseum': AuthorityTypes.CURATED, 52 | 'brooklynmuseum': AuthorityTypes.CURATED 53 | } 54 | 55 | 56 | def get_authority_boost(source): 57 | authority_boost = None 58 | if source in authority_types: 59 | authority_type = authority_types[source] 60 | if authority_type in boost: 61 | authority_boost = boost[authority_type] 62 | else: 63 | authority_boost = boost[AuthorityTypes.DEFAULT] 64 | return authority_boost 65 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/categorize.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | """ 4 | https://github.com/creativecommons/cccatalog-api/issues/340 5 | 6 | Attempt to figure out the image type (illustration, vector, photograph, or 7 | digitized artwork) based on its source and file extension. 8 | """ 9 | 10 | 11 | class Category(Enum): 12 | PHOTOGRAPH = auto() 13 | DIGITIZED_ARTWORK = auto() 14 | ILLUSTRATION = auto() 15 | 16 | 17 | # Map each source to a set of categories.
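# A sketch of how the mapping below is consumed by get_categories (defined
# at the bottom of this file); SVG files short-circuit to ILLUSTRATION
# regardless of source:
#   get_categories('svg', 'flickr')    -> ['ILLUSTRATION']
#   get_categories('jpg', 'met')       -> ['DIGITIZED_ARTWORK']
#   get_categories('jpg', 'notlisted') -> []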
18 | source_category = { 19 | '__default': [], 20 | 'thorvaldsenmuseum': [Category.DIGITIZED_ARTWORK], 21 | 'svgsilh': [Category.ILLUSTRATION], 22 | 'phylopic': [Category.ILLUSTRATION], 23 | 'floraon': [Category.PHOTOGRAPH], 24 | 'animaldiversity': [Category.PHOTOGRAPH], 25 | 'WoRMS': [Category.PHOTOGRAPH], 26 | 'clevelandmuseum': [Category.DIGITIZED_ARTWORK], 27 | 'CAPL': [Category.PHOTOGRAPH], 28 | 'sciencemuseum': [Category.PHOTOGRAPH], 29 | 'rijksmuseum': [Category.DIGITIZED_ARTWORK], 30 | 'museumsvictoria': [Category.DIGITIZED_ARTWORK], 31 | 'met': [Category.DIGITIZED_ARTWORK], 32 | 'mccordmuseum': [Category.DIGITIZED_ARTWORK], 33 | 'digitaltmuseum': [Category.DIGITIZED_ARTWORK], 34 | 'deviantart': [Category.DIGITIZED_ARTWORK], 35 | 'brooklynmuseum': [Category.DIGITIZED_ARTWORK] 36 | } 37 | 38 | 39 | def get_categories(extension, source): 40 | if extension and extension.lower() == 'svg': 41 | categories = [Category.ILLUSTRATION] 42 | elif source in source_category: 43 | categories = source_category[source] 44 | else: 45 | categories = source_category['__default'] 46 | return [x.name for x in categories] 47 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/distributed_reindex_scheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allocate hardware for performing a distributed index by spawning several 3 | indexer_worker instances on multiple machines. Then, partition the work across 4 | each worker, notifying each worker which partition to reindex through an HTTP 5 | request. 6 | 7 | Once the reindexing job is finished, each worker will notify Ingestion Server, 8 | which should then shut down the instances. 9 | """ 10 | import math 11 | import requests 12 | import logging as log 13 | import os 14 | import time 15 | import boto3 16 | import socket 17 | from ingestion_server.state import register_indexing_job 18 | 19 | 20 | client = boto3.client('ec2', region_name=os.getenv('AWS_REGION', 'us-east-1')) 21 | 22 | 23 | def schedule_distributed_index(db_conn, target_index): 24 | workers = _prepare_workers() 25 | registered = register_indexing_job(workers, target_index) 26 | if registered: 27 | _assign_work(db_conn, workers, target_index) 28 | 29 | 30 | def _assign_work(db_conn, workers, target_index): 31 | est_records_query = 'SELECT id FROM image ORDER BY id DESC LIMIT 1' 32 | with db_conn.cursor() as cur: 33 | cur.execute(est_records_query) 34 | estimated_records = cur.fetchone()[0] 35 | records_per_worker = math.floor(estimated_records / len(workers)) 36 | 37 | worker_url_template = 'http://{}:8002' 38 | # Wait for the workers to start. 39 | for worker in workers: 40 | worker_url = worker_url_template.format(worker) 41 | succeeded = _wait_for_healthcheck(f'{worker_url}/healthcheck') 42 | if not succeeded: 43 | return False 44 | for idx, worker in enumerate(workers): 45 | worker_url = worker_url_template.format(worker) 46 | params = { 47 | 'start_id': idx * records_per_worker, 48 | 'end_id': (1 + idx) * records_per_worker, 49 | 'target_index': target_index 50 | } 51 | log.info(f'Assigning job {params} to {worker_url}') 52 | requests.post(worker_url + '/indexing_task', json=params) 53 | 54 | 55 | def _prepare_workers(): 56 | """ 57 | Get a list of internal URLs bound to each indexing worker. If the worker is 58 | stopped, start the worker. 
59 | 60 | :return: A list of private URLs pointing to each available indexing worker 61 | """ 62 | environment = os.getenv('ENVIRONMENT', 'local') 63 | if environment == 'local': 64 | return [socket.gethostbyname('indexer-worker')] 65 | instance_filters = [ 66 | { 67 | 'Name': 'tag:Name', 68 | 'Values': ['indexer-worker-' + environment + '*'] 69 | }, 70 | { 71 | 'Name': 'instance-state-name', 72 | 'Values': ['stopped', 'running'] 73 | } 74 | ] 75 | response = client.describe_instances(Filters=instance_filters) 76 | servers = [] 77 | ids = [] 78 | for reservation in response['Reservations']: 79 | instance = reservation['Instances'][0] 80 | server = instance['PrivateIpAddress'] 81 | _id = instance['InstanceId'] 82 | servers.append(server) 83 | ids.append(_id) 84 | log.info('Selected worker instances {}'.format(servers)) 85 | client.start_instances(InstanceIds=ids) 86 | return servers 87 | 88 | 89 | def _wait_for_healthcheck(endpoint, attempts=60, wait=5): 90 | """ 91 | Wait for the instance at `endpoint` to become healthy before assigning work. 92 | 93 | :param endpoint: The URL to test 94 | :param attempts: Number of attempts at reaching healthcheck 95 | :param wait: Amount of time to wait between each attempt 96 | :return: True if the healthcheck succeeded 97 | """ 98 | num_attempts = 0 99 | healthcheck_passed = False 100 | while not healthcheck_passed and num_attempts < attempts: 101 | try: 102 | log.info(f'Checking {endpoint}. . .') 103 | response = requests.get(endpoint, timeout=3) 104 | if response.status_code == 200: 105 | healthcheck_passed = True 106 | break 107 | except requests.exceptions.RequestException: 108 | pass 109 | time.sleep(wait) 110 | num_attempts += 1 111 | if num_attempts >= attempts or not healthcheck_passed: 112 | log.error(f'Timed out waiting for {endpoint}.') 113 | return False 114 | else: 115 | log.info(f'{endpoint} passed healthcheck') 116 | return True 117 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/indexer_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | A single worker responsible for indexing a subset of the records stored in the 3 | database. 4 | 5 | Accept an HTTP request specifying a range of image IDs to reindex. After the 6 | data has been indexed, notify Ingestion Server and stop the instance. 
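Two routes are exposed: POST /indexing_task, which accepts a job, and
GET /healthcheck, which the scheduler polls before assigning any work.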
7 | """ 8 | import falcon 9 | import sys 10 | import logging as log 11 | import os 12 | import boto3 13 | import requests 14 | from multiprocessing import Value, Process 15 | from psycopg2.sql import SQL 16 | from ingestion_server.indexer import elasticsearch_connect, TableIndexer 17 | 18 | 19 | ec2_client = boto3.client( 20 | 'ec2', 21 | region_name=os.getenv('AWS_REGION', 'us-east-1'), 22 | aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID', None), 23 | aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY', None) 24 | ) 25 | 26 | 27 | class IndexingJobResource: 28 | def on_post(self, req, resp): 29 | j = req.media 30 | start_id = j['start_id'] 31 | end_id = j['end_id'] 32 | target_index = j['target_index'] 33 | notify_url = f'http://{req.remote_addr}:8001/worker_finished' 34 | _execute_indexing_task(target_index, start_id, end_id, notify_url) 35 | log.info(f'Received indexing request for records {start_id}-{end_id}') 36 | resp.status = falcon.HTTP_201 37 | 38 | 39 | class HealthcheckResource: 40 | def on_get(self, req, resp): 41 | resp.status = falcon.HTTP_200 42 | 43 | 44 | def _execute_indexing_task(target_index, start_id, end_id, notify_url): 45 | table = 'image' 46 | elasticsearch = elasticsearch_connect() 47 | progress = Value('d', 0.0) 48 | finish_time = Value('d', 0.0) 49 | exists_in_table = \ 50 | 'exists(SELECT 1 FROM {table} ' \ 51 | 'WHERE identifier = image.identifier) as "{name}"' 52 | exists_in_deleted_table = exists_in_table.format( 53 | table='api_deletedimage', name='deleted' 54 | ) 55 | exists_in_mature_table = exists_in_table.format( 56 | table='api_matureimage', name='mature' 57 | ) 58 | 59 | query = SQL(f''' 60 | SELECT *, 61 | {exists_in_deleted_table}, {exists_in_mature_table} 62 | FROM image 63 | WHERE id BETWEEN {start_id} AND {end_id} 64 | ''') 65 | log.info('Querying {}'.format(query)) 66 | indexer = TableIndexer( 67 | elasticsearch, table, progress, finish_time 68 | ) 69 | p = Process( 70 | target=_launch_reindex, 71 | args=(table, target_index, query, indexer, notify_url) 72 | ) 73 | p.start() 74 | log.info('Started indexing task') 75 | 76 | 77 | def _launch_reindex(table, target_index, query, indexer, notify_url): 78 | try: 79 | indexer.replicate(table, target_index, query) 80 | except Exception: 81 | log.error("Indexing error occurred: ", exc_info=True) 82 | 83 | log.info(f'Notifying {notify_url}') 84 | requests.post(notify_url) 85 | _self_destruct() 86 | return 87 | 88 | 89 | def _self_destruct(): 90 | """ 91 | Stop this EC2 instance once the task is finished. 
92 | """ 93 | # Get instance ID from AWS metadata service 94 | if os.getenv('ENVIRONMENT', 'local') == 'local': 95 | log.info( 96 | 'Skipping self destruction because worker is in local environment' 97 | ) 98 | return 99 | endpoint = 'http://169.254.169.254/latest/meta-data/instance-id' 100 | response = requests.get(endpoint) 101 | instance_id = response.content.decode('utf8') 102 | log.info('Shutting self down') 103 | ec2_client.stop_instances(InstanceIds=[instance_id]) 104 | 105 | 106 | root = log.getLogger() 107 | root.setLevel(log.DEBUG) 108 | handler = log.StreamHandler(sys.stdout) 109 | handler.setLevel(log.INFO) 110 | formatter = log.Formatter( 111 | '%(asctime)s %(levelname)s %(filename)s:%(lineno)d - %(message)s' 112 | ) 113 | handler.setFormatter(formatter) 114 | root.addHandler(handler) 115 | api = falcon.API() 116 | api.add_route('/indexing_task', IndexingJobResource()) 117 | api.add_route('/healthcheck', HealthcheckResource()) 118 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/qa.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import random 3 | from enum import Enum 4 | from ingestion_server.elasticsearch_models import Image 5 | 6 | 7 | class QAScores(Enum): 8 | TARGET = 1 9 | LESS_RELEVANT = 2 10 | NOT_RELEVANT = 3 11 | 12 | 13 | def create_search_qa_index(): 14 | test_idx = 'search-qa' 15 | _phrase_relevance(test_idx) 16 | 17 | 18 | def test_image(title, tags, creator, relevance): 19 | _id = random.randint(0, 1000000000) 20 | sample_url = 'https://example.com/' 21 | img = Image( 22 | _id=_id, 23 | id=_id, 24 | title=title, 25 | identifier=relevance, 26 | creator=creator, 27 | creator_url=sample_url, 28 | tags=tags, 29 | created_on=None, 30 | url=sample_url, 31 | thumbnail='', 32 | provider='test', 33 | source=sample_url, 34 | license='by', 35 | license_version='3.0', 36 | foreign_landing_url=sample_url, 37 | metadata=None, 38 | view_count=0 39 | ) 40 | return img 41 | 42 | 43 | def _phrase_relevance(index): 44 | less_relevant1 = test_image( 45 | 'A picture of my office', 46 | [{'name': 'office'}], 47 | 'Alice Foo', 48 | QAScores.LESS_RELEVANT.value 49 | ) 50 | less_relevant1.save(index=index) 51 | 52 | less_relevant2 = test_image( 53 | 'My office in my home', 54 | [{'name': 'office'}, {'name': 'home'}], 55 | 'Gordon', 56 | QAScores.LESS_RELEVANT.value 57 | ) 58 | less_relevant2.save(index=index) 59 | 60 | not_relevant = test_image( 61 | 'Mastiff', [{'name': 'dog'}], 'Liam', QAScores.NOT_RELEVANT.value 62 | ) 63 | not_relevant.save(index=index) 64 | 65 | # This should be the top result. 66 | target_tags = [ 67 | {'name': 'home office'}, 68 | {'name': 'noise'}, 69 | {'name': 'clutter'} 70 | ] 71 | target = test_image( 72 | 'My home office', target_tags, 'John Fooson', QAScores.TARGET.value 73 | ) 74 | target.save(index=index) 75 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/state.py: -------------------------------------------------------------------------------- 1 | import shelve 2 | import datetime 3 | import enum 4 | import logging as log 5 | from filelock import FileLock 6 | """ 7 | Indexing is distributed across multiple independent hosts. We don't want to 8 | "go live" in production with the newly indexed data until all of the indexing 9 | workers have finished their tasks. 
To that end, we need to track the state of 10 | each worker, and be notified when the job has finished. 11 | 12 | State is persisted to the disk using shelve. Concurrent writes aren't allowed, 13 | so all operations need to acquire a lock. 14 | """ 15 | 16 | 17 | class WorkerStatus(enum.Enum): 18 | RUNNING = 0 19 | FINISHED = 1 20 | 21 | 22 | def register_indexing_job(worker_ips, target_index): 23 | """ 24 | Track the hosts that are running indexing jobs. Only one indexing job can 25 | run at a time. 26 | 27 | :param worker_ips: A list of private IP addresses corresponding to the pool 28 | of relevant indexer-worker instances. 29 | :param target_index: The name of the Elasticsearch index that will be 30 | promoted to production after indexing is complete 31 | :return: Return True if scheduling succeeds 32 | """ 33 | with FileLock('lock'), shelve.open('db', writeback=True) as db: 34 | # Wipe last job out if it has finished. 35 | indexing_in_progress = False 36 | if 'worker_statuses' in db: 37 | for worker in db['worker_statuses']: 38 | if db['worker_statuses'][worker] == WorkerStatus.RUNNING: 39 | indexing_in_progress = True 40 | if indexing_in_progress: 41 | log.error( 42 | 'Failed to schedule indexing job; another one is running.' 43 | ) 44 | return False 45 | 46 | # Register the workers. 47 | worker_statuses = {} 48 | for worker_url in worker_ips: 49 | worker_statuses[worker_url] = WorkerStatus.RUNNING 50 | db['worker_statuses'] = worker_statuses 51 | db['start_time'] = datetime.datetime.now() 52 | db['target_index'] = target_index 53 | return True 54 | 55 | 56 | def worker_finished(worker_ip): 57 | """ 58 | The scheduler received a notification indicating an indexing worker has 59 | finished its task. 60 | :param worker_ip: The private IP of the worker. 61 | :return: The target index if all workers are finished, else False. 62 | """ 63 | with FileLock('lock'), shelve.open('db', writeback=True) as db: 64 | try: 65 | _ = db['worker_statuses'][worker_ip] 66 | db['worker_statuses'][worker_ip] = WorkerStatus.FINISHED 67 | log.info(f'Received worker_finished signal from {worker_ip}') 68 | except KeyError: 69 | log.error( 70 | 'An indexer worker notified us it finished its task, but ' 71 | 'we are not tracking it.' 72 | ) 73 | for worker_key in db['worker_statuses']: 74 | if db['worker_statuses'][worker_key] == WorkerStatus.RUNNING: 75 | log.info(f'{worker_key} is still indexing') 76 | return False 77 | return db['target_index'] 78 | 79 | 80 | def clear_state(): 81 | """ 82 | Forget about all running index jobs. Use with care. 83 | """ 84 | with FileLock('lock'), shelve.open('db', writeback=True) as db: 85 | for key in db: 86 | log.info('Deleting ' + str(db[key])) 87 | del db[key] 88 | log.info('Cleared indexing state.') 89 | -------------------------------------------------------------------------------- /ingestion_server/ingestion_server/tasks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import datetime as dt 3 | import requests 4 | from enum import Enum 5 | from multiprocessing import Process 6 | from ingestion_server.indexer import elasticsearch_connect, TableIndexer 7 | from ingestion_server.ingest import reload_upstream 8 | 9 | """ Simple in-memory tracking of executed tasks. """ 10 | 11 | 12 | class TaskTypes(Enum): 13 | # Completely reindex all data for a given model. 14 | REINDEX = 0 15 | # Reindex updates to a model from the database since a certain date. 
16 | UPDATE_INDEX = 1 17 | # Download the latest copy of the data from the upstream database, then 18 | # completely reindex the newly imported data. 19 | INGEST_UPSTREAM = 2 20 | # Create indices in Elasticsearch for QA tests. 21 | # This is not intended for production use, but can be safely executed in a 22 | # production environment without consequence. 23 | LOAD_TEST_DATA = 3 24 | 25 | 26 | class TaskTracker: 27 | def __init__(self): 28 | self.id_task = {} 29 | self.id_action = {} 30 | self.id_progress = {} 31 | self.id_start_time = {} 32 | self.id_finish_time = {} 33 | 34 | def add_task(self, task, task_id, action, progress, finish_time): 35 | self._prune_old_tasks() 36 | self.id_task[task_id] = task 37 | self.id_action[task_id] = action 38 | self.id_progress[task_id] = progress 39 | self.id_start_time[task_id] = dt.datetime.utcnow().timestamp() 40 | self.id_finish_time[task_id] = finish_time 41 | return task_id 42 | 43 | def _prune_old_tasks(self): 44 | pass 45 | 46 | def list_task_statuses(self): 47 | self._prune_old_tasks() 48 | results = [] 49 | for _id, task in self.id_task.items(): 50 | percent_completed = self.id_progress[_id].value 51 | active = task.is_alive() 52 | start_time = self.id_start_time[_id] 53 | finish_time = self.id_finish_time[_id].value 54 | results.append({ 55 | 'task_id': _id, 56 | 'active': active, 57 | 'action': self.id_action[_id], 58 | 'progress': percent_completed, 59 | 'error': percent_completed < 100 and not active, 60 | 'start_time': start_time, 61 | 'finish_time': finish_time 62 | }) 63 | sorted_results = sorted( 64 | results, 65 | key=lambda x: x['finish_time'] 66 | ) 67 | 68 | to_utc = dt.datetime.utcfromtimestamp 69 | 70 | def render_date(x): 71 | return to_utc(x) if x != 0.0 else None 72 | 73 | # Convert date to a readable format 74 | for idx, task in enumerate(sorted_results): 75 | start_time = task['start_time'] 76 | finish_time = task['finish_time'] 77 | sorted_results[idx]['start_time'] = str(render_date(start_time)) 78 | sorted_results[idx]['finish_time'] = str(render_date(finish_time)) 79 | 80 | return sorted_results 81 | 82 | 83 | class Task(Process): 84 | def __init__(self, model, task_type, since_date, progress, task_id, 85 | finish_time, callback_url): 86 | Process.__init__(self) 87 | self.model = model 88 | self.task_type = task_type 89 | self.since_date = since_date 90 | self.progress = progress 91 | self.task_id = task_id 92 | self.finish_time = finish_time 93 | self.callback_url = callback_url 94 | 95 | def run(self): 96 | # Map task types to actions. 
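# REINDEX and UPDATE_INDEX call straight into the TableIndexer;
# INGEST_UPSTREAM pulls a fresh copy of the upstream database before
# reindexing; LOAD_TEST_DATA builds the Elasticsearch indices used by
# the QA tests.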
97 | elasticsearch = elasticsearch_connect() 98 | indexer = TableIndexer( 99 | elasticsearch, self.model, self.progress, self.finish_time 100 | ) 101 | if self.task_type == TaskTypes.REINDEX: 102 | indexer.reindex(self.model) 103 | elif self.task_type == TaskTypes.UPDATE_INDEX: 104 | indexer.update(self.model, self.since_date) 105 | elif self.task_type == TaskTypes.INGEST_UPSTREAM: 106 | reload_upstream(self.model) 107 | indexer.reindex(self.model) 108 | elif self.task_type == TaskTypes.LOAD_TEST_DATA: 109 | indexer.load_test_data() 110 | logging.info('Task {} exited.'.format(self.task_id)) 111 | if self.callback_url: 112 | try: 113 | requests.post(self.callback_url) 114 | except requests.exceptions.RequestException as e: 115 | logging.error('Failed to send callback!') 116 | logging.error(e) 117 | -------------------------------------------------------------------------------- /ingestion_server/publish_release.sh: -------------------------------------------------------------------------------- 1 | # Usage: ./publish_release.sh [VERSION] 2 | docker build -t creativecommons/ingestion_server:$1 . 3 | docker build -f Dockerfile-worker -t creativecommons/indexer_worker:$1 . 4 | docker push creativecommons/ingestion_server:$1 5 | docker push creativecommons/indexer_worker:$1 6 | -------------------------------------------------------------------------------- /ingestion_server/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/test/__init__.py -------------------------------------------------------------------------------- /ingestion_server/test/generate_integration_test_docker_compose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import yaml 3 | import datetime 4 | import os 5 | import sys 6 | import traceback 7 | import textwrap 8 | 9 | """ 10 | Parses the docker-compose file and generates an integration-test-docker-compose.yml. 11 | The generated file is written to the same directory this script resides in. 12 | 13 | Q: Why didn't you just use multiple docker-compose files and inheritance? 14 | 15 | A: If you are running the development docker-compose file already, launching 16 | an inherited elasticsearch/postgres service will result in the containers 17 | being destroyed and recreated. Using this approach ensures that: 18 | 1) Running tests doesn't interfere with your development environment. 19 | 2) The file stays up-to-date without manual copy-pasting. 20 | 3) We don't blow up running containers on Travis CI. 21 | """ 22 | 23 | this_dir = os.path.dirname(os.path.realpath(__file__)) 24 | outname = this_dir + '/integration-test-docker-compose.yml' 25 | parent_docker_compose = this_dir + '/../../docker-compose.yml' 26 | 27 | with open(parent_docker_compose, 'r') as docker_compose_file: 28 | docker_compose = yaml.safe_load(docker_compose_file) 29 | try: 30 | db = docker_compose['services']['db'] 31 | es = docker_compose['services']['es'] 32 | ingestion_server = docker_compose['services']['ingestion-server'] 33 | upstream_db = docker_compose['services']['upstream_db'] 34 | # Delete services we're not testing.
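# (We iterate over dict() copies here and below because the services
# dictionary is mutated inside the loop, and mutating a dict while
# iterating over it raises a RuntimeError in Python 3.)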
35 | desired_services = {'es', 'db', 'ingestion-server', 'upstream_db'} 36 | for service in dict(docker_compose['services']): 37 | if service not in desired_services: 38 | del docker_compose['services'][service] 39 | del docker_compose['services']['es']['healthcheck'] 40 | 41 | # Expose alternate ports. Use the same internal port defined in the 42 | # original docker-compose file. 43 | upstream_db_port = upstream_db['ports'][0].split(':')[1] 44 | upstream_db['ports'][0] = '59999' + ':' + upstream_db_port 45 | db['ports'][0] = '60000' + ':' + db['ports'][0].split(':')[1] 46 | es['ports'][0] = '60001' + ':' + es['ports'][0].split(':')[1] 47 | ingestion_api_port = ingestion_server['ports'][0].split(':')[1] 48 | ingestion_server['ports'][0] = '60002' + ':' + ingestion_api_port 49 | 50 | # Configure ingestion server to point to integration containers. 51 | upstream_name = 'integration-upstream' 52 | ingestion_server['environment']['DATABASE_HOST'] = 'integration-db' 53 | ingestion_server['environment']['ELASTICSEARCH_URL'] = 'integration-es' 54 | ingestion_server['environment']['UPSTREAM_DB_HOST'] = upstream_name 55 | ingestion_server['depends_on'] = ['integration-es', 'integration-db'] 56 | ingestion_server['build'] = '../' 57 | 58 | # Create a volume for the mock data 59 | db['volumes'] = ['./mock_data:/mock_data'] 60 | upstream_db['volumes'] = ['./mock_data:/mock_data'] 61 | 62 | # Rename the services and update ports. 63 | for service in dict(docker_compose['services']): 64 | if service in desired_services: 65 | del docker_compose['services'][service] 66 | docker_compose['services']['integration-db'] = db 67 | docker_compose['services']['integration-es'] = es 68 | docker_compose['services']['integration-ingestion'] = ingestion_server 69 | docker_compose['services']['integration-upstream'] = upstream_db 70 | 71 | 72 | # Start the document with a warning message 73 | warning_message = '\n'.join(textwrap.wrap( 74 | 'This docker-compose file was generated from ' 75 | + parent_docker_compose + '. Do not modify this file directly. ' 76 | 'Your changes will be overwritten. Last update: ' 77 | + str(datetime.datetime.now()), width=79, 78 | initial_indent='# ', subsequent_indent='# ')) + '\n\n' 79 | 80 | with open(outname, 'w') as integration_docker_compose: 81 | integration_docker_compose.truncate() 82 | integration_docker_compose.write(warning_message) 83 | yaml.dump(docker_compose, integration_docker_compose, 84 | default_flow_style=False) 85 | 86 | except KeyError as e: 87 | print(traceback.format_exc()) 88 | print('Failed to parse docker-compose.yml due to missing key. No file' 89 | ' was written to disk. Missing key: ' + str(e)) 90 | sys.exit(1) 91 | except Exception as e: 92 | print(traceback.format_exc()) 93 | print('Failed to generate', outname, 'due to exception:', e) 94 | -------------------------------------------------------------------------------- /ingestion_server/test/integration-test-docker-compose.yml: -------------------------------------------------------------------------------- 1 | # This docker-compose file was generated from /home/alden/code/cccatalog- 2 | # api/ingestion_server/test/../../docker-compose.yml. Do not modify this file 3 | # directly. Your changes will be overwritten. 
Last update: 2019-01-09 4 | # 11:36:00.858884 5 | 6 | services: 7 | integration-db: 8 | environment: 9 | POSTGRES_DB: openledger 10 | POSTGRES_HOST: 0.0.0.0 11 | POSTGRES_PASSWORD: deploy 12 | POSTGRES_USER: deploy 13 | healthcheck: 14 | test: pg_isready -U deploy -d openledger 15 | image: postgres:10.3-alpine 16 | ports: 17 | - 60000:5432 18 | volumes: 19 | - ./mock_data:/mock_data 20 | integration-es: 21 | environment: 22 | - xpack.security.enabled=false 23 | image: docker.elastic.co/elasticsearch/elasticsearch:6.2.4 24 | ports: 25 | - 60001:9200 26 | ulimits: 27 | nofile: 28 | hard: 65536 29 | soft: 65536 30 | integration-ingestion: 31 | build: ../ 32 | command: bash -c 'sleep 20 && supervisord -c config/supervisord.conf' 33 | depends_on: 34 | - integration-es 35 | - integration-db 36 | environment: 37 | COPY_TABLES: image 38 | DATABASE_HOST: integration-db 39 | DATABASE_NAME: openledger 40 | DATABASE_PASSWORD: deploy 41 | DATABASE_PORT: '5432' 42 | DATABASE_USER: deploy 43 | DB_BUFFER_SIZE: '100000' 44 | ELASTICSEARCH_PORT: '9200' 45 | ELASTICSEARCH_URL: integration-es 46 | PYTHONUNBUFFERED: '0' 47 | SYNCER_POLL_INTERVAL: '60' 48 | UPSTREAM_DB_HOST: integration-upstream 49 | UPSTREAM_DB_PORT: 5432 50 | ports: 51 | - 60002:8001 52 | stdin_open: true 53 | tty: true 54 | volumes: 55 | - ./ingestion_server:/ingestion-server 56 | integration-upstream: 57 | environment: 58 | POSTGRES_DB: openledger 59 | POSTGRES_HOST: 0.0.0.0 60 | POSTGRES_PASSWORD: deploy 61 | POSTGRES_USER: deploy 62 | healthcheck: 63 | test: pg_isready -U deploy -d openledger 64 | image: postgres:10.3-alpine 65 | ports: 66 | - 59999:5432 67 | volumes: 68 | - ./mock_data:/mock_data 69 | version: '3' 70 | -------------------------------------------------------------------------------- /ingestion_server/test/mock_data/no_constraints_schema.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 10.3 6 | -- Dumped by pg_dump version 10.3 (Debian 10.3-1.pgdg90+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SET check_function_bodies = false; 14 | SET client_min_messages = warning; 15 | SET row_security = off; 16 | 17 | SET default_tablespace = ''; 18 | 19 | SET default_with_oids = false; 20 | 21 | -- 22 | -- Name: image; Type: TABLE; Schema: public; Owner: deploy 23 | -- 24 | 25 | CREATE TABLE public.image ( 26 | id integer NOT NULL, 27 | created_on timestamp with time zone NOT NULL, 28 | updated_on timestamp with time zone NOT NULL, 29 | identifier character varying(255), 30 | perceptual_hash character varying(255), 31 | provider character varying(80), 32 | source character varying(80), 33 | foreign_identifier character varying(1000), 34 | foreign_landing_url character varying(1000), 35 | url character varying(1000) NOT NULL, 36 | thumbnail character varying(1000), 37 | width integer, 38 | height integer, 39 | filesize integer, 40 | license character varying(50) NOT NULL, 41 | license_version character varying(25), 42 | creator character varying(2000), 43 | creator_url character varying(2000), 44 | title character varying(2000), 45 | tags_list character varying(255)[], 46 | last_synced_with_source timestamp with time zone, 47 | removed_from_source boolean NOT NULL, 48 | meta_data jsonb, 49 | view_count integer NOT NULL, 50 | tags jsonb NOT NULL, 51 | watermarked 
boolean NOT NULL 52 | ); 53 | 54 | 55 | ALTER TABLE public.image OWNER TO deploy; 56 | 57 | -- 58 | -- Name: image_id_seq; Type: SEQUENCE; Schema: public; Owner: deploy 59 | -- 60 | 61 | CREATE SEQUENCE public.image_id_seq 62 | AS integer 63 | START WITH 1 64 | INCREMENT BY 1 65 | NO MINVALUE 66 | NO MAXVALUE 67 | CACHE 1; 68 | 69 | 70 | ALTER TABLE public.image_id_seq OWNER TO deploy; 71 | 72 | -- 73 | -- Name: image_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: deploy 74 | -- 75 | 76 | ALTER SEQUENCE public.image_id_seq OWNED BY public.image.id; 77 | 78 | 79 | -- 80 | -- Name: image id; Type: DEFAULT; Schema: public; Owner: deploy 81 | -- 82 | 83 | ALTER TABLE ONLY public.image ALTER COLUMN id SET DEFAULT nextval('public.image_id_seq'::regclass); 84 | 85 | 86 | 87 | -- 88 | -- Name: image image_pkey; Type: CONSTRAINT; Schema: public; Owner: deploy 89 | -- 90 | 91 | ALTER TABLE ONLY public.image 92 | ADD CONSTRAINT image_pkey PRIMARY KEY (id); 93 | 94 | 95 | -- 96 | -- Name: image_foreign_identifier_4c72d3ee_like; Type: INDEX; Schema: public; Owner: deploy 97 | -- 98 | 99 | CREATE INDEX image_foreign_identifier_4c72d3ee_like ON public.image USING btree (foreign_identifier varchar_pattern_ops); 100 | 101 | 102 | -- 103 | -- Name: image_identifier_d102a6e0_like; Type: INDEX; Schema: public; Owner: deploy 104 | -- 105 | 106 | CREATE INDEX image_identifier_d102a6e0_like ON public.image USING btree (identifier varchar_pattern_ops); 107 | 108 | 109 | -- 110 | -- Name: image_last_synced_with_source_187adf09; Type: INDEX; Schema: public; Owner: deploy 111 | -- 112 | 113 | CREATE INDEX image_last_synced_with_source_187adf09 ON public.image USING btree (last_synced_with_source); 114 | 115 | 116 | -- 117 | -- Name: image_perceptual_hash_0d126a7a; Type: INDEX; Schema: public; Owner: deploy 118 | -- 119 | 120 | CREATE INDEX image_perceptual_hash_0d126a7a ON public.image USING btree (perceptual_hash); 121 | 122 | 123 | -- 124 | -- Name: image_perceptual_hash_0d126a7a_like; Type: INDEX; Schema: public; Owner: deploy 125 | -- 126 | 127 | CREATE INDEX image_perceptual_hash_0d126a7a_like ON public.image USING btree (perceptual_hash varchar_pattern_ops); 128 | 129 | 130 | -- 131 | -- Name: image_provider_7d11f847; Type: INDEX; Schema: public; Owner: deploy 132 | -- 133 | 134 | CREATE INDEX image_provider_7d11f847 ON public.image USING btree (provider); 135 | 136 | 137 | -- 138 | -- Name: image_provider_7d11f847_like; Type: INDEX; Schema: public; Owner: deploy 139 | -- 140 | 141 | CREATE INDEX image_provider_7d11f847_like ON public.image USING btree (provider varchar_pattern_ops); 142 | 143 | 144 | -- 145 | -- Name: image_source_d5a89e97; Type: INDEX; Schema: public; Owner: deploy 146 | -- 147 | 148 | CREATE INDEX image_source_d5a89e97 ON public.image USING btree (source); 149 | 150 | 151 | -- 152 | -- Name: image_source_d5a89e97_like; Type: INDEX; Schema: public; Owner: deploy 153 | -- 154 | 155 | CREATE INDEX image_source_d5a89e97_like ON public.image USING btree (source varchar_pattern_ops); 156 | 157 | 158 | -- 159 | -- Name: image_url_c6aabda2_like; Type: INDEX; Schema: public; Owner: deploy 160 | -- 161 | 162 | CREATE INDEX image_url_c6aabda2_like ON public.image USING btree (url varchar_pattern_ops); 163 | 164 | 165 | -- 166 | -- PostgreSQL database dump complete 167 | -- 168 | 169 | -------------------------------------------------------------------------------- /ingestion_server/test/mock_data/schema.sql: -------------------------------------------------------------------------------- 1 | 
-- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 10.3 6 | -- Dumped by pg_dump version 10.3 (Debian 10.3-1.pgdg90+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SET check_function_bodies = false; 14 | SET client_min_messages = warning; 15 | SET row_security = off; 16 | 17 | SET default_tablespace = ''; 18 | 19 | SET default_with_oids = false; 20 | 21 | -- 22 | -- Name: image; Type: TABLE; Schema: public; Owner: deploy 23 | -- 24 | 25 | CREATE TABLE public.image ( 26 | id integer NOT NULL, 27 | created_on timestamp with time zone NOT NULL, 28 | updated_on timestamp with time zone NOT NULL, 29 | identifier character varying(255), 30 | perceptual_hash character varying(255), 31 | provider character varying(80), 32 | source character varying(80), 33 | foreign_identifier character varying(1000), 34 | foreign_landing_url character varying(1000), 35 | url character varying(1000) NOT NULL, 36 | thumbnail character varying(1000), 37 | width integer, 38 | height integer, 39 | filesize integer, 40 | license character varying(50) NOT NULL, 41 | license_version character varying(25), 42 | creator character varying(2000), 43 | creator_url character varying(2000), 44 | title character varying(2000), 45 | tags_list character varying(255)[], 46 | last_synced_with_source timestamp with time zone, 47 | removed_from_source boolean NOT NULL, 48 | meta_data jsonb, 49 | view_count integer NOT NULL, 50 | tags jsonb NOT NULL, 51 | watermarked boolean NOT NULL 52 | ); 53 | 54 | 55 | ALTER TABLE public.image OWNER TO deploy; 56 | 57 | -- 58 | -- Name: image_id_seq; Type: SEQUENCE; Schema: public; Owner: deploy 59 | -- 60 | 61 | CREATE SEQUENCE public.image_id_seq 62 | AS integer 63 | START WITH 1 64 | INCREMENT BY 1 65 | NO MINVALUE 66 | NO MAXVALUE 67 | CACHE 1; 68 | 69 | 70 | ALTER TABLE public.image_id_seq OWNER TO deploy; 71 | 72 | -- 73 | -- Name: image_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: deploy 74 | -- 75 | 76 | ALTER SEQUENCE public.image_id_seq OWNED BY public.image.id; 77 | 78 | 79 | -- 80 | -- Name: image id; Type: DEFAULT; Schema: public; Owner: deploy 81 | -- 82 | 83 | ALTER TABLE ONLY public.image ALTER COLUMN id SET DEFAULT nextval('public.image_id_seq'::regclass); 84 | 85 | 86 | -- 87 | -- Name: image image_foreign_identifier_key; Type: CONSTRAINT; Schema: public; Owner: deploy 88 | -- 89 | 90 | ALTER TABLE ONLY public.image 91 | ADD CONSTRAINT image_foreign_identifier_key UNIQUE (foreign_identifier); 92 | 93 | 94 | -- 95 | -- Name: image image_identifier_key; Type: CONSTRAINT; Schema: public; Owner: deploy 96 | -- 97 | 98 | ALTER TABLE ONLY public.image 99 | ADD CONSTRAINT image_identifier_key UNIQUE (identifier); 100 | 101 | 102 | -- 103 | -- Name: image image_pkey; Type: CONSTRAINT; Schema: public; Owner: deploy 104 | -- 105 | 106 | ALTER TABLE ONLY public.image 107 | ADD CONSTRAINT image_pkey PRIMARY KEY (id); 108 | 109 | 110 | -- 111 | -- Name: image image_url_key; Type: CONSTRAINT; Schema: public; Owner: deploy 112 | -- 113 | 114 | ALTER TABLE ONLY public.image 115 | ADD CONSTRAINT image_url_key UNIQUE (url); 116 | 117 | 118 | -- 119 | -- Name: image_foreign_identifier_4c72d3ee_like; Type: INDEX; Schema: public; Owner: deploy 120 | -- 121 | 122 | CREATE INDEX image_foreign_identifier_4c72d3ee_like ON public.image USING btree (foreign_identifier varchar_pattern_ops); 123 | 124 | 125 | -- 126 | -- 
Name: image_identifier_d102a6e0_like; Type: INDEX; Schema: public; Owner: deploy 127 | -- 128 | 129 | CREATE INDEX image_identifier_d102a6e0_like ON public.image USING btree (identifier varchar_pattern_ops); 130 | 131 | 132 | -- 133 | -- Name: image_last_synced_with_source_187adf09; Type: INDEX; Schema: public; Owner: deploy 134 | -- 135 | 136 | CREATE INDEX image_last_synced_with_source_187adf09 ON public.image USING btree (last_synced_with_source); 137 | 138 | 139 | -- 140 | -- Name: image_perceptual_hash_0d126a7a; Type: INDEX; Schema: public; Owner: deploy 141 | -- 142 | 143 | CREATE INDEX image_perceptual_hash_0d126a7a ON public.image USING btree (perceptual_hash); 144 | 145 | 146 | -- 147 | -- Name: image_perceptual_hash_0d126a7a_like; Type: INDEX; Schema: public; Owner: deploy 148 | -- 149 | 150 | CREATE INDEX image_perceptual_hash_0d126a7a_like ON public.image USING btree (perceptual_hash varchar_pattern_ops); 151 | 152 | 153 | -- 154 | -- Name: image_provider_7d11f847; Type: INDEX; Schema: public; Owner: deploy 155 | -- 156 | 157 | CREATE INDEX image_provider_7d11f847 ON public.image USING btree (provider); 158 | 159 | 160 | -- 161 | -- Name: image_provider_7d11f847_like; Type: INDEX; Schema: public; Owner: deploy 162 | -- 163 | 164 | CREATE INDEX image_provider_7d11f847_like ON public.image USING btree (provider varchar_pattern_ops); 165 | 166 | 167 | -- 168 | -- Name: image_source_d5a89e97; Type: INDEX; Schema: public; Owner: deploy 169 | -- 170 | 171 | CREATE INDEX image_source_d5a89e97 ON public.image USING btree (source); 172 | 173 | 174 | -- 175 | -- Name: image_source_d5a89e97_like; Type: INDEX; Schema: public; Owner: deploy 176 | -- 177 | 178 | CREATE INDEX image_source_d5a89e97_like ON public.image USING btree (source varchar_pattern_ops); 179 | 180 | 181 | -- 182 | -- Name: image_url_c6aabda2_like; Type: INDEX; Schema: public; Owner: deploy 183 | -- 184 | 185 | CREATE INDEX image_url_c6aabda2_like ON public.image USING btree (url varchar_pattern_ops); 186 | 187 | 188 | -- 189 | -- PostgreSQL database dump complete 190 | -- 191 | 192 | -------------------------------------------------------------------------------- /ingestion_server/test/mock_data/update_mocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetches mock data from a running postgres database. 4 | 5 | export PGPASSWORD="deploy" 6 | # Dump schema 7 | pg_dump -s -h localhost -U deploy -d openledger -t 'image' > schema.sql 8 | # Remove search path (so we can refer to the public schema implicitly) 9 | sed -ie '/search_path/d' schema.sql 10 | # Select some images and save to CSV 11 | psql -h localhost -U deploy -d openledger -c "\\copy (select * from image where meta_data is not null limit 1000) to './mocked_images.csv' with CSV" 12 | exit 0 13 | -------------------------------------------------------------------------------- /ingestion_server/test/unit_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import datetime 3 | from uuid import uuid4 4 | from psycopg2.extras import Json 5 | from ingestion_server.cleanup import CleanupFunctions 6 | from ingestion_server.elasticsearch_models import Image 7 | 8 | 9 | def create_mock_image(override=None): 10 | """ 11 | Produce a mock image. Override default fields by passing in a dict with the 12 | desired keys and values. 
13 | 14 | For example, to make an image with a custom title and default everything 15 | else: 16 | >>> create_mock_image({'title': 'My title'}) 17 | :return: 18 | """ 19 | test_popularity = { 20 | 'views': 50, 21 | 'likes': 3, 22 | 'comments': 1 23 | } 24 | license_url = 'https://creativecommons.org/licenses/by/2.0/fr/legalcode' 25 | meta_data = { 26 | 'popularity_metrics': test_popularity, 27 | 'license_url': license_url 28 | } 29 | test_data = { 30 | 'id': 0, 31 | 'title': 'Unit test title', 32 | 'identifier': str(uuid4()), 33 | 'creator': 'Eric Idle', 34 | 'creator_url': 'https://creativecommons.org', 35 | 'tags': [{'name': 'test', 'accuracy': 0.9}], 36 | 'created_on': datetime.datetime.now(), 37 | 'url': 'https://creativecommons.org', 38 | 'thumbnail': 'https://creativecommons.org', 39 | 'provider': 'test', 40 | 'source': 'test', 41 | 'license': 'cc-by', 42 | 'license_version': '4.0', 43 | 'foreign_landing_url': 'https://creativecommons.org', 44 | 'view_count': 0, 45 | 'height': 500, 46 | 'width': 500, 47 | 'mature': False, 48 | 'meta_data': meta_data 49 | } 50 | if override: 51 | for k, v in override.items(): 52 | test_data[k] = v 53 | schema = {} 54 | row = [] 55 | idx = 0 56 | for k, v in test_data.items(): 57 | schema[k] = idx 58 | row.append(v) 59 | idx += 1 60 | return Image.database_row_to_elasticsearch_doc(row, schema) 61 | 62 | 63 | class TestImage: 64 | @staticmethod 65 | def test_size(): 66 | small = create_mock_image({'height': 600, 'width': 300}) 67 | assert small.size == Image.ImageSizes.SMALL.name.lower() 68 | huge = create_mock_image({'height': 4096, 'width': 4096}) 69 | assert huge.size == Image.ImageSizes.LARGE.name.lower() 70 | 71 | @staticmethod 72 | def test_aspect_ratio(): 73 | square = create_mock_image({'height': 300, 'width': 300}) 74 | assert square.aspect_ratio == Image.AspectRatios.SQUARE.name.lower() 75 | tall = create_mock_image({'height': 500, 'width': 200}) 76 | assert tall.aspect_ratio == Image.AspectRatios.TALL.name.lower() 77 | wide = create_mock_image({'height': 200, 'width': 500}) 78 | assert wide.aspect_ratio == Image.AspectRatios.WIDE.name.lower() 79 | 80 | @staticmethod 81 | def test_extension(): 82 | no_extension = create_mock_image({ 83 | 'url': 'https://creativecommons.org/hello' 84 | }) 85 | assert no_extension.extension is None 86 | jpg = create_mock_image({ 87 | 'url': 'https://creativecommons.org/hello.jpg' 88 | }) 89 | assert jpg.extension == 'jpg' 90 | 91 | @staticmethod 92 | def test_mature_metadata(): 93 | # Received upstream indication the work is mature 94 | meta = { 95 | 'mature': True 96 | } 97 | mature_metadata = create_mock_image({'meta_data': meta}) 98 | assert mature_metadata['mature'] 99 | 100 | @staticmethod 101 | def test_mature_api(): 102 | # Manually flagged work as mature ourselves 103 | mature_work = create_mock_image({'mature': True}) 104 | assert mature_work['mature'] 105 | 106 | @staticmethod 107 | def test_default_maturity(): 108 | # Default to not flagged 109 | sfw = create_mock_image() 110 | assert not sfw['mature'] 111 | 112 | 113 | class TestCleanup: 114 | @staticmethod 115 | def test_tag_blacklist(): 116 | tags = [ 117 | { 118 | 'name': 'cc0' 119 | }, 120 | { 121 | 'name': ' cc0' 122 | }, 123 | { 124 | 'name': 'valid', 125 | 'accuracy': 0.99 126 | }, 127 | { 128 | 'name': 'valid_no_accuracy' 129 | }, 130 | { 131 | 'name': 'garbage:=metacrap', 132 | } 133 | ] 134 | result = str(CleanupFunctions.cleanup_tags(tags)) 135 | expected = str(Json([ 136 | {'name': 'valid', 'accuracy': 0.99}, 137 | {'name': 
'valid_no_accuracy'} 138 | ])) 139 | 140 | assert result == expected 141 | 142 | @staticmethod 143 | def test_tag_no_update(): 144 | tags = [ 145 | { 146 | 'name': 'valid', 147 | 'accuracy': 0.92 148 | } 149 | ] 150 | result = CleanupFunctions.cleanup_tags(tags) 151 | assert result is None 152 | 153 | @staticmethod 154 | def test_accuracy_filter(): 155 | tags = [ 156 | { 157 | 'name': 'inaccurate', 158 | 'accuracy': 0.5 159 | }, 160 | { 161 | 'name': 'accurate', 162 | 'accuracy': 0.999 163 | } 164 | ] 165 | result = str(CleanupFunctions.cleanup_tags(tags)) 166 | expected = str(Json([{'name': 'accurate', 'accuracy': 0.999}])) 167 | assert result == expected 168 | 169 | @staticmethod 170 | def test_url_protocol_fix(): 171 | bad_url = 'flickr.com' 172 | tls_support_cache = {} 173 | result = CleanupFunctions.cleanup_url(bad_url, tls_support_cache) 174 | expected = "'https://flickr.com'" 175 | 176 | bad_http = 'neverssl.com' 177 | result_http = CleanupFunctions.cleanup_url(bad_http, tls_support_cache) 178 | expected_http = "'http://neverssl.com'" 179 | assert result == expected 180 | assert result_http == expected_http 181 | 182 | @staticmethod 183 | def test_rank_feature_verify(): 184 | img = create_mock_image({'standardized_popularity': 200}) 185 | assert img.standardized_popularity == 100 186 | img2 = create_mock_image({'standardized_popularity': 0}) 187 | assert img2.standardized_popularity is None 188 | -------------------------------------------------------------------------------- /initialization.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/initialization.PNG -------------------------------------------------------------------------------- /load_sample_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | CCCAPI_CONTAINER_NAME="${CCCAPI_CONTAINER_NAME:-cccatalog-api_web_1}" 4 | ANALYTICS_CONTAINER_NAME="${ANALYTICS_CONTAINER_NAME:-cccatalog-api_analytics_1}" 5 | # Set up API database and upstream 6 | docker exec -i $CCCAPI_CONTAINER_NAME /bin/bash -c 'python3 manage.py migrate --noinput' 7 | # Create a user for integration testing. 8 | docker exec -i $CCCAPI_CONTAINER_NAME /bin/bash <<'EOF' 9 | python3 manage.py shell -c "from django.contrib.auth.models import User 10 | user = User.objects.create_user('continuous_integration', 'test@test.test', 'deploydeploy') 11 | user.save() 12 | " 13 | EOF 14 | # Migrate analytics 15 | docker exec -i $ANALYTICS_CONTAINER_NAME /bin/bash -c 'PYTHONPATH=. pipenv run alembic upgrade head' 16 | PGPASSWORD=deploy pg_dump -s -t image -U deploy -d openledger -h localhost -p 5432 | PGPASSWORD=deploy psql -U deploy -d openledger -p 5433 -h localhost 17 | # Load sample data 18 | PGPASSWORD=deploy psql -U deploy -d openledger -h localhost -p 5432 -c "INSERT INTO content_provider (created_on, provider_identifier, provider_name, domain_name, filter_content) VALUES (now(), 'flickr', 'Flickr', 'https://www.flickr.com', false), (now(), 'behance', 'Behance', 'https://www.behance.net', false);" 19 | PGPASSWORD=deploy psql -U deploy -d openledger -h localhost -p 5433 <