├── .cc-metadata.yml
├── .gitattributes
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── IssueAndPR.yml
│       └── integration-tests.yml
├── .gitignore
├── .idea
│   └── dictionaries
│       └── alden.xml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── DOCUMENTATION_GUIDELINES.md
├── LICENSE
├── README.md
├── analytics
│   ├── Dockerfile
│   ├── Pipfile
│   ├── Pipfile.lock
│   ├── README.md
│   ├── __init__.py
│   ├── alembic.ini
│   ├── attribution_worker.py
│   ├── backdate.py
│   ├── docs
│   │   ├── redoc.html
│   │   └── swagger.yaml
│   ├── event_controller.py
│   ├── gen_daily_report.py
│   ├── migrations
│   │   ├── README
│   │   ├── env.py
│   │   ├── script.py.mako
│   │   └── versions
│   │       ├── 0cd416f5a7d2_add_attribution_events_table.py
│   │       ├── 54e56668b66a_regenerate_initial_migration.py
│   │       ├── 7695412f8a64_switch_to_boolean_search_rating_instead_.py
│   │       └── beb6d39f2dfd_add_reporting_tables.py
│   ├── models.py
│   ├── report_controller.py
│   ├── server.py
│   ├── settings.py
│   └── tests.py
├── cccatalog-api
│   ├── Dockerfile
│   ├── Pipfile
│   ├── Pipfile.lock
│   ├── cccatalog
│   │   ├── __init__.py
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── admin.py
│   │   │   ├── apps.py
│   │   │   ├── controllers
│   │   │   │   ├── __init__.py
│   │   │   │   ├── link_controller.py
│   │   │   │   └── search_controller.py
│   │   │   ├── licenses.py
│   │   │   ├── migrations
│   │   │   │   ├── 0001_initial.py
│   │   │   │   ├── 0002_auto_20180723_1737.py
│   │   │   │   ├── 0003_image_view_count.py
│   │   │   │   ├── 0004_shortenedlink.py
│   │   │   │   ├── 0005_auto_20180803_1905.py
│   │   │   │   ├── 0006_image_watermarked.py
│   │   │   │   ├── 0007_auto_20180803_1909.py
│   │   │   │   ├── 0008_imagelist_slug.py
│   │   │   │   ├── 0009_auto_20180831_1425.py
│   │   │   │   ├── 0010_auto_20180831_1815.py
│   │   │   │   ├── 0011_auto_20181117_0029.py
│   │   │   │   ├── 0012_auto_20190102_2012.py
│   │   │   │   ├── 0013_contentprovider.py
│   │   │   │   ├── 0014_auto_20190122_1853.py
│   │   │   │   ├── 0015_contentprovider_notes.py
│   │   │   │   ├── 0016_auto_20190122_1908.py
│   │   │   │   ├── 0017_remove_contentprovider_updated_on.py
│   │   │   │   ├── 0018_auto_20190122_1917.py
│   │   │   │   ├── 0019_auto_20190307_1830.py
│   │   │   │   ├── 0020_auto_20190918_1954.py
│   │   │   │   ├── 0021_deletedimages.py
│   │   │   │   ├── 0022_reportimage.py
│   │   │   │   ├── 0023_auto_20200423_1526.py
│   │   │   │   ├── 0024_auto_20200423_1601.py
│   │   │   │   ├── 0025_auto_20200429_1401.py
│   │   │   │   ├── 0026_imagereport_date.py
│   │   │   │   ├── 0027_auto_20200515_2037.py
│   │   │   │   ├── 0028_sourcelogo.py
│   │   │   │   └── __init__.py
│   │   │   ├── models.py
│   │   │   ├── serializers
│   │   │   │   ├── __init__.py
│   │   │   │   ├── image_serializers.py
│   │   │   │   ├── link_serializers.py
│   │   │   │   ├── list_serializers.py
│   │   │   │   └── oauth2_serializers.py
│   │   │   ├── tests.py
│   │   │   ├── utils
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ccrel.py
│   │   │   │   ├── dead_link_mask.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── fonts
│   │   │   │   │   ├── SourceCodePro-Bold.ttf
│   │   │   │   │   └── SourceSansPro-Bold.ttf
│   │   │   │   ├── oauth2_helper.py
│   │   │   │   ├── scheduled_tasks.py
│   │   │   │   ├── throttle.py
│   │   │   │   ├── validate_images.py
│   │   │   │   └── watermark.py
│   │   │   └── views
│   │   │       ├── __init__.py
│   │   │       ├── image_views.py
│   │   │       ├── link_views.py
│   │   │       ├── list_views.py
│   │   │       └── site_views.py
│   │   ├── custom_auto_schema.py
│   │   ├── example_responses.py
│   │   ├── scripts
│   │   │   ├── api_load_testing
│   │   │   │   ├── common_english_words.txt
│   │   │   │   └── locustfile.py
│   │   │   ├── migration
│   │   │   │   └── migrate_lists.py
│   │   │   └── thumbnail_load_test
│   │   │       └── locustfile.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   └── wsgi.py
│   ├── manage.py
│   ├── pytest.ini
│   ├── run.sh
│   ├── terms_of_service.html
│   └── test
│       ├── README
│       ├── __init__.py
│       ├── api_live_integration_test.py
│       ├── api_live_search_qa.py
│       ├── run_test.sh
│       ├── search_qa_test.py
│       └── v1_integration_test.py
├── docker-compose.yml
├── ingestion_server
│   ├── .dockerignore
│   ├── Dockerfile
│   ├── Dockerfile-worker
│   ├── Pipfile
│   ├── Pipfile.lock
│   ├── README.md
│   ├── config
│   │   └── supervisord.conf
│   ├── howitworks.png
│   ├── ingestion_server
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── authority.py
│   │   ├── categorize.py
│   │   ├── cleanup.py
│   │   ├── distributed_reindex_scheduler.py
│   │   ├── elasticsearch_models.py
│   │   ├── es_mapping.py
│   │   ├── indexer.py
│   │   ├── indexer_worker.py
│   │   ├── ingest.py
│   │   ├── qa.py
│   │   ├── state.py
│   │   └── tasks.py
│   ├── publish_release.sh
│   └── test
│       ├── __init__.py
│       ├── generate_integration_test_docker_compose.py
│       ├── integration-test-docker-compose.yml
│       ├── integration_tests.py
│       ├── mock_data
│       │   ├── mocked_images.csv
│       │   ├── no_constraints_schema.sql
│       │   ├── schema.sql
│       │   └── update_mocks.sh
│       └── unit_tests.py
├── initialization.PNG
├── load_sample_data.sh
├── local_api_documentation.PNG
├── localhost_request.PNG
├── sample_data
│   ├── make_sample_pop.py
│   ├── pop_col.csv
│   └── sample_data.csv
└── system_architecture.png
/.cc-metadata.yml:
--------------------------------------------------------------------------------
1 | # Whether this GitHub repo is engineering related
2 | engineering_project: true
3 | # Name of the repository/project in English
4 | english_name: CC Catalog API
5 | # All technologies used
6 | technologies: Python, Django, Django REST Framework, Elasticsearch
7 | # Whether this repository should be featured on the CC Open Source site
8 | featured: false
9 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set default behavior to automatically normalize line endings
2 | * text=auto
3 |
4 | # Force all files to always use LF line endings so that if a repo is accessed
5 | # in Unix via a file share from Windows, the files will work
6 | * text eol=lf
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | #################################
2 | # Dependabot Configuration File #
3 | #################################
4 |
5 | # current GitHub-native version of Dependabot
6 | version: 2
7 |
8 | updates:
9 | # Enable version updates for Docker
10 | - package-ecosystem: 'docker'
11 | # Look for a `Dockerfile` in the `/cccatalog-api` directory
12 | directory: '/cccatalog-api'
13 | # Check for updates once a week
14 | schedule:
15 | interval: 'weekly'
16 |
17 | # Enable version updates for Python
18 | - package-ecosystem: 'pip'
19 | # Look for a `Pipfile` in the `/cccatalog-api` directory
20 | directory: '/cccatalog-api'
21 | # Check for updates once a week
22 | schedule:
23 | interval: 'weekly'
24 |
25 |
--------------------------------------------------------------------------------
/.github/workflows/IssueAndPR.yml:
--------------------------------------------------------------------------------
1 | name: "Project Board Automation"
2 |
3 | on:
4 | issues:
5 | types: [ opened ]
6 | pull_request:
7 | types: [ opened ]
8 | jobs:
9 | join_issue_pr_to_project:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: "Automate adding issues to Backlog"
13 | uses: docker://takanabe/github-actions-automate-projects:v0.0.1
14 | if: github.event_name == 'issues'
15 | env:
16 | GITHUB_TOKEN: ${{ secrets.ADMIN_GITHUB_TOKEN }}
17 | GITHUB_PROJECT_URL: https://github.com/orgs/creativecommons/projects/10
18 | GITHUB_PROJECT_COLUMN_NAME: "Pending Review"
19 | - name: "Automate adding PRs to Active Sprint"
20 | uses: docker://takanabe/github-actions-automate-projects:v0.0.1
21 | if: github.event_name == 'pull_request'
22 | continue-on-error: true
23 | env:
24 | GITHUB_TOKEN: ${{ secrets.ADMIN_GITHUB_TOKEN }}
25 | GITHUB_PROJECT_URL: https://github.com/orgs/creativecommons/projects/7
26 | GITHUB_PROJECT_COLUMN_NAME: "In Progress"
27 |
--------------------------------------------------------------------------------
/.github/workflows/integration-tests.yml:
--------------------------------------------------------------------------------
1 | name: Automated tests
2 | on:
3 | pull_request:
4 | branches: [ master ]
5 | push:
6 | branches: [ master ]
7 |
8 | jobs:
9 | Style:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/setup-python@v2
13 | - name: Install pycodestyle
14 | run: pip install pycodestyle
15 | - name: Checkout
16 | uses: actions/checkout@v2
17 | - name: Check API style
18 | run: pycodestyle cccatalog-api/cccatalog --exclude='cccatalog-api/cccatalog/api/migrations,cccatalog-api/cccatalog/example_responses.py' --max-line-length=80 --ignore=E402,E702
19 | - name: Check ingestion-server style
20 | run: pycodestyle ingestion_server/ingestion_server --max-line-length=80 --ignore=E402
21 | Tests:
22 | timeout-minutes: 15
23 | runs-on: ubuntu-latest
24 | steps:
25 | - uses: actions/checkout@v2
26 | - uses: actions/setup-python@v1
27 | - name: Install dependencies
28 | run: |
29 | pip install pytest pipenv
30 | sudo apt-get install libexempi3 librdkafka-dev
31 | PIPENV_PIPFILE=./cccatalog-api/Pipfile pipenv install --system --deploy --dev &
32 | - name: Start API
33 | run: docker-compose up --build -d
34 | - name: Wait for API to come up
35 | run: bash -c 'while [[ "$(curl --insecure -s -o /dev/null -w ''%{http_code}'' http://localhost:8000/healthcheck)" != "200" ]]; do sleep 10; done'
36 | - name: Ingest and index test data
37 | run: ./load_sample_data.sh
38 | - name: Wait for data to be indexed in Elasticsearch
39 | run: bash -c 'while [[ "$(curl -sb -H "Accept:application/json" http://localhost:9200/_cat/aliases/image | grep -c image-)" == "0" ]]; do sleep 5 && docker-compose logs; done'
40 | - name: Run API tests
41 | run: cd cccatalog-api && test/run_test.sh
42 | - name: Run analytics tests
43 | run: cd ./analytics && docker exec -i cccatalog-api_analytics_1 /bin/bash -c 'PYTHONPATH=. pipenv run pytest tests.py'
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | es-venv
3 |
4 | # IDE junk
5 | .idea
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # Environments
91 | .env
92 | .venv
93 | env/
94 | venv/
95 | ENV/
96 | env.bak/
97 | venv.bak/
98 |
99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # mkdocs documentation
107 | /site
108 |
109 | # mypy
110 | .mypy_cache/
111 |
112 | # Local .terraform directories
113 | **/.terraform/*
114 |
115 | # .tfstate files
116 | *.tfstate
117 | *.tfstate.*
118 |
119 | # Crash log files
120 | crash.log
121 |
122 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most
123 | # .tfvars files are managed as part of configuration and so should be included in
124 | # version control.
125 | #
126 | # example.tfvars
127 |
128 | # Ignore crawler data files
129 | ccbot/url_dump.csv
130 | ccbot/crawl_plan.yml
131 |
132 | .DS_Store
133 | *.iml
134 | .idea
135 |
--------------------------------------------------------------------------------
/.idea/dictionaries/alden.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | cccatalog
5 | daemonized
6 | elasticsearch
7 | itersize
8 | syncable
9 | syncer
10 | synchronizer
11 |
12 |
13 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | The Creative Commons team is committed to fostering a welcoming community. This
4 | project and all other Creative Commons open source projects are governed by our
5 | [Code of Conduct][code_of_conduct]. Please report unacceptable behavior to
6 | [conduct@creativecommons.org](mailto:conduct@creativecommons.org) per our
7 | [reporting guidelines][reporting_guide].
8 |
9 | For a history of updates, see the [page history here][updates].
10 |
11 | [code_of_conduct]:https://creativecommons.github.io/community/code-of-conduct/
12 | [reporting_guide]:https://creativecommons.github.io/community/code-of-conduct/enforcement/
13 | [updates]:https://github.com/creativecommons/creativecommons.github.io-source/commits/master/content/community/code-of-conduct/contents.lr
14 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to CC Open Source
2 |
3 | Thank you for your interest in contributing to CC Open Source! This document is a set of guidelines to help you contribute to this project.
4 |
5 |
6 |
7 | ## Code of Conduct
8 |
9 | By participating in this project, you are expected to uphold our [Code of Conduct](https://creativecommons.github.io/community/code-of-conduct/).
10 |
11 |
12 |
13 | ## Project Documentation
14 |
15 | Please consult the [README](./README.md) and [CODEBASE](./CODEBASE.md) files at the root of this repository.
16 |
17 |
18 |
19 | ## How to Contribute
20 |
21 | Please read the processes in our general [Contributing Code](https://creativecommons.github.io/contributing-code/) guidelines on the Creative Commons Open Source website. It contains general instructions that should be followed when contributing to any of the Creative Commons open-source repositories.
22 |
23 |
24 |
25 | ### Bugs
26 |
27 | If you find a bug, please open an issue in this repository describing the bug. You can file a bug [here](https://github.com/creativecommons/cccatalog-api/issues/new?template=bug_report.md). The bug report template will indicate the information you should provide.
28 |
29 | After that, don't forget to tag the issue with the "Bug" label.
30 |
31 |
32 |
33 | ### Proposing changes or new features
34 |
35 | If you have an idea for a new feature or a change to how the CC Catalog API works, please [file an issue](https://github.com/creativecommons/cccatalog-api/issues/new?template=feature_request.md) so we can discuss the possibility of that change or new feature being implemented and released in the future. This lets us come to an agreement about the proposed idea before any work is done.
36 |
37 | If you'd like to build a new feature but don't have a specific idea, please check our [public roadmap](https://docs.google.com/document/d/19yH2V5K4nzWgEXaZhkzD1egzrRayyDdxlzxZOTCm_pc/). Choose something from the pipeline of ideas and follow the same process as above.
38 |
39 |
40 |
41 | ### Pull requests
42 |
43 | Before you start writing code, make sure there is an issue open. Pull requests without a link to an existing issue won't be merged.
44 |
45 | If you want to get started contributing code to this project but don't know exactly what to work on, we have compiled a list of issues labeled [`good first issue`](https://github.com/creativecommons/cccatalog-api/labels/good%20first%20issue), which are small in scope and not too complex to solve. There are also issues labeled [`help wanted`](https://github.com/creativecommons/cccatalog-api/labels/help%20wanted), which can be a bit more complex but are good examples of work for which we are currently accepting help from the community.
46 |
47 | Any code modifications will have to be accompanied by the appropriate unit tests. This will be checked and verified during code review. Once the Pull Request is opened, our CI server will run the unit test suite and run a code linter to verify that the code follows the coding guidelines.
48 |
49 |
50 |
51 | ## Running the tests
52 |
53 | ### How to Run API live integration tests
54 | You can check the health of a live deployment of the API by running the live integration tests.
55 |
56 | 1. Change directory to CC Catalog API
57 | ```
58 | cd cccatalog-api
59 | ```
60 |
61 | 2. Install all dependencies for CC Catalog API
62 | ```
63 | pipenv install
64 | ```
65 |
66 | 3. Launch a new shell session
67 | ```
68 | pipenv shell
69 | ```
70 |
71 | 4. Run API live integration test
72 | ```
73 | ./test/run_test.sh
74 | ```
75 |
76 |
77 |
78 | ### How to Run Ingestion Server tests
79 | You can ingest and index some dummy data using the Ingestion Server API.
80 |
81 | 1. Change directory to ingestion server
82 | ```
83 | cd ingestion_server
84 | ```
85 |
86 | 2. Install all dependencies for Ingestion Server API
87 | ```
88 | pipenv install
89 | ```
90 |
91 | 3. Launch a new shell session
92 | ```
93 | pipenv shell
94 | ```
95 |
96 | 4. Run the integration tests
97 | ```
98 | python3 test/integration_tests.py
99 | ```
100 |
101 |
102 |
103 | ## Questions or Thoughts?
104 |
105 | Talk to us on [our developer mailing list or Slack community](https://creativecommons.github.io/community/).
106 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | cccatalog-api contributors (sorted alphabetically by last name)
2 | ============================================
3 |
4 | * **[Liza Daly](https://github.com/lizadaly)**
5 | * Built CC Search prototype, bits of which live on in this repository to this day
6 | * **[Alden Page](https://github.com/aldenstpage)**
7 | * Author and maintainer of current implementation
8 | * **[Paulo Rosário](https://github.com/paulofilip3)**
9 | * Contributed to solution for consistent link rot filtering without impacting result count, improved test suite
10 | * **[Krystle Salazar](https://github.com/krysal)**
11 | * Implemented image takedown endpoint
12 | * **[Habeeb Shopeju](https://github.com/HAKSOAT)**
13 | * Fixed issue with error handling
14 | * **[Vignesh Ram Somnath](https://github.com/VIGS25)**
15 | * Implemented exclusion of known dead links from the search index
16 |
17 |
--------------------------------------------------------------------------------
/DOCUMENTATION_GUIDELINES.md:
--------------------------------------------------------------------------------
1 | # Documentation Guidelines
2 |
3 | Interested in improving our documentation? Here’s what you need to know before making any changes to the documentation.
4 |
5 |
6 |
7 | ## Introduction
8 |
9 | CC Catalog API uses [drf-yasg](https://github.com/axnsan12/drf-yasg), a tool that generates Swagger/OpenAPI 2.0 specifications from a Django REST Framework API.
10 |
11 |
12 |
13 | ## How to Start Contributing
14 |
15 | - Run the server locally by following this [link](https://github.com/creativecommons/cccatalog-api#running-the-server-locally)
16 | - Update documentation
17 | - Make sure the updates pass the automated tests defined in this [file](https://github.com/creativecommons/cccatalog-api/blob/master/.github/workflows/integration-tests.yml)
18 | - Commit and push
19 | - Create a pull request by following the [GitHub Repo Guidelines](https://opensource.creativecommons.org/contributing-code/github-repo-guidelines/)
20 |
21 |
22 |
23 | ## Documentation Styles
24 |
25 | - All documentation must be written in American English with no contractions.
26 | - Descriptions must be simple and concise.
27 | - Code examples are preferred over videos and screenshots.
28 |
29 |
30 |
31 | ## Cheat Sheet for drf-yasg
32 | This is a quick syntax guide with examples on how to add or update the documentation for API endpoints.
33 |
34 |
35 |
36 | ### Operation ID
37 | The name of the API endpoint.
38 |
39 | **Example**
40 | ```
41 | @swagger_auto_schema(operation_id='image_stats')
42 | ```
43 |
44 |
45 |
46 | ### Operation Description
47 | The description of the API endpoint.
48 |
49 | **Example**
50 | ```
51 | image_stats_description = \
52 | """
53 | image_stats is an API endpoint to get a list of all content providers
54 | and their respective number of images in the Creative Commons catalog.
55 |
56 | You can use this endpoint to get details about content providers
57 | such as `source_name`, `image_count`, `display_name`, and `source_url`.
58 |
59 | You can refer to Bash's Request Samples for examples of how to use
60 | this endpoint.
61 | """ # noqa
62 |
63 | @swagger_auto_schema(operation_id='image_stats',
64 | operation_description=image_stats_description)
65 | ```
66 |
67 |
68 |
69 | ### Responses
70 | The response received after submitting an API request. The current API documentation includes response schemas and response samples, organized by response code.
71 |
72 | **Example**
73 | ```
74 | image_stats_200_example = {
75 | "application/json": {
76 | "source_name": "flickr",
77 | "image_count": 465809213,
78 | "display_name": "Flickr",
79 | "source_url": "https://www.flickr.com"
80 | }
81 | }
82 |
83 | image_stats_response = {
84 | "200": openapi.Response(
85 | description="OK",
86 | examples=image_stats_200_example,
87 | schema=AboutImageResponse(many=True)
88 | )
89 | }
90 |
91 | @swagger_auto_schema(operation_id='image_stats',
92 | operation_description=image_stats_description,
93 | responses=image_stats_response)
94 | ```
95 |
96 |
97 |
98 | ### Request Body
99 | The data sent to the server when submitting an API request.
100 |
101 | **Example**
102 | ```
103 | register_api_oauth2_request = openapi.Schema(
104 | type=openapi.TYPE_OBJECT,
105 | required=['name', 'description', 'email'],
106 | properties={
107 | 'name': openapi.Schema(
108 | title="Name",
109 | type=openapi.TYPE_STRING,
110 | min_length=1,
111 | max_length=150,
112 | unique=True,
113 | description="A unique human-readable name for your application "
114 | "or project requiring access to the CC Catalog API."
115 | ),
116 | 'description': openapi.Schema(
117 | title="Description",
118 | type=openapi.TYPE_STRING,
119 | min_length=1,
120 | max_length=10000,
121 | description="A description of what you are trying to achieve "
122 | "with your project using the API. Please provide "
123 | "as much detail as possible!"
124 | ),
125 | 'email': openapi.Schema(
126 | title="Email",
127 | type=openapi.TYPE_STRING,
128 | min_length=1,
129 | max_length=254,
130 | format=openapi.FORMAT_EMAIL,
131 | description="A valid email that we can reach you at if we "
132 | "have any questions about your use case or "
133 | "data consumption."
134 | )
135 | },
136 | example={
137 | "name": "My amazing project",
138 | "description": "To access CC Catalog API",
139 | "email": "cccatalog-api@creativecommons.org"
140 | }
141 | )
142 |
143 | @swagger_auto_schema(operation_id='register_api_oauth2',
144 | operation_description=register_api_oauth2_description,
145 | request_body=register_api_oauth2_request,
146 | responses=register_api_oauth2_response)
147 | ```
148 |
149 |
150 |
151 | ### Code Examples
152 | Code examples showing how to use the API endpoints. The current API documentation provides code examples in Bash.
153 |
154 | **Example**
155 | ```
156 | image_stats_bash = \
157 | """
158 | # Get a list of content providers and their image count
159 | curl -H "Authorization: Bearer DLBYIcfnKfolaXKcmMC8RIDCavc2hW" http://api.creativecommons.engineering/v1/sources
160 | """ # noqa
161 |
162 | @swagger_auto_schema(operation_id='image_stats',
163 | operation_description=image_stats_description,
164 | responses=image_stats_response,
165 | code_examples=[
166 | {
167 | 'lang': 'Bash',
168 | 'source': image_stats_bash
169 | }
170 | ])
171 | ```
172 |
--------------------------------------------------------------------------------
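
A minimal, hypothetical sketch (not a file in this repository) of how the cheat-sheet pieces above combine on a single Django REST Framework view. It assumes the `image_stats_*` variables defined in the examples are in scope; the non-standard `code_examples` argument is presumably handled by this repository's `custom_auto_schema.py`.

```python
# Hypothetical view tying the cheat-sheet pieces together (illustrative only).
from drf_yasg.utils import swagger_auto_schema
from rest_framework.response import Response
from rest_framework.views import APIView


class ImageStats(APIView):
    @swagger_auto_schema(operation_id='image_stats',
                         operation_description=image_stats_description,
                         responses=image_stats_response,
                         code_examples=[
                             {'lang': 'Bash', 'source': image_stats_bash}
                         ])
    def get(self, request, format=None):
        # The real endpoint would build the per-provider statistics here.
        return Response(status=200, data=[])
```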
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Creative Commons
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/analytics/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | ENV PYTHONUNBUFFERED 1
4 | ENV PYTHONPATH .
5 | WORKDIR /analytics
6 |
7 | # Install Python dependency management tools
8 | RUN pip install --upgrade pip \
9 | && pip install --upgrade setuptools \
10 | && pip install --upgrade pipenv
11 |
12 | # Copy the Pipenv files into the container
13 | COPY . /analytics/
14 |
15 | RUN pipenv install
16 | EXPOSE 8090
17 | ENTRYPOINT pipenv run gunicorn -b '0.0.0.0:8090' server:api
18 |
--------------------------------------------------------------------------------
/analytics/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | falcon = "*"
10 | sqlalchemy = "*"
11 | psycopg2 = "*"
12 | alembic = "*"
13 | gunicorn = "*"
14 | requests = "*"
15 | pytest = "*"
16 | falcon-cors = "*"
17 | confluent-kafka = "*"
18 |
--------------------------------------------------------------------------------
/analytics/README.md:
--------------------------------------------------------------------------------
1 | # CC Search Analytics
2 |
3 | ## Purpose
4 |
5 | The `analytics` server collects information about anonymous usage of CC Search.
6 | We intend to use this information to generate statistics about the quality of
7 | search results; the API may be extended in the future to produce usage data
8 | reports.
9 |
10 | To minimize risks to privacy, data is only connected to an anonymous session
11 | UUID, which changes every time that a user visits CC Search. No other
12 | identifying information is collected for analytical purposes. We intend to
13 | consume this raw data to produce aggregated reports, after which the raw
14 | data (along with session UUIDs) will be promptly deleted.
15 |
16 | ## Running the server
17 |
18 | The analytics server is automatically started by `docker-compose` in the parent
19 | directory. Before analytics endpoints can be called, the database needs to
20 | be set up with `../load_sample_data.sh`.
21 |
22 | To run the `analytics` container by itself:
23 |
24 | ```
25 | cd ../
26 | docker-compose up db analytics
27 | # Set up the database.
28 | cd analytics
29 | alembic upgrade head
30 | ```
31 |
32 | ## Generating new database migrations
33 | After updating `models.py`, you will need to produce new database migrations.
34 |
35 | `alembic revision --autogenerate -m "A message concisely explaining the purpose of your new migration"`
36 |
37 | ## Running the tests
38 |
39 | ```
40 | pipenv install
41 | pipenv run pytest tests.py
42 | ```
43 |
44 | ## Documentation
45 |
46 | After starting the server, you can view the documentation by visiting the
47 | root path (e.g. localhost:8090/). You may have to tweak `docs/redoc.html` for
48 | this to work on your local machine.
49 |
50 | Alternatively, you can view the production version of the documentation at
51 | `https://api.creativecommons.engineering/analytics`.
52 |
53 | ## Contributing / Code Structure
54 |
55 | Pull requests are welcome. Please make sure to update the unit tests and
56 | OpenAPI documentation (`docs/swagger.yaml`) where appropriate.
57 |
58 | `analytics` uses a model-view-controller pattern. It is intended to be simple
59 | and idiomatic Python. You shouldn't need to know much else besides that to get
60 | started.
61 |
62 | Key technologies to familiarize yourself with include:
63 | * [Falcon](https://falcon.readthedocs.io/en/stable/), a backend API web framework.
64 | * [SQLAlchemy](https://www.sqlalchemy.org/), a database ORM.
65 | * [Alembic](https://alembic.sqlalchemy.org/en/latest/), a lightweight database migration tool for SQLAlchemy.
66 | * [pipenv](https://docs.pipenv.org/en/latest/) for packaging.
67 | * [Docker](https://www.docker.com/) for containerization.
68 | * [OpenAPI](https://www.openapis.org/) (AKA Swagger) for human and machine readable documentation.
69 |
--------------------------------------------------------------------------------
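
A hedged usage sketch (not part of the repository): once the analytics container is listening on port 8090, an event can be registered with a plain HTTP POST following the `CreateSearchEvent` schema in `docs/swagger.yaml`. The exact path prefix may differ between local and production deployments.

```python
# Sketch: register a search event against a local analytics server. Assumes
# the docker-compose stack is up and the database migrations have been run.
import uuid

import requests

resp = requests.post(
    'http://localhost:8090/search_event',
    json={
        'query': 'sunset',                  # the user's search query
        'session_uuid': str(uuid.uuid4()),  # anonymous per-visit session ID
    },
)
print(resp.status_code)  # a 400 response indicates invalid input
```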
/analytics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/analytics/__init__.py
--------------------------------------------------------------------------------
/analytics/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # path to migration scripts
5 | script_location = migrations
6 |
7 | # template used to generate migration files
8 | # file_template = %%(rev)s_%%(slug)s
9 |
10 | # timezone to use when rendering the date
11 | # within the migration file as well as the filename.
12 | # string value is passed to dateutil.tz.gettz()
13 | # leave blank for localtime
14 | # timezone =
15 |
16 | # max length of characters to apply to the
17 | # "slug" field
18 | # truncate_slug_length = 40
19 |
20 | # set to 'true' to run the environment during
21 | # the 'revision' command, regardless of autogenerate
22 | # revision_environment = false
23 |
24 | # set to 'true' to allow .pyc and .pyo files without
25 | # a source .py file to be detected as revisions in the
26 | # versions/ directory
27 | # sourceless = false
28 |
29 | # version location specification; this defaults
30 | # to migrations/versions. When using multiple version
31 | # directories, initial revisions must be specified with --version-path
32 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions
33 |
34 | # the output encoding used when revision files
35 | # are written from script.py.mako
36 | # output_encoding = utf-8
37 |
38 | # Logging configuration
39 | [loggers]
40 | keys = root,sqlalchemy,alembic
41 |
42 | [handlers]
43 | keys = console
44 |
45 | [formatters]
46 | keys = generic
47 |
48 | [logger_root]
49 | level = WARN
50 | handlers = console
51 | qualname =
52 |
53 | [logger_sqlalchemy]
54 | level = WARN
55 | handlers =
56 | qualname = sqlalchemy.engine
57 |
58 | [logger_alembic]
59 | level = INFO
60 | handlers =
61 | qualname = alembic
62 |
63 | [handler_console]
64 | class = StreamHandler
65 | args = (sys.stderr,)
66 | level = NOTSET
67 | formatter = generic
68 |
69 | [formatter_generic]
70 | format = %(levelname)-5.5s [%(name)s] %(message)s
71 | datefmt = %H:%M:%S
72 |
--------------------------------------------------------------------------------
/analytics/attribution_worker.py:
--------------------------------------------------------------------------------
1 | import settings
2 | import json
3 | import logging as log
4 | import urllib.parse as urlparse
5 | from urllib.parse import parse_qs
6 | from uuid import UUID
7 | from models import AttributionReferrerEvent
8 | from sqlalchemy import create_engine
9 | from sqlalchemy.orm import sessionmaker
10 | from confluent_kafka import Consumer
11 |
12 |
13 | def parse_identifier(resource):
14 | identifier = None
15 | parsed_url = urlparse.urlparse(resource)
16 | query = parsed_url.query
17 | if query:
18 | try:
19 | query_parsed = parse_qs(query)
20 | image_id = query_parsed['image_id'][0]
21 | identifier = str(UUID(image_id))
22 | except (KeyError, ValueError, TypeError):
23 | identifier = None
24 | return identifier
25 |
26 |
27 | def parse_message(msg):
28 | if msg is None:
29 | return None
30 | try:
31 | decoded = json.loads(msg)
32 | decoded = json.loads(scrub_malformed(decoded['message']))
33 | resource = decoded['request'].split(' ')[1]
34 | _id = parse_identifier(resource)
35 | parsed = {
36 | 'http_referer': decoded['http_referer'],
37 |             'resource': resource,
38 | 'identifier': _id
39 | }
40 | except (json.JSONDecodeError, KeyError):
41 | log.warning(f'Failed to parse {msg}. Reason: ', exc_info=True)
42 | parsed = None
43 | return parsed
44 |
45 |
46 | def save_message(validated_msg: dict, session):
47 | event = AttributionReferrerEvent(
48 | image_uuid=validated_msg['identifier'],
49 | full_referer=validated_msg['http_referer'],
50 | referer_domain=urlparse.urlparse(validated_msg['http_referer']).netloc,
51 | resource=validated_msg['resource']
52 | )
53 | session.add(event)
54 | session.commit()
55 |
56 |
57 | def scrub_malformed(_json: str):
58 | """ Remove some invalid JSON that NGINX sometimes spits out """
59 | return _json.replace('\"upstream_response_time\":,', '')
60 |
61 |
62 | def is_valid(parsed_msg: dict):
63 | """
64 | We are only interested in attribution image logs for images that are
65 | embedded in domains not owned by Creative Commons. We also want to make
66 | sure that we're only tracking hits on embedded content.
67 | """
68 | if parsed_msg is None:
69 | return False
70 | try:
71 | referer = parsed_msg['http_referer']
72 | resource = parsed_msg['resource']
73 | valid = 'creativecommons.org' not in referer and '.svg' in resource
74 | except KeyError:
75 | valid = False
76 | return valid
77 |
78 |
79 | def listen(consumer, database):
80 | saved = 0
81 | ignored = 0
82 | timeout = 30
83 | while True:
84 | msg = consumer.poll(timeout=timeout)
85 | if msg:
86 | parsed_msg = parse_message(str(msg.value(), 'utf-8'))
87 | if is_valid(parsed_msg):
88 | save_message(parsed_msg, database)
89 | saved += 1
90 | else:
91 | ignored += 1
92 | else:
93 |             log.info(f'No message received in {timeout} seconds')
94 |         if (saved + ignored) % 100 == 0:
95 | log.info(f'Saved {saved} attribution events, ignored {ignored}')
96 |
97 |
98 | if __name__ == '__main__':
99 | log.basicConfig(
100 | filename=settings.ATTRIBUTION_LOGFILE,
101 | format='%(asctime)s %(message)s',
102 | level=log.INFO
103 | )
104 | consumer_settings = {
105 | 'bootstrap.servers': settings.KAFKA_HOSTS,
106 | 'group.id': 'attribution_streamer',
107 | 'auto.offset.reset': 'earliest'
108 | }
109 | c = Consumer(consumer_settings)
110 | c.subscribe([settings.KAFKA_TOPIC_NAME])
111 | engine = create_engine(settings.DATABASE_CONNECTION)
112 | session_maker = sessionmaker(bind=engine)
113 | session = session_maker()
114 | listen(c, session)
115 |
--------------------------------------------------------------------------------
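
A quick illustration (not from the repository) of how `parse_identifier` and `is_valid` above behave; the resource path is made up, and the import assumes the module's dependencies (`settings`, `confluent_kafka`) resolve.

```python
# Sketch: exercising attribution_worker's parsing helpers on sample data.
from attribution_worker import is_valid, parse_identifier

# Hypothetical resource path; only the `image_id` query parameter matters.
resource = '/static/img.svg?image_id=12345678-1234-1234-1234-1234567890ab'
assert parse_identifier(resource) == '12345678-1234-1234-1234-1234567890ab'
assert parse_identifier('/static/img.svg') is None  # no image_id query param

# Counted: an .svg hit embedded on a non-Creative-Commons domain.
assert is_valid({'http_referer': 'https://example.com/post',
                 'resource': resource})
# Ignored: hits referred from creativecommons.org itself.
assert not is_valid({'http_referer': 'https://creativecommons.org/page',
                     'resource': resource})
```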
/analytics/backdate.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import settings
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 | from analytics.report_controller import (
6 | generate_usage_report, generate_source_usage_report,
7 | generate_referrer_usage_report, generate_top_searches,
8 | generate_top_result_clicks
9 | )
10 | """
11 | A one-off script for generating analytics reports back to September 2019, when
12 | we first started collecting analytics data.
13 | """
14 |
15 |
16 | engine = create_engine(settings.DATABASE_CONNECTION)
17 | session_maker = sessionmaker(bind=engine)
18 | session = session_maker()
19 | backdate_limit = datetime.datetime(year=2019, month=9, day=10)
20 | current_end_date = datetime.datetime.utcnow()
21 | while current_end_date > backdate_limit:
22 | start_date = current_end_date - datetime.timedelta(days=1)
23 |
24 | generate_usage_report(session, start_date, current_end_date)
25 | generate_source_usage_report(session, start_date, current_end_date)
26 | generate_referrer_usage_report(session, start_date, current_end_date)
27 | generate_top_searches(session, start_date, current_end_date)
28 | generate_top_result_clicks(session, start_date, current_end_date)
29 |
30 |     print(f'Generated backdated reports for {current_end_date}')
31 |     current_end_date -= datetime.timedelta(days=1)
32 |
--------------------------------------------------------------------------------
/analytics/docs/redoc.html:
--------------------------------------------------------------------------------
1 | [HTML stripped during extraction: a ReDoc viewer page (title "ReDoc") that renders the analytics OpenAPI spec for display in a browser.]
--------------------------------------------------------------------------------
/analytics/docs/swagger.yaml:
--------------------------------------------------------------------------------
1 | swagger: "2.0"
2 | info:
3 | description: "An API for registering anonymous usage data events in CC Search, which we intend to use to improve the quality of the search results."
4 | version: "1.0.0"
5 | title: "CC Search Usage Data API"
6 | termsOfService: "https://api.creativecommons.engineering/terms_of_service.html"
7 | contact:
8 | email: "alden@creativecommons.org"
9 | license:
10 | name: "MIT License"
11 | url: "https://github.com/creativecommons/cccatalog-api/blob/master/LICENSE"
12 | host: "api.creativecommons.engineering"
13 | basePath: "/analytics"
14 | tags:
15 | - name: "Register events"
16 | description: "Send events to the analytics server."
17 | schemes:
18 | - "https"
19 | paths:
20 | /search_event:
21 | post:
22 | tags:
23 | - "Register events"
24 | summary: "Register a search query event."
25 | description: ""
26 | operationId: "addSearch"
27 | consumes:
28 | - "application/json"
29 | produces:
30 | - "application/json"
31 | parameters:
32 | - in: "body"
33 | name: "body"
34 | description: "The user's search query and unique session UUID."
35 | required: true
36 | schema:
37 | $ref: "#/definitions/CreateSearchEvent"
38 |
39 | responses:
40 | 400:
41 | description: "Invalid input"
42 | /search_rating_event:
43 | post:
44 | tags:
45 | - "Register events"
46 | summary: "Submit a user's rating of a search."
47 | description: ""
48 | operationId: "addSearchRating"
49 | consumes:
50 | - "application/json"
51 | produces:
52 | - "application/json"
53 | parameters:
54 | - in: "body"
55 | name: "body"
56 | required: true
57 | schema:
58 | $ref: "#/definitions/CreateSearchRatingEvent"
59 |
60 | responses:
61 | 201:
62 | description: "Created"
63 | 400:
64 | description: "Invalid input"
65 | /result_click_event:
66 | post:
67 | tags:
68 | - "Register events"
69 | summary: "Submit an event indicating which result was clicked for a given search query."
70 | description: ""
71 | operationId: "addResultClick"
72 | consumes:
73 | - "application/json"
74 | produces:
75 | - "application/json"
76 | parameters:
77 | - in: "body"
78 | name: "body"
79 | required: true
80 | schema:
81 | $ref: "#/definitions/CreateResultClickEvent"
82 | responses:
83 | 201:
84 | description: "Created"
85 | 400:
86 | description: "Invalid input"
87 | /detail_page_event:
88 | post:
89 | tags:
90 | - "Register events"
91 | summary: "Record events occurring on detail pages, such as sharing an image to social media or clicking through to its source."
92 | description: ""
93 | operationId: "addDetailPageEvent"
94 | consumes:
95 | - "application/json"
96 | produces:
97 | - "application/json"
98 | parameters:
99 | - in: "body"
100 | name: "body"
101 | required: true
102 | schema:
103 | $ref: "#/definitions/CreateDetailPageEvent"
104 | responses:
105 | 201:
106 | description: "Created"
107 | 400:
108 | description: "Invalid input"
109 |
110 | definitions:
111 | CreateSearchEvent:
112 | type: "object"
113 | required:
114 | - query
115 | - session_uuid
116 | properties:
117 | query:
118 | type: "string"
119 | session_uuid:
120 | type: "string"
121 | example: "12345678-1234-1234-1234-1234567890ab"
122 |
123 | CreateSearchRatingEvent:
124 | type: "object"
125 | required:
126 | - query
127 | - relevant
128 | properties:
129 | query:
130 | type: "string"
131 |         description: "The user's search query that is being rated."
132 | relevant:
133 | type: "boolean"
134 | example: true
135 |
136 | CreateResultClickEvent:
137 | type: "object"
138 | required:
139 | - query
140 | - session_uuid
141 | - result_uuid
142 | - result_rank
143 | properties:
144 | query:
145 | type: "string"
146 | result_rank:
147 | type: "integer"
148 | example: 2
149 |         description: "The position of the result in the search results grid, e.g. 0 for the first result, or 22 for the 23rd result."
150 | result_uuid:
151 | type: "string"
152 | example: "12345678-1234-1234-1234-1234567890ab"
153 | description: "The unique identifier for the result that was clicked."
154 | session_uuid:
155 | type: "string"
156 | example: "12345678-1234-1234-1234-1234567890ab"
157 | description: "A unique identifier labeling an anonymous user's session."
158 |
159 | CreateDetailPageEvent:
160 | type: "object"
161 | required:
162 | - event_type
163 | - result_uuid
164 | properties:
165 | event_type:
166 | type: "string"
167 | description: >
168 | Supported event types:
169 | * `ATTRIBUTION_CLICKED` - The user generated an attribution string for this result.
170 | * `REUSE_SURVEY` - The user took a reuse survey.
171 | * `SOURCE_CLICKED` - The user visited the source page of the work.
172 |           * `CREATOR_CLICKED` - The user visited the page of the work's creator.
173 | * `SHARED_SOCIAL` - The user shared a link to the work on social media.
174 | example: "ATTRIBUTION_CLICKED"
175 | enum:
176 | - ATTRIBUTION_CLICKED
177 | - REUSE_SURVEY
178 | - SOURCE_CLICKED
179 | - CREATOR_CLICKED
180 | - SHARED_SOCIAL
181 | result_uuid:
182 | type: "string"
183 | example: "12345678-1234-1234-1234-1234567890ab"
184 | description: "The unique identifier for the detail page associated with the event."
185 |
186 | externalDocs:
187 | description: "The Creative Commons search API"
188 | url: "https://api.creativecommons.engineering"
189 |
--------------------------------------------------------------------------------
/analytics/event_controller.py:
--------------------------------------------------------------------------------
1 | from models import SearchEvent, SearchRatingEvent, ResultClickedEvent, \
2 | DetailPageEvent, DetailPageEvents
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 | from settings import DATABASE_CONNECTION
6 |
7 | class EventController:
8 | def __init__(self):
9 | self.engine = create_engine(DATABASE_CONNECTION)
10 |
11 | def _persist(self, _object):
12 | Session = sessionmaker(bind=self.engine)
13 | session = Session()
14 | session.add(_object)
15 | session.commit()
16 |
17 | def create_search(self, session_uuid, query):
18 | search = SearchEvent(
19 | session_uuid=session_uuid,
20 | query=query
21 | )
22 | self._persist(search)
23 |
24 | def create_search_rating(self, query, relevant):
25 |         if not isinstance(relevant, bool):
26 | raise ValueError('Invalid rating; must be a boolean.')
27 | search_rating = SearchRatingEvent(
28 | query=query,
29 | relevant=relevant
30 | )
31 | self._persist(search_rating)
32 |
33 | def create_result_click(self, session_uuid, result_uuid, query, rank):
34 | result_click = ResultClickedEvent(
35 | session_uuid=session_uuid,
36 | result_uuid=result_uuid,
37 | query=query,
38 | result_rank=rank
39 | )
40 | self._persist(result_click)
41 |
42 | def create_detail_event(self, event, result_uuid):
43 | _event = DetailPageEvents[event]
44 | detail_event = DetailPageEvent(
45 | event_type=_event,
46 | result_uuid=result_uuid
47 | )
48 | self._persist(detail_event)
49 |
50 | def list_valid_detail_events(self):
51 | return [k.name for k in DetailPageEvents]
52 |
--------------------------------------------------------------------------------
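
A hypothetical usage sketch (not from the repository) of `EventController`; it assumes `DATABASE_CONNECTION` in `settings.py` points at a reachable Postgres database with the Alembic migrations applied.

```python
# Sketch: persisting a few analytics events through EventController.
from event_controller import EventController

controller = EventController()
controller.create_search(
    session_uuid='12345678-1234-1234-1234-1234567890ab',
    query='sunset',
)
controller.create_search_rating(query='sunset', relevant=True)

# Detail events must use a DetailPageEvents enum name:
print(controller.list_valid_detail_events())
controller.create_detail_event(
    'ATTRIBUTION_CLICKED',
    result_uuid='12345678-1234-1234-1234-1234567890ab',
)
```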
/analytics/gen_daily_report.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import settings
3 | import logging as log
4 | from sqlalchemy import create_engine
5 | from sqlalchemy.orm import sessionmaker
6 | from analytics.report_controller import (
7 | generate_usage_report, generate_source_usage_report,
8 | generate_referrer_usage_report, generate_top_searches,
9 | generate_top_result_clicks
10 | )
11 |
12 | engine = create_engine(settings.DATABASE_CONNECTION)
13 | session_maker = sessionmaker(bind=engine)
14 | session = session_maker()
15 | end_date = datetime.datetime.utcnow()
16 | start_date = end_date - datetime.timedelta(days=1)
17 |
18 | generate_usage_report(session, start_date, end_date)
19 | generate_source_usage_report(session, start_date, end_date)
20 | generate_referrer_usage_report(session, start_date, end_date)
21 | generate_top_searches(session, start_date, end_date)
22 | generate_top_result_clicks(session, start_date, end_date)
23 |
24 | log.info(f'Generated analytics reports for {end_date}')
25 |
--------------------------------------------------------------------------------
/analytics/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/analytics/migrations/env.py:
--------------------------------------------------------------------------------
1 |
2 | from logging.config import fileConfig
3 |
4 | from sqlalchemy import engine_from_config
5 | from sqlalchemy import pool
6 |
7 | from alembic import context
8 | import inspect
9 | from settings import DATABASE_CONNECTION
10 | import models
11 | from models import *
12 | # this is the Alembic Config object, which provides
13 | # access to the values within the .ini file in use.
14 | config = context.config
15 | config.set_main_option('sqlalchemy.url', DATABASE_CONNECTION)
16 | # Interpret the config file for Python logging.
17 | # This line sets up loggers basically.
18 | fileConfig(config.config_file_name)
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | target_metadata = Base.metadata
25 |
26 | # other values from the config, defined by the needs of env.py,
27 | # can be acquired:
28 | # my_important_option = config.get_main_option("my_important_option")
29 | # ... etc.
30 |
31 |
32 | def include_object(object, name, type_, reflected, compare_to):
33 | """
34 | Tells Alembic whether it owns an object. This can be used to exclude
35 | objects from autogenerated migrations.
36 | """
37 | valid_names = set()
38 | for name, obj in inspect.getmembers(models):
39 | if inspect.isclass(obj):
40 | if hasattr(obj, '__tablename__'):
41 | valid_names.add(str(obj.__tablename__))
42 | if type_ == "table":
43 | if str(object) == "image":
44 | return False
45 | elif str(object) in valid_names:
46 | return True
47 | else:
48 | return False
49 | else:
50 | return True
51 |
52 |
53 | def run_migrations_offline():
54 | """Run migrations in 'offline' mode.
55 |
56 | This configures the context with just a URL
57 | and not an Engine, though an Engine is acceptable
58 | here as well. By skipping the Engine creation
59 | we don't even need a DBAPI to be available.
60 |
61 | Calls to context.execute() here emit the given string to the
62 | script output.
63 |
64 | """
65 | url = DATABASE_CONNECTION
66 | context.configure(
67 | url=url, target_metadata=target_metadata, literal_binds=True,
68 | include_object=include_object
69 | )
70 |
71 | with context.begin_transaction():
72 | context.run_migrations()
73 |
74 |
75 | def run_migrations_online():
76 | """Run migrations in 'online' mode.
77 |
78 | In this scenario we need to create an Engine
79 | and associate a connection with the context.
80 |
81 | """
82 | connectable = engine_from_config(
83 | config.get_section(config.config_ini_section),
84 | prefix="sqlalchemy.",
85 | poolclass=pool.NullPool,
86 | )
87 |
88 | with connectable.connect() as connection:
89 | context.configure(
90 | connection=connection, target_metadata=target_metadata,
91 | include_object=include_object
92 | )
93 |
94 | with context.begin_transaction():
95 | context.run_migrations()
96 |
97 |
98 | if context.is_offline_mode():
99 | run_migrations_offline()
100 | else:
101 | run_migrations_online()
102 |
--------------------------------------------------------------------------------
/analytics/migrations/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 |
18 |
19 | def upgrade():
20 | ${upgrades if upgrades else "pass"}
21 |
22 |
23 | def downgrade():
24 | ${downgrades if downgrades else "pass"}
25 |
--------------------------------------------------------------------------------
/analytics/migrations/versions/0cd416f5a7d2_add_attribution_events_table.py:
--------------------------------------------------------------------------------
1 | """Add attribution events table
2 |
3 | Revision ID: 0cd416f5a7d2
4 | Revises: 7695412f8a64
5 | Create Date: 2020-09-11 15:43:24.507088
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '0cd416f5a7d2'
14 | down_revision = '7695412f8a64'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.create_table('attribution_referrer_event',
22 | sa.Column('id', sa.Integer(), nullable=False),
23 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True),
24 | sa.Column('image_uuid', postgresql.UUID(), nullable=True),
25 | sa.Column('full_referer', sa.String(), nullable=True),
26 | sa.Column('referer_domain', sa.String(), nullable=True),
27 | sa.Column('resource', sa.String(), nullable=True),
28 | sa.PrimaryKeyConstraint('id')
29 | )
30 | op.create_index(op.f('ix_attribution_referrer_event_image_uuid'), 'attribution_referrer_event', ['image_uuid'], unique=False)
31 | op.create_index(op.f('ix_attribution_referrer_event_referer_domain'), 'attribution_referrer_event', ['referer_domain'], unique=False)
32 | op.create_index(op.f('ix_attribution_referrer_event_resource'), 'attribution_referrer_event', ['resource'], unique=False)
33 | op.create_index(op.f('ix_attribution_referrer_event_timestamp'), 'attribution_referrer_event', ['timestamp'], unique=False)
34 | # ### end Alembic commands ###
35 |
36 |
37 | def downgrade():
38 | # ### commands auto generated by Alembic - please adjust! ###
39 | op.drop_index(op.f('ix_attribution_referrer_event_timestamp'), table_name='attribution_referrer_event')
40 | op.drop_index(op.f('ix_attribution_referrer_event_resource'), table_name='attribution_referrer_event')
41 | op.drop_index(op.f('ix_attribution_referrer_event_referer_domain'), table_name='attribution_referrer_event')
42 | op.drop_index(op.f('ix_attribution_referrer_event_image_uuid'), table_name='attribution_referrer_event')
43 | op.drop_table('attribution_referrer_event')
44 | # ### end Alembic commands ###
45 |
--------------------------------------------------------------------------------
/analytics/migrations/versions/54e56668b66a_regenerate_initial_migration.py:
--------------------------------------------------------------------------------
1 | """Regenerate initial migration
2 |
3 | Revision ID: 54e56668b66a
4 | Revises:
5 | Create Date: 2019-11-07 13:57:47.146441
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '54e56668b66a'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.create_table('detail_page_event',
22 | sa.Column('id', sa.Integer(), nullable=False),
23 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True),
24 | sa.Column('result_uuid', postgresql.UUID(), nullable=True),
25 | sa.Column('event_type', sa.Enum('ATTRIBUTION_CLICKED', 'REUSE_SURVEY', 'SOURCE_CLICKED', 'CREATOR_CLICKED', 'SHARED_SOCIAL', name='detailpageevents'), nullable=True),
26 | sa.PrimaryKeyConstraint('id')
27 | )
28 | op.create_index(op.f('ix_detail_page_event_event_type'), 'detail_page_event', ['event_type'], unique=False)
29 | op.create_index(op.f('ix_detail_page_event_result_uuid'), 'detail_page_event', ['result_uuid'], unique=False)
30 | op.create_index(op.f('ix_detail_page_event_timestamp'), 'detail_page_event', ['timestamp'], unique=False)
31 | op.create_table('result_clicked_event',
32 | sa.Column('id', sa.Integer(), nullable=False),
33 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True),
34 | sa.Column('session_uuid', postgresql.UUID(), nullable=True),
35 | sa.Column('result_uuid', postgresql.UUID(), nullable=True),
36 | sa.Column('query', sa.String(), nullable=True),
37 | sa.Column('result_rank', sa.Integer(), nullable=True),
38 | sa.PrimaryKeyConstraint('id')
39 | )
40 | op.create_index(op.f('ix_result_clicked_event_query'), 'result_clicked_event', ['query'], unique=False)
41 | op.create_index(op.f('ix_result_clicked_event_result_uuid'), 'result_clicked_event', ['result_uuid'], unique=False)
42 | op.create_index(op.f('ix_result_clicked_event_session_uuid'), 'result_clicked_event', ['session_uuid'], unique=False)
43 | op.create_index(op.f('ix_result_clicked_event_timestamp'), 'result_clicked_event', ['timestamp'], unique=False)
44 | op.create_table('search_event',
45 | sa.Column('id', sa.Integer(), nullable=False),
46 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True),
47 | sa.Column('query', sa.String(), nullable=True),
48 | sa.Column('session_uuid', postgresql.UUID(), nullable=True),
49 | sa.PrimaryKeyConstraint('id')
50 | )
51 | op.create_index(op.f('ix_search_event_query'), 'search_event', ['query'], unique=False)
52 | op.create_index(op.f('ix_search_event_session_uuid'), 'search_event', ['session_uuid'], unique=False)
53 | op.create_index(op.f('ix_search_event_timestamp'), 'search_event', ['timestamp'], unique=False)
54 | op.create_table('search_rating_event',
55 | sa.Column('id', sa.Integer(), nullable=False),
56 | sa.Column('timestamp', sa.DateTime(), server_default=sa.text('now()'), nullable=True),
57 | sa.Column('query', sa.String(), nullable=True),
58 | sa.Column('rating', sa.Integer(), nullable=True),
59 | sa.PrimaryKeyConstraint('id')
60 | )
61 | op.create_index(op.f('ix_search_rating_event_query'), 'search_rating_event', ['query'], unique=False)
62 | op.create_index(op.f('ix_search_rating_event_timestamp'), 'search_rating_event', ['timestamp'], unique=False)
63 | # ### end Alembic commands ###
64 |
65 |
66 | def downgrade():
67 | # ### commands auto generated by Alembic - please adjust! ###
68 | op.drop_index(op.f('ix_search_rating_event_timestamp'), table_name='search_rating_event')
69 | op.drop_index(op.f('ix_search_rating_event_query'), table_name='search_rating_event')
70 | op.drop_table('search_rating_event')
71 | op.drop_index(op.f('ix_search_event_timestamp'), table_name='search_event')
72 | op.drop_index(op.f('ix_search_event_session_uuid'), table_name='search_event')
73 | op.drop_index(op.f('ix_search_event_query'), table_name='search_event')
74 | op.drop_table('search_event')
75 | op.drop_index(op.f('ix_result_clicked_event_timestamp'), table_name='result_clicked_event')
76 | op.drop_index(op.f('ix_result_clicked_event_session_uuid'), table_name='result_clicked_event')
77 | op.drop_index(op.f('ix_result_clicked_event_result_uuid'), table_name='result_clicked_event')
78 | op.drop_index(op.f('ix_result_clicked_event_query'), table_name='result_clicked_event')
79 | op.drop_table('result_clicked_event')
80 | op.drop_index(op.f('ix_detail_page_event_timestamp'), table_name='detail_page_event')
81 | op.drop_index(op.f('ix_detail_page_event_result_uuid'), table_name='detail_page_event')
82 | op.drop_index(op.f('ix_detail_page_event_event_type'), table_name='detail_page_event')
83 | op.drop_table('detail_page_event')
84 | # ### end Alembic commands ###
85 |
--------------------------------------------------------------------------------
/analytics/migrations/versions/7695412f8a64_switch_to_boolean_search_rating_instead_.py:
--------------------------------------------------------------------------------
1 | """Switch to boolean search rating instead of 1-5 star rating
2 |
3 | Revision ID: 7695412f8a64
4 | Revises: 54e56668b66a
5 | Create Date: 2019-11-07 14:13:50.764789
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '7695412f8a64'
14 | down_revision = '54e56668b66a'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('search_rating_event', sa.Column('relevant', sa.Boolean(), nullable=True))
22 | op.create_index(op.f('ix_search_rating_event_relevant'), 'search_rating_event', ['relevant'], unique=False)
23 | op.drop_column('search_rating_event', 'rating')
24 | # ### end Alembic commands ###
25 |
26 |
27 | def downgrade():
28 | # ### commands auto generated by Alembic - please adjust! ###
29 | op.add_column('search_rating_event', sa.Column('rating', sa.INTEGER(), autoincrement=False, nullable=True))
30 | op.drop_index(op.f('ix_search_rating_event_relevant'), table_name='search_rating_event')
31 | op.drop_column('search_rating_event', 'relevant')
32 | # ### end Alembic commands ###
33 |
--------------------------------------------------------------------------------
/analytics/models.py:
--------------------------------------------------------------------------------
1 | import enum
2 | from sqlalchemy import Integer, Column, Enum, String, DateTime, Boolean, Float
3 | from sqlalchemy.dialects.postgresql import UUID
4 | from sqlalchemy.sql import func
5 | from sqlalchemy.ext.declarative import declarative_base
6 |
7 | Base = declarative_base()
8 |
9 | class Image(Base):
10 | __tablename__ = "image"
11 | # Managed by Django ORM; partially duplicated here so we can join
12 | # analytics and image data together. This is excluded from migrations.
13 | id = Column(Integer, primary_key=True)
14 | identifier = Column(UUID)
15 | source = Column(String)
16 | provider = Column(String)
17 | title = Column(String)
18 |
19 |
20 | class EventMixin(object):
21 | id = Column(Integer, primary_key=True)
22 | timestamp = Column(DateTime, server_default=func.now(), index=True)
23 |
24 |
25 | class ReportMixin(object):
26 | id = Column(Integer, primary_key=True)
27 | start_time = Column(DateTime, index=True)
28 | end_time = Column(DateTime, index=True)
29 |
30 |
31 | class SearchEvent(Base, EventMixin):
32 | """
33 | Store searches linked to a session UUID.
34 | """
35 | __tablename__ = "search_event"
36 |
37 | query = Column(String, index=True)
38 | session_uuid = Column(UUID, index=True)
39 |
40 |
41 | class SearchRatingEvent(Base, EventMixin):
42 | """
43 | Users can provide feedback about the quality of search results.
44 | """
45 |     __tablename__ = "search_rating_event"
46 |
47 | query = Column(String, index=True)
48 | relevant = Column(Boolean, index=True)
49 |
50 |
51 | class ResultClickedEvent(Base, EventMixin):
52 | """
53 | Link result clicks to search sessions.
54 | """
55 | __tablename__ = "result_clicked_event"
56 |
57 | session_uuid = Column(UUID, index=True)
58 | result_uuid = Column(UUID, index=True)
59 | query = Column(String, index=True)
60 | result_rank = Column(Integer)
61 |
62 |
63 | class DetailPageEvents(enum.Enum):
64 | ATTRIBUTION_CLICKED = enum.auto()
65 | REUSE_SURVEY = enum.auto()
66 | SOURCE_CLICKED = enum.auto()
67 | CREATOR_CLICKED = enum.auto()
68 | SHARED_SOCIAL = enum.auto()
69 |
70 |
71 | class DetailPageEvent(Base, EventMixin):
72 | """
73 | Events that happen on result pages, such as clicking an attribution button
74 | or sharing the result on social media.
75 | """
76 | __tablename__ = "detail_page_event"
77 |
78 | result_uuid = Column(UUID, index=True)
79 | event_type = Column(Enum(DetailPageEvents), index=True)
80 |
81 |
82 | class AttributionReferrerEvent(Base, EventMixin):
83 | """
84 | Triggered by a user's browser loading one of our static assets on a non-CC
85 | site. By parsing server logs, we can determine which work was embedded and
86 | on which domain it appeared.
87 | """
88 | __tablename__ = "attribution_referrer_event"
89 |
90 | image_uuid = Column(UUID, index=True)
91 | full_referer = Column(String)
92 | referer_domain = Column(String, index=True)
93 | # The path to the embedded asset on our server. ex: /static/img/cc-by.svg
94 | resource = Column(String, index=True)
95 |
96 | # Reports
97 |
98 |
99 | class UsageReport(Base, ReportMixin):
100 | """ Tracks statistics for the last 24 hours """
101 | __tablename__ = "usage_reports"
102 | results_clicked = Column(Integer)
103 | attribution_buttonclicks = Column(Integer)
104 | survey_responses = Column(Integer)
105 | source_clicked = Column(Integer)
106 | creator_clicked = Column(Integer)
107 | shared_social = Column(Integer)
108 | sessions = Column(Integer)
109 | searches = Column(Integer)
110 | attribution_referer_hits = Column(Integer)
111 | avg_rating = Column(Float)
112 | avg_searches_per_session = Column(Float)
113 |
114 |
115 | class SourceUsageReport(Base, ReportMixin):
116 | __tablename__ = "source_report"
117 |
118 | source_id = Column(String, index=True)
119 | result_clicks = Column(Integer, index=True)
120 |
121 |
122 | class AttributionRefererReport(Base, ReportMixin):
123 | __tablename__ = "attribution_referer_report"
124 |
125 | domain = Column(String, index=True)
126 | hits = Column(Integer, index=True)
127 |
128 |
129 | class TopSearchesReport(Base, ReportMixin):
130 | __tablename__ = "top_searches"
131 | term = Column(String, index=True)
132 | hits = Column(Integer, index=True)
133 |
134 |
135 | class TopResultsReport(Base, ReportMixin):
136 | __tablename__ = "top_results"
137 | result_uuid = Column(UUID, index=True)
138 | hits = Column(Integer, index=True)
139 | source = Column(String, index=True)
140 | title = Column(String, index=True)
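141 |
142 | # A minimal usage sketch (an editor's illustration, assuming the Alembic
143 | # migrations have been run and a PostgreSQL instance is reachable at
144 | # settings.DATABASE_CONNECTION):
145 | if __name__ == '__main__':
146 |     import uuid
147 |     from sqlalchemy import create_engine
148 |     from sqlalchemy.orm import sessionmaker
149 |
150 |     import settings
151 |
152 |     engine = create_engine(settings.DATABASE_CONNECTION)
153 |     session = sessionmaker(bind=engine)()
154 |     # `id` comes from the primary key sequence; `timestamp` comes from
155 |     # the now() server default.
156 |     session.add(SearchEvent(query='kittens', session_uuid=str(uuid.uuid4())))
157 |     session.commit()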
--------------------------------------------------------------------------------
/analytics/server.py:
--------------------------------------------------------------------------------
1 | import falcon
2 | from falcon_cors import CORS
3 | from event_controller import EventController
4 |
5 | event_controller = EventController()
6 |
7 | class SearchEventResource:
8 | def on_post(self, req, resp):
9 | j = req.media
10 | event_controller.create_search(
11 | query=j['query'],
12 | session_uuid=j['session_uuid']
13 | )
14 | resp.status = falcon.HTTP_201
15 |
16 |
17 | class SearchRatingEventResource:
18 | def on_post(self, req, resp):
19 | j = req.media
20 | try:
21 | event_controller.create_search_rating(
22 | query=j['query'],
23 | relevant=j['relevant']
24 | )
25 | resp.status = falcon.HTTP_201
26 | except ValueError:
27 | resp.body = '{"message": "Rating must be True or False"}'
28 | resp.status = falcon.HTTP_400
29 |
30 |
31 | class ResultClickEventResource:
32 | def on_post(self, req, resp):
33 | j = req.media
34 | event_controller.create_result_click(
35 | session_uuid=j['session_uuid'],
36 | result_uuid=j['result_uuid'],
37 | query=j['query'],
38 | rank=j['result_rank']
39 | )
40 | resp.status = falcon.HTTP_201
41 |
42 |
43 | class DetailEventResource:
44 | def on_post(self, req, resp):
45 | j = req.media
46 | try:
47 | event_controller.create_detail_event(
48 | event=j['event_type'],
49 | result_uuid=j['result_uuid']
50 | )
51 | resp.status = falcon.HTTP_201
52 | except KeyError:
53 | valid_events = event_controller.list_valid_detail_events()
54 | resp.body = \
55 | '{{"message": "Invalid event_type. Valid types: {}"}}' \
56 | .format(valid_events)
57 | resp.status = falcon.HTTP_400
58 |
59 |
60 | class RedocResource:
61 | def on_get(self, req, resp):
62 | resp.status = falcon.HTTP_200
63 | resp.content_type = 'text/html'
64 | with open('docs/redoc.html', 'r') as f:
65 | resp.body = f.read()
66 |
67 |
68 | class OpenAPISpecResource:
69 | def on_get(self, req, resp):
70 | resp.status = falcon.HTTP_200
71 |         resp.content_type = 'text/yaml'
72 | with open('docs/swagger.yaml', 'r') as f:
73 | resp.body = f.read()
74 |
75 | origins = [
76 | 'https://ccsearch.creativecommons.org',
77 | 'https://ccsearch-dev.creativecommons.org',
78 | 'https://search.creativecommons.org'
79 | ]
80 | cors = CORS(
81 | allow_origins_list=origins,
82 | allow_all_methods=True,
83 | allow_all_headers=True
84 | )
85 | api = falcon.API(middleware=[cors.middleware])
86 | api.add_route('/', RedocResource())
87 | api.add_route('/swagger.yaml', OpenAPISpecResource())
88 | api.add_route('/search_event', SearchEventResource())
89 | api.add_route('/search_rating_event', SearchRatingEventResource())
90 | api.add_route('/result_click_event', ResultClickEventResource())
91 | api.add_route('/detail_page_event', DetailEventResource())
92 |
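93 | # What a client call looks like (a sketch; the host and port are
94 | # assumptions that depend on the deployment, and `requests` is not a
95 | # dependency of this service):
96 | #
97 | #   import requests
98 | #   requests.post('http://localhost:8000/search_event', json={
99 | #       'query': 'kittens',
100 | #       'session_uuid': '1f4c2f0e-6a2b-4b8e-9c3d-2a1b0c9d8e7f',
101 | #   })
102 | #
103 | # A missing or unknown `event_type` on /detail_page_event returns a 400
104 | # listing the valid event names.
105 |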
--------------------------------------------------------------------------------
/analytics/settings.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | DATABASE_CONNECTION = os.getenv(
4 | 'DATABASE_CONN', 'postgres+psycopg2://deploy:deploy@localhost/openledger'
5 | )
6 |
7 | # Attribution events stream configuration
8 | KAFKA_HOSTS = os.getenv('KAFKA_HOSTS', 'kafka:9092')
9 | KAFKA_TOPIC_NAME = os.getenv('KAFKA_TOPIC', 'attribution_events_dev')
10 | ATTRIBUTION_LOGFILE = os.getenv('LOGFILE', '/var/log/attribution_worker.log')
11 |
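12 | # Each of these can be overridden through the environment before start-up,
13 | # e.g. (hypothetical connection string):
14 | #   export DATABASE_CONN=postgres+psycopg2://user:pass@db.internal/analytics
15 |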
--------------------------------------------------------------------------------
/cccatalog-api/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7-stretch
2 |
3 | ENV PYTHONUNBUFFERED 1
4 |
5 | RUN apt-get update \
6 |     && apt-get install -y libexempi3 \
7 | && mkdir /cccatalog-api \
8 |     && mkdir -p /var/log/cccatalog-api
9 |
10 | ADD cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf /usr/share/fonts/truetype/SourceSansPro-Bold.ttf
11 |
12 | WORKDIR /cccatalog-api
13 |
14 | # Install Python dependency management tools
15 | RUN pip install --upgrade pip \
16 | && pip install --upgrade setuptools \
17 | && pip install --upgrade pipenv
18 |
19 | # Copy the Pipenv files into the container
20 | COPY Pipfile /cccatalog-api/
21 | COPY Pipfile.lock /cccatalog-api/
22 |
23 | # Install the dependencies system-wide
24 | # TODO: Use build args to avoid installing dev dependencies in production
25 | RUN pipenv install --deploy --system --dev
26 |
27 | ENTRYPOINT ["./run.sh"]
28 |
--------------------------------------------------------------------------------
/cccatalog-api/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 | remote-pdb = "*"
8 | ipython = "*"
9 | pipdeptree = "*"
10 | pycodestyle = "*"
11 |
12 | [packages]
13 | psycopg2-binary = "*"
14 | redlock-py = "*"
15 | hvac = "*"
16 | PyJWT = "*"
17 | python3-openid = "*"
18 | wsgi-basic-auth = "*"
19 | grequests = "*"
20 | requests-oauthlib = "*"
21 | aws-requests-auth = "*"
22 | Django = "==2.2.13"
23 | Pillow = "*"
24 | django-cors-headers = "*"
25 | django-uuslug = "*"
26 | django-sslserver = "*"
27 | django-oauth-toolkit = "==1.1.2"
28 | django-braces = "*"
29 | django-redis = "*"
30 | pytest-django = ">=3.5"
31 | djangorestframework = "*"
32 | drf-yasg = "*"
33 | elasticsearch-dsl = "==7.2.1"
34 | piexif = "*"
35 | python-xmp-toolkit = "*"
36 | deepdiff = "*"
37 | djangorestframework-xml = "*"
38 | gevent = "*"
39 | django-storages = "*"
40 | boto3 = "*"
41 |
42 | [packages.future]
43 | version = "*"
44 |
45 | [packages.ipaddress]
46 | version = "*"
47 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | from cccatalog.api.models import (
3 | ImageReport, MatureImage, DeletedImage, ContentProvider, SourceLogo, PENDING
4 | )
5 |
6 |
7 | @admin.register(ImageReport)
8 | class ImageReportAdmin(admin.ModelAdmin):
9 | list_display = (
10 | 'reason', 'status', 'image_url', 'description', 'created_at'
11 | )
12 | list_filter = ('status', 'reason')
13 | list_display_links = ('status',)
14 | search_fields = ('description', 'identifier')
15 | actions = None
16 |
17 | def get_readonly_fields(self, request, obj=None):
18 | if obj is None:
19 | return []
20 | always_readonly = [
21 | 'reason', 'image_url', 'description', 'identifier', 'created_at'
22 | ]
23 | if obj.status == PENDING:
24 | return always_readonly
25 | else:
26 | status_readonly = ['status']
27 | status_readonly.extend(always_readonly)
28 | return status_readonly
29 |
30 |
31 | @admin.register(MatureImage)
32 | class MatureImageAdmin(admin.ModelAdmin):
33 | search_fields = ('identifier',)
34 |
35 |
36 | @admin.register(DeletedImage)
37 | class DeletedImageAdmin(admin.ModelAdmin):
38 | search_fields = ('identifier',)
39 |
40 |
41 | class InlineImage(admin.TabularInline):
42 | model = SourceLogo
43 |
44 |
45 | @admin.register(ContentProvider)
46 | class ProviderAdmin(admin.ModelAdmin):
47 | list_display = ('provider_name', 'provider_identifier')
48 | search_fields = ('provider_name', 'provider_identifier')
49 | exclude = ('notes', 'created_on')
50 | inlines = [InlineImage]
51 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
4 | class ApiConfig(AppConfig):
5 | name = 'api'
6 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/controllers/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/controllers/link_controller.py:
--------------------------------------------------------------------------------
1 | # All possible letters that can appear in a shortened URL path
2 | URL_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
3 | # Inverted index of the alphabet
4 | ALPHABET_INDEX = {c: idx for idx, c in enumerate(URL_ALPHABET)}
5 |
6 |
7 | def get_next_shortened_path(last_url):
8 | """
9 |     Produce the next short URL path. Paths grow by one character only
10 |     when every path of the current length has been allocated.
11 |     :param last_url: The most recently allocated URL path.
12 | :return: A short URL path, such as '9abx'
13 | """
14 | def get_next_char(c):
15 | c_idx = ALPHABET_INDEX[c]
16 | next_char_idx = (c_idx + 1) % len(URL_ALPHABET)
17 | return URL_ALPHABET[next_char_idx]
18 |
19 | if last_url is None:
20 | return URL_ALPHABET[0]
21 |
22 | last_character = last_url[-1]
23 | next_character = get_next_char(last_character)
24 |
25 | temp_path = last_url
26 | if next_character == URL_ALPHABET[0]:
27 |         # Iterate backwards to carry the last digit, resetting each
28 |         # wrapped digit to the first character of the alphabet.
29 |         carry = True
30 |         idx = len(temp_path) - 1
31 |         while idx >= 0 and carry:
32 |             c = temp_path[idx]
33 |             if c == URL_ALPHABET[-1]:
34 |                 # This digit wraps around; the carry moves one place left.
35 |                 temp_path = \
36 |                     temp_path[:idx] + URL_ALPHABET[0] + temp_path[idx + 1:]
37 |                 if idx == 0:
38 |                     # Overflowed; add a new digit.
39 |                     temp_path = temp_path + URL_ALPHABET[0]
40 |             else:
41 |                 carry = False
42 |                 temp_path = \
43 |                     temp_path[:idx] + get_next_char(c) + temp_path[idx + 1:]
44 |             idx -= 1
45 |         next_path = temp_path
46 |     else:
47 |         # Increment the last digit.
48 |         next_path = temp_path[:-1] + next_character
49 |     return next_path
50 |
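51 | # A quick illustration of the expected sequence (an editor's sketch):
52 | if __name__ == '__main__':
53 |     assert get_next_shortened_path(None) == 'a'
54 |     assert get_next_shortened_path('z') == 'A'
55 |     # The last single-character path grows by one character,
56 |     assert get_next_shortened_path('9') == 'aa'
57 |     # and wrapped digits reset while the carry moves left.
58 |     assert get_next_shortened_path('a9') == 'ba'
59 |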
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/licenses.py:
--------------------------------------------------------------------------------
1 | LICENSES = (
2 | ("BY", "Attribution"),
3 | ("BY-NC", "Attribution NonCommercial"),
4 | ("BY-ND", "Attribution NoDerivatives"),
5 | ("BY-SA", "Attribution ShareAlike"),
6 | ("BY-NC-ND", "Attribution NonCommercial NoDerivatives"),
7 | ("BY-NC-SA", "Attribution NonCommercial ShareAlike"),
8 | ("PDM", "Public Domain Mark"),
9 | ("CC0", "Public Domain Dedication"),
10 | )
11 |
12 | LICENSE_GROUPS = {
13 | # All open licenses
14 | "all": {'BY', 'BY-NC', 'BY-ND', 'BY-SA', 'BY-NC-ND', 'BY-NC-SA', 'PDM',
15 | 'CC0'},
16 | # All CC licenses
17 | "all-cc": {'BY', 'BY-NC', 'BY-ND', 'BY-SA', 'BY-NC-ND', 'BY-NC-SA', 'CC0'},
18 | # All licenses allowing commercial use
19 | "commercial": {'BY', 'BY-SA', 'BY-ND', 'CC0', 'PDM'},
20 | # All licenses allowing modifications
21 | "modification": {'BY', 'BY-SA', 'BY-NC', 'BY-NC-SA', 'CC0', 'PDM'},
22 | }
23 |
24 | ATTRIBUTION = \
25 | "{title} {creator}is licensed under CC-{_license} {version}. To view a " \
26 | "copy of this license, visit {license_url}."
27 |
28 |
29 | def get_license_url(_license, version, meta_data=None):
30 | license_overridden = meta_data and 'license_url' in meta_data
31 | if license_overridden and meta_data['license_url'] is not None:
32 | return meta_data['license_url']
33 | elif _license.lower() == 'pdm':
34 | return 'https://creativecommons.org/publicdomain/mark/1.0/'
35 | else:
36 | return f'https://creativecommons.org/licenses/{_license}/{version}/'
37 |
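38 | # How the template and URL helper combine (a sketch; the work metadata is
39 | # made up). Note that `creator` is expected to carry its own trailing
40 | # space, e.g. 'by Alice ', which is why ATTRIBUTION has no space before
41 | # 'is'.
42 | if __name__ == '__main__':
43 |     url = get_license_url('by-sa', '4.0')
44 |     assert url == 'https://creativecommons.org/licenses/by-sa/4.0/'
45 |     print(ATTRIBUTION.format(title='"Sunset"', creator='by Alice ',
46 |                              _license='BY-SA', version='4.0',
47 |                              license_url=url))
48 |     # "Sunset" by Alice is licensed under CC-BY-SA 4.0. To view a copy of
49 |     # this license, visit https://creativecommons.org/licenses/by-sa/4.0/.
50 |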
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0002_auto_20180723_1737.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-07-23 17:37
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0001_initial'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='imagelist',
15 | name='id',
16 | field=models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0003_image_view_count.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-07-26 19:53
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0002_auto_20180723_1737'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='image',
15 | name='view_count',
16 | field=models.IntegerField(default=0),
17 | ),
18 | migrations.RunSQL('ALTER TABLE image ALTER view_count SET DEFAULT 0')
19 | ]
20 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0004_shortenedlink.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-01 17:46
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0003_image_view_count'),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='ShortenedLink',
15 | fields=[
16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
17 | ('updated_on', models.DateTimeField(auto_now=True)),
18 | ('shortened_path', models.CharField(db_index=True, help_text='The path to the shortened URL, e.g. tc3n834. The resulting URL will be shares.cc/tc3n834.', max_length=10, unique=True)),
19 | ('full_url', models.URLField(max_length=1000, unique=True)),
20 | ('created_on', models.DateTimeField(auto_now_add=True, db_index=True)),
21 | ],
22 | options={
23 | 'abstract': False,
24 | },
25 | ),
26 | ]
27 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0005_auto_20180803_1905.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-03 19:05
2 |
3 | import django.contrib.postgres.fields.jsonb
4 | from django.db import migrations
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('api', '0004_shortenedlink'),
11 | ]
12 |
13 | operations = [
14 | migrations.RemoveField(
15 | model_name='image',
16 | name='tags',
17 | ),
18 | migrations.AddField(
19 | model_name='image',
20 | name='tags',
21 | field=django.contrib.postgres.fields.jsonb.JSONField(default=None),
22 | preserve_default=False,
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0006_image_watermarked.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-03 19:08
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0005_auto_20180803_1905'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='image',
15 | name='watermarked',
16 | field=models.BooleanField(default=None),
17 | preserve_default=False,
18 | ),
19 | ]
20 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0007_auto_20180803_1909.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-03 19:09
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0006_image_watermarked'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='image',
15 | name='foreign_identifier',
16 | field=models.CharField(blank=True, db_index=True, max_length=1000, null=True, unique=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0008_imagelist_slug.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-07 17:37
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0007_auto_20180803_1909'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='imagelist',
15 | name='slug',
16 | field=models.CharField(default=None, help_text='A unique identifier used to make a friendly URL for external downstream API consumers.', max_length=200, unique=True),
17 | preserve_default=False,
18 | ),
19 | ]
20 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0009_auto_20180831_1425.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-31 14:25
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0008_imagelist_slug'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='imagelist',
15 | name='auth',
16 | field=models.CharField(default='fdsadfwetyhegaerg', help_text='A randomly generated string assigned upon list creation. Used to authenticate updates and deletions.', max_length=64),
17 | preserve_default=False,
18 | ),
19 | migrations.AlterField(
20 | model_name='imagelist',
21 | name='slug',
22 | field=models.CharField(help_text='A unique identifier used to make a friendly URL for downstream API consumers.', max_length=200, unique=True),
23 | ),
24 | migrations.AlterField(
25 | model_name='shortenedlink',
26 | name='full_url',
27 | field=models.URLField(db_index=True, max_length=1000, unique=True),
28 | ),
29 | ]
30 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0010_auto_20180831_1815.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-08-31 18:15
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0009_auto_20180831_1425'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='imagelist',
15 | name='slug',
16 | field=models.CharField(db_index=True, help_text='A unique identifier used to make a friendly URL for downstream API consumers.', max_length=200, unique=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0011_auto_20181117_0029.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2018-11-17 00:29
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0010_auto_20180831_1815'),
10 | ]
11 |
12 | operations = [
13 | migrations.RemoveField(
14 | model_name='image',
15 | name='perceptual_hash',
16 | ),
17 | migrations.AlterField(
18 | model_name='image',
19 | name='identifier',
20 | field=models.UUIDField(db_index=True, unique=True),
21 | ),
22 | migrations.AlterField(
23 | model_name='imagelist',
24 | name='images',
25 | field=models.ManyToManyField(help_text='A list of identifier keys corresponding to images.', related_name='lists', to='api.Image'),
26 | ),
27 | ]
28 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0012_auto_20190102_2012.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-02 20:12
2 |
3 | import django.contrib.postgres.fields.jsonb
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('api', '0011_auto_20181117_0029'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterUniqueTogether(
15 | name='usertags',
16 | unique_together=set(),
17 | ),
18 | migrations.RemoveField(
19 | model_name='usertags',
20 | name='image',
21 | ),
22 | migrations.RemoveField(
23 | model_name='usertags',
24 | name='tag',
25 | ),
26 | migrations.RemoveField(
27 | model_name='usertags',
28 | name='user',
29 | ),
30 | migrations.AlterField(
31 | model_name='image',
32 | name='tags',
33 | field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True),
34 | ),
35 | migrations.AlterField(
36 | model_name='image',
37 | name='watermarked',
38 | field=models.NullBooleanField(),
39 | ),
40 | migrations.DeleteModel(
41 | name='UserTags',
42 | ),
43 | ]
44 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0013_contentprovider.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 18:51
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0012_auto_20190102_2012'),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='ContentProvider',
15 | fields=[
16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
17 | ('created_on', models.DateTimeField(auto_now_add=True)),
18 | ('updated_on', models.DateTimeField(auto_now=True)),
19 | ('provider_identifier', models.CharField(max_length=50)),
20 | ('provider_name', models.CharField(max_length=250)),
21 | ('domain_name', models.CharField(max_length=500)),
22 | ('filter_content', models.BooleanField(default=False)),
23 | ],
24 | options={
25 | 'db_table': 'content_provider',
26 | },
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0014_auto_20190122_1853.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 18:53
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0013_contentprovider'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='contentprovider',
15 | name='provider_name',
16 | field=models.CharField(max_length=250, unique=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0015_contentprovider_notes.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 19:04
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0014_auto_20190122_1853'),
10 | ]
11 |
12 | operations = [
13 | migrations.AddField(
14 | model_name='contentprovider',
15 | name='notes',
16 | field=models.TextField(default=''),
17 | preserve_default=False,
18 | ),
19 | ]
20 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0016_auto_20190122_1908.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 19:08
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0015_contentprovider_notes'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='contentprovider',
15 | name='created_on',
16 | field=models.DateTimeField(),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0017_remove_contentprovider_updated_on.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 19:16
2 |
3 | from django.db import migrations
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0016_auto_20190122_1908'),
10 | ]
11 |
12 | operations = [
13 | migrations.RemoveField(
14 | model_name='contentprovider',
15 | name='updated_on',
16 | ),
17 | ]
18 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0018_auto_20190122_1917.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.8 on 2019-01-22 19:17
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0017_remove_contentprovider_updated_on'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='contentprovider',
15 | name='notes',
16 | field=models.TextField(null=True),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0019_auto_20190307_1830.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.13 on 2019-03-07 18:30
2 |
3 | from django.conf import settings
4 | from django.db import migrations, models
5 | import django.db.models.deletion
6 | import oauth2_provider.generators
7 | import oauth2_provider.validators
8 |
9 |
10 | class Migration(migrations.Migration):
11 |
12 | dependencies = [
13 | migrations.swappable_dependency(settings.AUTH_USER_MODEL),
14 | ('api', '0018_auto_20190122_1917'),
15 | ]
16 |
17 | operations = [
18 | migrations.CreateModel(
19 | name='OAuth2Registration',
20 | fields=[
21 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
22 | ('name', models.CharField(help_text='A unique human-readable name for your application or project requiring access to the CC Catalog API.', max_length=150, unique=True)),
23 | ('description', models.CharField(help_text='A description of what you are trying to achieve with your project using the API. Please provide as much detail as possible!', max_length=10000)),
24 | ('email', models.EmailField(help_text='A valid email that we can reach you at if we have any questions about your use case or data consumption.', max_length=254)),
25 | ],
26 | ),
27 | migrations.CreateModel(
28 | name='ThrottledApplication',
29 | fields=[
30 | ('id', models.BigAutoField(primary_key=True, serialize=False)),
31 | ('client_id', models.CharField(db_index=True, default=oauth2_provider.generators.generate_client_id, max_length=100, unique=True)),
32 | ('redirect_uris', models.TextField(blank=True, help_text='Allowed URIs list, space separated', validators=[oauth2_provider.validators.validate_uris])),
33 | ('client_type', models.CharField(choices=[('confidential', 'Confidential'), ('public', 'Public')], max_length=32)),
34 | ('authorization_grant_type', models.CharField(choices=[('authorization-code', 'Authorization code'), ('implicit', 'Implicit'), ('password', 'Resource owner password-based'), ('client-credentials', 'Client credentials')], max_length=32)),
35 | ('client_secret', models.CharField(blank=True, db_index=True, default=oauth2_provider.generators.generate_client_secret, max_length=255)),
36 | ('name', models.CharField(blank=True, max_length=255)),
37 | ('skip_authorization', models.BooleanField(default=False)),
38 | ('created', models.DateTimeField(auto_now_add=True)),
39 | ('updated', models.DateTimeField(auto_now=True)),
40 | ('rate_limit_model', models.CharField(choices=[('standard', 'standard'), ('enhanced', 'enhanced')], default='standard', max_length=20)),
41 | ('user', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='api_throttledapplication', to=settings.AUTH_USER_MODEL)),
42 | ],
43 | options={
44 | 'abstract': False,
45 | },
46 | ),
47 | migrations.AlterField(
48 | model_name='image',
49 | name='foreign_identifier',
50 | field=models.CharField(blank=True, db_index=True, help_text='The identifier provided by the upstream source.', max_length=1000, null=True, unique=True),
51 | ),
52 | migrations.AlterField(
53 | model_name='image',
54 | name='foreign_landing_url',
55 | field=models.CharField(blank=True, help_text='The landing page of the work.', max_length=1000, null=True),
56 | ),
57 | migrations.AlterField(
58 | model_name='image',
59 | name='identifier',
60 | field=models.UUIDField(db_index=True, help_text='A unique identifier that we assign on ingestion.', unique=True),
61 | ),
62 | migrations.AlterField(
63 | model_name='image',
64 | name='provider',
65 | field=models.CharField(blank=True, db_index=True, help_text='The content provider, e.g. Flickr, 500px...', max_length=80, null=True),
66 | ),
67 | migrations.AlterField(
68 | model_name='image',
69 | name='source',
70 | field=models.CharField(blank=True, db_index=True, help_text='The source of the data, meaning a particular dataset. Source and provider can be different: the Google Open Images dataset is source=openimages., but provider=Flickr.', max_length=80, null=True),
71 | ),
72 | migrations.AlterField(
73 | model_name='image',
74 | name='thumbnail',
75 | field=models.URLField(blank=True, help_text='The thumbnail for the image, if any.', max_length=1000, null=True),
76 | ),
77 | migrations.AlterField(
78 | model_name='image',
79 | name='url',
80 | field=models.URLField(help_text='The actual URL to the image.', max_length=1000, unique=True),
81 | ),
82 | ]
83 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0020_auto_20190918_1954.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.4 on 2019-09-18 19:54
2 |
3 | from django.conf import settings
4 | from django.db import migrations, models
5 | import django.db.models.deletion
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('api', '0019_auto_20190307_1830'),
12 | ]
13 |
14 | operations = [
15 | migrations.AddField(
16 | model_name='throttledapplication',
17 | name='verified',
18 | field=models.BooleanField(default=False),
19 | ),
20 | migrations.AlterField(
21 | model_name='image',
22 | name='identifier',
23 | field=models.UUIDField(db_index=True, help_text='Our unique identifier for a CC work.', unique=True),
24 | ),
25 | migrations.CreateModel(
26 | name='OAuth2Verification',
27 | fields=[
28 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
29 | ('email', models.EmailField(max_length=254)),
30 | ('code', models.CharField(db_index=True, max_length=256)),
31 | ('associated_application', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.OAUTH2_PROVIDER_APPLICATION_MODEL)),
32 | ],
33 | ),
34 | ]
35 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0021_deletedimages.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.4 on 2020-01-16 18:56
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0020_auto_20190918_1954'),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='DeletedImages',
15 | fields=[
16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
17 | ('created_on', models.DateTimeField(auto_now_add=True)),
18 | ('updated_on', models.DateTimeField(auto_now=True)),
19 | ('deleted_id', models.UUIDField(db_index=True, help_text='The identifier of the deleted image.', unique=True)),
20 | ('deleting_user', models.CharField(help_text='The user that deleted the image.', max_length=50)),
21 | ],
22 | options={
23 | 'abstract': False,
24 | },
25 | ),
26 | ]
27 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0022_reportimage.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-04-12 19:54
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0021_deletedimages'),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='ImageReport',
15 | fields=[
16 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
17 | ('identifier', models.UUIDField()),
18 | ('reason', models.CharField(choices=[('mature', 'mature'), ('dmca', 'dmca'), ('other', 'other')], max_length=10)),
19 | ('description', models.TextField(max_length=500)),
20 | ],
21 | options={
22 | 'db_table': 'nsfw_reports',
23 | },
24 | ),
25 | ]
26 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0023_auto_20200423_1526.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-04-23 15:26
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0022_reportimage'),
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='MatureImages',
15 | fields=[
16 | ('identifier', models.UUIDField(primary_key=True, serialize=False, unique=True)),
17 | ('created_on', models.DateTimeField(auto_now_add=True)),
18 | ],
19 | ),
20 | migrations.RemoveField(
21 | model_name='deletedimages',
22 | name='deleted_id',
23 | ),
24 | migrations.RemoveField(
25 | model_name='deletedimages',
26 | name='deleting_user',
27 | ),
28 | migrations.RemoveField(
29 | model_name='deletedimages',
30 | name='id',
31 | ),
32 | migrations.AddField(
33 | model_name='deletedimages',
34 | name='identifier',
35 | field=models.UUIDField(default='c9341bce-6e8b-4d6a-b098-29f5ca1253ac', help_text='The identifier of the deleted image.', primary_key=True, serialize=False, unique=True),
36 | preserve_default=False,
37 | ),
38 | migrations.AddField(
39 | model_name='imagereport',
40 | name='status',
41 | field=models.CharField(choices=[('pending', 'pending'), ('confirmed', 'confirmed'), ('rejected', 'rejected')], default='pending', max_length=20),
42 | ),
43 | migrations.AlterField(
44 | model_name='imagereport',
45 | name='description',
46 | field=models.TextField(blank=True, max_length=500, null=True),
47 | ),
48 | migrations.AlterField(
49 | model_name='imagereport',
50 | name='reason',
51 | field=models.CharField(choices=[('mature', 'mature'), ('dmca', 'dmca'), ('other', 'other')], max_length=20),
52 | ),
53 | ]
54 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0024_auto_20200423_1601.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-04-23 16:01
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0023_auto_20200423_1526'),
10 | ]
11 |
12 | operations = [
13 | migrations.RenameModel(
14 | old_name='DeletedImages',
15 | new_name='DeletedImage',
16 | ),
17 | migrations.RenameModel(
18 | old_name='MatureImages',
19 | new_name='MatureImage',
20 | ),
21 | migrations.AlterField(
22 | model_name='imagereport',
23 | name='status',
24 | field=models.CharField(choices=[('pending_review', 'pending_review'), ('mature_filter', 'mature_filter'), ('deindex', 'deindex'), ('do_nothing', 'do_nothing')], default='pending', max_length=20),
25 | ),
26 | ]
27 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0025_auto_20200429_1401.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-04-29 14:01
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0024_auto_20200423_1601'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='imagereport',
15 | name='status',
16 | field=models.CharField(choices=[('pending_review', 'pending_review'), ('mature_filtered', 'mature_filtered'), ('deindexed', 'deindexed'), ('no_action', 'no_action')], default='pending_review', max_length=20),
17 | ),
18 | migrations.DeleteModel(
19 | name='ImageTags',
20 | ),
21 | ]
22 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0026_imagereport_date.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-05-15 17:44
2 |
3 | from django.db import migrations, models
4 | import django.utils.timezone
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('api', '0025_auto_20200429_1401'),
11 | ]
12 |
13 | operations = [
14 | migrations.AddField(
15 | model_name='imagereport',
16 | name='date',
17 | field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
18 | preserve_default=False,
19 | ),
20 | ]
21 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0027_auto_20200515_2037.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.10 on 2020-05-15 20:37
2 |
3 | from django.db import migrations
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('api', '0026_imagereport_date'),
10 | ]
11 |
12 | operations = [
13 | migrations.RenameField(
14 | model_name='imagereport',
15 | old_name='date',
16 | new_name='created_at',
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/0028_sourcelogo.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2.13 on 2020-06-30 19:36
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('api', '0027_auto_20200515_2037'),
11 | ]
12 |
13 | operations = [
14 | migrations.CreateModel(
15 | name='SourceLogo',
16 | fields=[
17 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
18 | ('image', models.ImageField(upload_to='')),
19 | ('source', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='api.ContentProvider')),
20 | ],
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/migrations/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/serializers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/serializers/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/serializers/link_serializers.py:
--------------------------------------------------------------------------------
1 | import redlock
2 | import os
3 | import logging as log
4 | from rest_framework.serializers import ModelSerializer, Serializer, URLField, \
5 | ValidationError
6 | from cccatalog.api.controllers.link_controller import get_next_shortened_path
7 | from cccatalog.api.models import ShortenedLink
8 | from cccatalog import settings
9 | from urllib.parse import urlparse
10 | from rest_framework import serializers
11 |
12 | # Create a lock inside of Redis to ensure that multiple server workers don't
13 | # try to create the same shortened URL.
14 | __parsed_redis_url = urlparse(settings.CACHES['locks']['LOCATION'])
15 | __host, __port = __parsed_redis_url.netloc.split(':')
16 | __db_num = __parsed_redis_url.path.lstrip('/') or None
17 | __password = os.environ.get("REDIS_PASSWORD")
18 | # Clients attempt to acquire the lock only once; if it is already held, save() fails and logs an error.
19 | url_lock = redlock.Redlock(
20 | [{"host": __host, "port": __port, "db": __db_num, "password": __password}],
21 | retry_count=1, retry_delay=1000
22 | )
23 |
24 |
25 | class ShortenedLinkResponseSerializer(Serializer):
26 | shortened_url = URLField(
27 | help_text="A shortened link on the `shares.cc` domain."
28 | )
29 |
30 |
31 | class ShortenedLinkSerializer(ModelSerializer):
32 | """
33 | A single shortened URL, mapping a shortened path at shares.cc to a full
34 | URL elsewhere on the CC Catalog platform.
35 | """
36 | full_url = serializers.URLField(
37 | max_length=1000,
38 | help_text="The URL to shorten. Only URLs on the CC Catalog domain will"
39 | " be accepted. Valid domains: `{}`. "
40 | "Valid paths: `{}`".format(settings.SHORT_URL_WHITELIST,
41 | settings.SHORT_URL_PATH_WHITELIST)
42 | )
43 |
44 | class Meta:
45 | model = ShortenedLink
46 | fields = ('full_url',)
47 |
48 | def validate_full_url(self, value):
49 | parsed_url = urlparse(value)
50 |         domain = parsed_url.netloc
51 |         path = parsed_url.path
52 |         if domain not in settings.SHORT_URL_WHITELIST:
53 | raise ValidationError(
54 | "You can only create a short URL to items inside of the CC "
55 | "Catalog. Pointing to other domains is not allowed."
56 | )
57 |
58 | found_allowed_path = False
59 | for allowed_path in settings.SHORT_URL_PATH_WHITELIST:
60 | if path.startswith(allowed_path):
61 | found_allowed_path = True
62 |
63 | if not found_allowed_path:
64 | raise ValidationError(
65 | "Illegal path. Valid paths must start with {}".format(
66 | str(settings.SHORT_URL_PATH_WHITELIST)
67 | )
68 | )
69 |
70 | return value
71 |
72 | def save(self):
73 | two_seconds_ms = 1000 * 2
74 | lock = url_lock.lock('unique_url_lock', ttl=two_seconds_ms)
75 | shortened_path = None
76 | if lock:
77 | try:
78 | last_url = str(
79 | ShortenedLink
80 | .objects
81 | .latest(field_name='created_on')
82 | .shortened_path
83 | )
84 | except ShortenedLink.DoesNotExist:
85 | # No URLs exist. Create the first one.
86 | last_url = None
87 |
88 | shortened_path = get_next_shortened_path(last_url)
89 | full_url = self.validated_data['full_url']
90 | shortened_link_instance = ShortenedLink(
91 | shortened_path=shortened_path,
92 | full_url=full_url
93 | )
94 | shortened_link_instance.save()
95 | url_lock.unlock(lock)
96 | return shortened_path
97 | else:
98 | log.error('Failed to acquire URL lock.')
99 | return shortened_path
100 |
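101 | # The locking pattern used in save(), in isolation (a sketch assuming a
102 | # locally reachable Redis; the host, port and db below are made up):
103 | #
104 | #   import redlock
105 | #   dlm = redlock.Redlock([{'host': 'localhost', 'port': 6379, 'db': 0}])
106 | #   lock = dlm.lock('unique_url_lock', ttl=2000)  # TTL in milliseconds
107 | #   if lock:
108 | #       try:
109 | #           pass  # critical section: read the latest path, save the next
110 | #       finally:
111 | #           dlm.unlock(lock)
112 | #   else:
113 | #       pass  # another worker holds the lock; fail fast and report
114 |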
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/serializers/list_serializers.py:
--------------------------------------------------------------------------------
1 | from rest_framework import serializers
2 | from cccatalog.api.models import ImageList, Image
3 | from cccatalog.api.serializers.image_serializers import ImageDetailSerializer
4 | import secrets
5 |
6 |
7 | class ImageListBaseSerializer(serializers.ModelSerializer):
8 | images = serializers.SlugRelatedField(
9 | many=True,
10 | queryset=Image.objects.all(),
11 | slug_field='identifier',
12 | help_text='A list of unique IDs.'
13 | )
14 |
15 | class Meta:
16 | fields = ('images',)
17 |
18 | def validate_images(self, image_keys):
19 | if len(image_keys) > 500:
20 | raise serializers.ValidationError(
21 | "Only up to 500 images can be added to a list."
22 | )
23 | return image_keys
24 |
25 |
26 | class ImageListCreateSerializer(ImageListBaseSerializer):
27 | """
28 | Responsible for parsing POST JSON body and persisting to the database.
29 | """
30 | lookup_field = 'id'
31 | id = serializers.ReadOnlyField()
32 | auth = serializers.ReadOnlyField()
33 |
34 | class Meta:
35 | model = ImageList
36 | fields = ('id', 'title', 'images', 'auth')
37 |
38 | def save(self):
39 | title = self.validated_data['title']
40 | images = self.validated_data['images']
41 | auth = secrets.token_urlsafe(48)
42 | image_list = ImageList(title=title, auth=auth)
43 | image_list.save()
44 | image_list.images.add(*images)
45 |
46 | return image_list
47 |
48 |
49 | class ImageListResponseSerializer(serializers.Serializer):
50 | """
51 | Return a list of fully resolved images.
52 | """
53 | lookup_field = 'slug'
54 | id = serializers.ReadOnlyField()
55 | title = serializers.CharField()
56 | images = ImageDetailSerializer(many=True)
57 |
58 |
59 | class ImageListUpdateSerializer(ImageListBaseSerializer):
60 | lookup_field = 'id'
61 |
62 | class Meta:
63 | model = ImageList
64 | fields = ('images',)
65 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/serializers/oauth2_serializers.py:
--------------------------------------------------------------------------------
1 | from rest_framework import serializers
2 | from cccatalog.api.models import OAuth2Registration
3 | from oauth2_provider.models import Application
4 |
5 |
6 | class OAuth2RegistrationSerializer(serializers.ModelSerializer):
7 | class Meta:
8 | model = OAuth2Registration
9 | fields = ('name', 'description', 'email')
10 |
11 |
12 | class OAuth2RegistrationSuccessful(serializers.ModelSerializer):
13 | name = serializers.CharField(
14 | help_text="A unique human-readable name for your application "
15 | "or project requiring access to the CC Catalog API."
16 | )
17 | client_id = serializers.CharField(
18 | help_text="A publicly exposed string used by CC Catalog API "
19 | "to identify the application."
20 | )
21 | client_secret = serializers.CharField(
22 | help_text="A private string that authenticates the identity "
23 | "of the application to the CC Catalog API."
24 | )
25 |
26 | class Meta:
27 | model = Application
28 | fields = ('name', 'client_id', 'client_secret')
29 |
30 |
31 | class OAuth2KeyInfo(serializers.Serializer):
32 | requests_this_minute = serializers.IntegerField(
33 | help_text="The number of requests your key has performed in the last "
34 | "minute.",
35 | allow_null=True
36 | )
37 | requests_today = serializers.IntegerField(
38 | help_text="The number of requests your key has performed in the last "
39 | "day.",
40 | allow_null=True
41 | )
42 | rate_limit_model = serializers.CharField(
43 | help_text="The type of rate limit applied to your key. Can be "
44 | "'standard' or 'enhanced'; enhanced users enjoy higher rate "
45 | "limits than their standard key counterparts. Contact "
46 | "Creative Commons if you need a higher rate limit."
47 | )
48 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/ccrel.py:
--------------------------------------------------------------------------------
1 | from libxmp.consts import XMP_NS_CC, XMP_NS_XMP_Rights, XMP_NS_XMP
2 | import libxmp
3 | import io
4 | import os
5 | import uuid
6 |
7 | """
8 | Tools for embedding Creative Commons Rights Expression Language (ccREL) data
9 | into files using Extensible Metadata Platform (XMP).
10 |
11 | This implementation is specifically for embedding ccREL inside of images, but it
12 | could be extended to handle other types of content.
13 |
14 | For more information, see the ccREL W3 standard [0].
15 | [0] https://www.w3.org/Submission/ccREL/
16 | """
17 |
18 |
19 | def embed_xmp_bytes(image: io.BytesIO, work_properties):
20 | """
21 | Given a file-like `io.BytesIO` object, embed ccREL metadata inside of it.
22 | For our purposes, we assume that the file is an image.
23 |
24 | :param image: A BytesIO representation of an image.
25 |     :param work_properties: A dictionary with required keys 'license_url'
26 |     and 'attribution'; 'creator' and 'work_landing_page' are optional
27 |     (but highly recommended).
28 |     :return: An `io.BytesIO` object of the image with XMP metadata embedded.
29 | """
30 |
31 | # libxmp only works with actual file locations on the disk. To work around
32 | # this limitation, rather than embedding the metadata directly into the
33 | # `io.BytesIO` object, we have to use a temporary file and then convert it
34 | # back.
35 | # https://github.com/python-xmp-toolkit/python-xmp-toolkit/issues/46
36 | filename = '/tmp/{}'.format(uuid.uuid4())
37 | with open(filename, 'w+b') as xmp_temp:
38 | xmp_temp.write(image.getvalue())
39 | xmp_temp.flush()
40 | xmpfile = libxmp.XMPFiles(file_path=xmp_temp.name, open_forupdate=True)
41 |
42 | # Set CC rights.
43 | xmp = xmpfile.get_xmp()
44 | xmp.register_namespace(XMP_NS_CC, 'cc')
45 | xmp.set_property(XMP_NS_CC, 'license', work_properties['license_url'])
46 | if 'creator' in work_properties:
47 | if not xmp.does_property_exist(XMP_NS_CC, 'attributionName'):
48 | xmp.set_property(
49 | XMP_NS_CC, 'attributionName', work_properties['creator']
50 | )
51 | if 'work_landing_page' in work_properties:
52 | if not xmp.does_property_exist(XMP_NS_CC, 'attributionURL'):
53 | xmp.set_property(
54 | XMP_NS_CC,
55 | 'attributionURL',
56 | work_properties['work_landing_page']
57 | )
58 | xmp.register_namespace(XMP_NS_XMP, 'xmp')
59 | if 'identifier' in work_properties:
60 | if not xmp.does_property_exist(XMP_NS_XMP, 'Identifier'):
61 | xmp.set_property(
62 | XMP_NS_XMP,
63 | 'Identifier',
64 | work_properties['identifier']
65 | )
66 | # Set generic XMP rights.
67 | xmp.register_namespace(XMP_NS_XMP_Rights, 'xmpRights')
68 |     if not xmp.does_property_exist(XMP_NS_XMP_Rights, 'Marked'):
69 | xmp.set_property_bool(XMP_NS_XMP_Rights, 'Marked', True)
70 | if not xmp.does_property_exist(XMP_NS_XMP_Rights, 'UsageTerms'):
71 | usage = work_properties['attribution']
72 | xmp.set_property(XMP_NS_XMP_Rights, 'UsageTerms', usage)
73 | xmpfile.put_xmp(xmp)
74 | xmpfile.close_file()
75 |
76 | with open(filename, 'r+b') as xmpfile:
77 | file_with_xmp = io.BytesIO(xmpfile.read())
78 | os.remove(filename)
79 | return file_with_xmp
80 |
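A minimal usage sketch of `embed_xmp_bytes` (the input file, its path, and the property values are illustrative; requires `python-xmp-toolkit` and its `exempi` system dependency):

```
import io

from cccatalog.api.utils.ccrel import embed_xmp_bytes

# Illustrative input: any locally readable image file.
with open('photo.jpg', 'rb') as f:
    image = io.BytesIO(f.read())

work_properties = {
    'license_url': 'https://creativecommons.org/licenses/by/4.0/',
    'attribution': 'Photo by Jane Doe, CC BY 4.0',
    'creator': 'Jane Doe',                             # optional
    'work_landing_page': 'https://example.com/photo',  # optional
}

tagged = embed_xmp_bytes(image, work_properties)
with open('photo_with_xmp.jpg', 'wb') as f:
    f.write(tagged.getvalue())
```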
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/dead_link_mask.py:
--------------------------------------------------------------------------------
1 |
2 | from typing import List
3 | from django_redis import get_redis_connection
4 | from deepdiff import DeepHash
5 | from elasticsearch_dsl import Search
6 |
7 | # 3 hours (in seconds)
8 | DEAD_LINK_MASK_TTL = 60 * 60 * 3
9 |
10 |
11 | def get_query_hash(s: Search) -> str:
12 | """
13 |     Generate a deterministic hash of the serialized Search object using
14 |     DeepHash, so that two Search objects with the same content produce the
15 |     same hash. Pagination ('from' and 'size') is stripped before hashing.
16 |
17 | :param s: Search object to be serialized and hashed.
18 | :return: Serialized Search object hash.
19 | """
20 | serialized_search_obj = s.to_dict()
21 | serialized_search_obj.pop('from', None)
22 | serialized_search_obj.pop('size', None)
23 | deep_hash = DeepHash(serialized_search_obj)[serialized_search_obj]
24 | return deep_hash
25 |
26 |
27 | def get_query_mask(query_hash: str) -> List[int]:
28 | """
29 | Fetches an existing query mask for a given query hash
30 | or returns an empty one.
31 |
32 | :param query_hash: Unique value for a particular query.
33 | :return: Boolean mask as a list of integers (0 or 1).
34 | """
35 | redis = get_redis_connection("default")
36 | key = f'{query_hash}:dead_link_mask'
37 | return list(map(int, redis.lrange(key, 0, -1)))
38 |
39 |
40 | def save_query_mask(query_hash: str, mask: List):
41 | """
42 | Saves a query mask to redis.
43 |
44 | :param mask: Boolean mask as a list of integers (0 or 1).
45 | :param query_hash: Unique value to be used as key.
46 | """
47 | redis_pipe = get_redis_connection("default").pipeline()
48 | key = f'{query_hash}:dead_link_mask'
49 |
50 | redis_pipe.delete(key)
51 | redis_pipe.rpush(key, *mask)
52 | redis_pipe.expire(key, DEAD_LINK_MASK_TTL)
53 | redis_pipe.execute()
54 |
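To make the pagination handling concrete, a small sketch (no Elasticsearch cluster or Redis connection is needed to compute the hash):

```
from elasticsearch_dsl import Search

from cccatalog.api.utils.dead_link_mask import get_query_hash

# Two pages of the same query differ only in 'from'/'size', which
# get_query_hash strips, so both pages share one dead link mask.
page_one = Search().query('match', title='dog')[0:20]
page_two = Search().query('match', title='dog')[20:40]
assert get_query_hash(page_one) == get_query_hash(page_two)
```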
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/exceptions.py:
--------------------------------------------------------------------------------
1 | from rest_framework import status
2 | from rest_framework.response import Response
3 | """
4 | Override the presentation of ValidationErrors, which are deeply nested and
5 | difficult to parse.
6 |
7 | Note that error 500 pages are not handled here; they are generated by the
8 | production web server configuration, and not reproducible locally.
9 | """
10 |
11 |
12 | def parse_value_errors(errors):
13 | fields = ['q']
14 | messages = [errors.args[0].info['error']['root_cause'][0]['reason']]
15 | return fields, messages
16 |
17 |
18 | def parse_non_value_errors(errors):
19 | fields = [f for f in errors]
20 | messages = []
21 | for _field in errors:
22 | error = errors[_field]
23 | for e in error:
24 | messages.append(e)
25 |
26 |     # Don't return "non field errors" in deprecation exceptions. There is
27 |     # no way to recover the affected field other than parsing the error.
28 |     if fields == ['non_field_errors']:
29 |         split_error = ' '.join(messages).split()
30 |         field_idx = split_error.index('Parameter') + 1
31 |         fields = [split_error[field_idx].replace("'", '')]
32 |
33 | return fields, messages
34 |
35 |
36 | def input_error_response(errors):
37 | if isinstance(errors, ValueError):
38 | fields, messages = parse_value_errors(errors)
39 | else:
40 | fields, messages = parse_non_value_errors(errors)
41 |
42 | detail = "Invalid input given for fields."
43 | for i, _ in enumerate(fields):
44 | detail += f" '{fields[i]}' -> {messages[i]}"
45 |
46 | return Response(
47 | status=status.HTTP_400_BAD_REQUEST,
48 | data={
49 | 'error': 'InputError',
50 | 'detail': detail,
51 | 'fields': fields
52 | }
53 | )
54 |
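For reference, the 400 payload this produces has the following shape; the field and message here are hypothetical:

```
{
    "error": "InputError",
    "detail": "Invalid input given for fields. 'q' -> Failed to parse query",
    "fields": ["q"]
}
```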
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/fonts/SourceCodePro-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/fonts/SourceCodePro-Bold.ttf
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/utils/fonts/SourceSansPro-Bold.ttf
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/oauth2_helper.py:
--------------------------------------------------------------------------------
1 | import datetime as dt
2 | import logging
3 | from oauth2_provider.models import AccessToken
4 | from cccatalog.api.models import ThrottledApplication
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | def get_token_info(token: str):
10 | """
11 | Recover an OAuth2 application client ID and rate limit model from an access
12 | token.
13 |
14 | :param token: An OAuth2 access token.
15 | :return: If the token is valid, return the client ID associated with the
16 | token, rate limit model, and email verification status as a tuple; else
17 | return (None, None, None).
18 | """
19 | try:
20 | token = AccessToken.objects.get(token=token)
21 | except AccessToken.DoesNotExist:
22 | return None, None, None
23 | if token.expires >= dt.datetime.now(token.expires.tzinfo):
24 | try:
25 | application = ThrottledApplication.objects.get(accesstoken=token)
26 | client_id = str(application.client_id)
27 | rate_limit_model = application.rate_limit_model
28 | verified = application.verified
29 | except ThrottledApplication.DoesNotExist:
30 | log.warning(
31 | 'Failed to find application associated with access token.'
32 | )
33 | client_id = None
34 | rate_limit_model = None
35 | verified = None
36 | return client_id, rate_limit_model, verified
37 | else:
38 | log.warning('Rejected expired access token.')
39 | return None, None, None
40 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/scheduled_tasks.py:
--------------------------------------------------------------------------------
1 | from django_cron import CronJobBase, Schedule
2 | from django_redis import get_redis_connection
3 | from django.core.exceptions import ObjectDoesNotExist
4 | from cccatalog.api.models import Image
5 | import logging as log
6 | import time
7 | """
8 | Cron-like tasks run at a set interval. `python3 manage.py runcrons` will
9 | execute any scheduled tasks. This is intended to run on all instances of the
10 | server.
11 |
12 | Even though multiple instances of the server may be running, each job is
13 | guaranteed to execute only once: a job does not run unless it can acquire a
14 | lock in the cache (shared by all instances of cccatalog-api).
15 | """
16 | model_name_to_instance = {
17 | 'Image': Image
18 | }
19 |
20 |
21 | class SaveCachedTrafficStats(CronJobBase):
22 | """
23 | Traffic statistics (view count, API usage) are stored in Redis for fast
24 | updates and retrieval. In order to ensure durability of statistics and
25 | minimize cache memory requirements, they are intermittently replicated to
26 | the database in small batches and subsequently evicted from the cache if
27 | they exceed a certain age. Recently updated view data is replicated but not
28 | evicted.
29 |
30 | After traffic statistics have been stored in the database, they are
31 | replicated to Elasticsearch by es-syncer and used to compute trending views.
32 | """
33 | RUN_EVERY_MINS = 20
34 | schedule = Schedule(run_every_mins=RUN_EVERY_MINS)
35 | # Number of failures before notification is sent
36 | MIN_NUM_FAILURES = 5
37 | code = 'cccatalog.api.utils.scheduled_tasks.SaveCachedTrafficStats'
38 |
39 | def do(self):
40 | log.info('Starting view count persistence job')
41 | redis = get_redis_connection('traffic_stats')
42 | one_day_ago = time.time() - 60 * 60 * 24
43 | last_save_time = time.time() - (self.RUN_EVERY_MINS * 60)
44 | old_view_data = redis.zrangebyscore(
45 | 'model-last-accessed', '-inf', one_day_ago
46 | )
47 | recent_view_data = redis.zrangebyscore(
48 | 'model-last-accessed', last_save_time, 'inf'
49 | )
50 | self._save_views_to_db(old_view_data, evict_from_cache=True)
51 | redis.zremrangebyscore('model-last-accessed', '-inf', one_day_ago)
52 | self._save_views_to_db(recent_view_data)
53 | log.info('Saved cached traffic stats')
54 |
55 | @staticmethod
56 | def _save_views_to_db(view_keys, evict_from_cache=False):
57 | if not view_keys:
58 | return
59 | redis = get_redis_connection('traffic_stats')
60 | view_keys = [x.decode('utf-8') for x in view_keys]
61 | for obj in view_keys:
62 | model_name, model_id = obj.split(':')
63 | if model_name in model_name_to_instance:
64 | model = model_name_to_instance[model_name]
65 | try:
66 | instance = model.objects.get(id=model_id)
67 | instance.view_count = redis.get(obj)
68 | instance.save(update_fields=['view_count'])
69 | except ObjectDoesNotExist:
70 | log.warning('Tried to save views of non-existent instance.')
71 | else:
72 | log.warning(
73 | 'Tried to persist views of non-existent model ' + model_name
74 | )
75 | if evict_from_cache:
76 | redis.delete(*view_keys)
77 | log.info('Saved ' + str(view_keys))
78 |
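A minimal sketch of registering the job with django-cron, assuming the standard `CRON_CLASSES` setting:

```
# In cccatalog/settings.py: register the job so that
# `python3 manage.py runcrons` (invoked from system cron) picks it up.
CRON_CLASSES = [
    'cccatalog.api.utils.scheduled_tasks.SaveCachedTrafficStats',
]
```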
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/throttle.py:
--------------------------------------------------------------------------------
1 | from rest_framework.throttling import SimpleRateThrottle
2 | import logging
3 | from cccatalog.api.utils.oauth2_helper import get_token_info
4 | from django_redis import get_redis_connection
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | def _from_internal_network(ip):
10 | redis = get_redis_connection('default')
11 | return redis.sismember('ip-whitelist', ip)
12 |
13 |
14 | class AnonRateThrottle(SimpleRateThrottle):
15 | """
16 |     Limits the rate of API calls that may be made by anonymous users.
17 |
18 | The IP address of the request will be used as the unique cache key.
19 | """
20 | scope = 'anon'
21 |
22 | def get_cache_key(self, request, view):
23 | if _from_internal_network(self.get_ident(request)):
24 | return None
25 | # Do not throttle requests with a valid access token.
26 | if request.auth:
27 | client_id, _, verified = get_token_info(str(request.auth))
28 | if client_id and verified:
29 | return None
30 |
31 | return self.cache_format % {
32 | 'scope': self.scope,
33 | 'ident': self.get_ident(request)
34 | }
35 |
36 |
37 | class PostRequestThrottler(AnonRateThrottle):
38 | rate = '30/day'
39 |
40 |
41 | class BurstRateThrottle(AnonRateThrottle):
42 | scope = 'anon_burst'
43 |
44 |
45 | class SustainedRateThrottle(AnonRateThrottle):
46 | scope = 'anon_sustained'
47 |
48 |
49 | class TenPerDay(AnonRateThrottle):
50 | rate = '10/day'
51 |
52 |
53 | class OneThousandPerMinute(AnonRateThrottle):
54 | rate = '1000/min'
55 |
56 |
57 | class OnePerSecond(AnonRateThrottle):
58 | rate = '1/second'
59 |
60 |
61 | class OAuth2IdThrottleRate(SimpleRateThrottle):
62 | """
63 |     Limits the rate of API calls that may be made by a given user's OAuth2
64 | client ID. Can be configured to apply to either standard or enhanced
65 | API keys.
66 | """
67 | scope = 'oauth2_client_credentials'
68 | applies_to_rate_limit_model = 'standard'
69 |
70 | def get_cache_key(self, request, view):
71 | if _from_internal_network(self.get_ident(request)):
72 | return None
73 | # Find the client ID associated with the access token.
74 | auth = str(request.auth)
75 | client_id, rate_limit_model, verified = get_token_info(auth)
76 | if client_id and rate_limit_model == self.applies_to_rate_limit_model:
77 | ident = client_id
78 | else:
79 |             # Don't throttle invalid tokens; leave that to the anonymous
80 |             # throttlers. Other rate limit models have their own throttles.
81 | return None
82 |
83 | return self.cache_format % {
84 | 'scope': self.scope,
85 | 'ident': ident
86 | }
87 |
88 |
89 | class OAuth2IdThrottleSustainedRate(OAuth2IdThrottleRate):
90 | applies_to_rate_limit_model = 'standard'
91 | scope = 'oauth2_client_credentials_sustained'
92 |
93 |
94 | class OAuth2IdThrottleBurstRate(OAuth2IdThrottleRate):
95 | applies_to_rate_limit_model = 'standard'
96 | scope = 'oauth2_client_credentials_burst'
97 |
98 |
99 | class EnhancedOAuth2IdThrottleSustainedRate(OAuth2IdThrottleRate):
100 | applies_to_rate_limit_model = 'enhanced'
101 | scope = 'enhanced_oauth2_client_credentials_sustained'
102 |
103 |
104 | class EnhancedOAuth2IdThrottleBurstRate(OAuth2IdThrottleRate):
105 | applies_to_rate_limit_model = 'enhanced'
106 | scope = 'enhanced_oauth2_client_credentials_burst'
107 |
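Scopes that do not set an explicit `rate` are bound to concrete limits through Django REST Framework settings; a sketch with illustrative values (not the project's real limits):

```
REST_FRAMEWORK = {
    'DEFAULT_THROTTLE_RATES': {
        # Anonymous traffic, keyed by IP address.
        'anon_burst': '60/min',
        'anon_sustained': '5000/day',
        # Standard API keys, keyed by OAuth2 client ID.
        'oauth2_client_credentials_burst': '100/min',
        'oauth2_client_credentials_sustained': '10000/day',
        # Enhanced keys get higher limits.
        'enhanced_oauth2_client_credentials_burst': '200/min',
        'enhanced_oauth2_client_credentials_sustained': '20000/day',
    }
}
```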
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/utils/validate_images.py:
--------------------------------------------------------------------------------
1 | import time
2 | import grequests
3 | import logging
4 | from django_redis import get_redis_connection
5 | from cccatalog.api.utils.dead_link_mask import get_query_mask, save_query_mask
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | def validate_images(query_hash, start_slice, results, image_urls):
11 | """
12 | Make sure images exist before we display them. Treat redirects as broken
13 | links since 99% of the time the redirect leads to a generic "not found"
14 | placeholder.
15 |
16 | Results are cached in redis and shared amongst all API servers in the
17 | cluster.
18 | """
19 | if not image_urls:
20 | return
21 | start_time = time.time()
22 | # Pull matching images from the cache.
23 | redis = get_redis_connection("default")
24 | cache_prefix = 'valid:'
25 | cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
26 | cached_statuses = [
27 | int(b.decode('utf-8'))
28 | if b is not None else None for b in cached_statuses
29 | ]
30 | # Anything that isn't in the cache needs to be validated via HEAD request.
31 | to_verify = {}
32 | for idx, url in enumerate(image_urls):
33 | if cached_statuses[idx] is None:
34 | to_verify[url] = idx
35 | reqs = (
36 | grequests.head(u, allow_redirects=False, timeout=2, verify=False)
37 | for u in to_verify.keys()
38 | )
39 | verified = grequests.map(reqs, exception_handler=_validation_failure)
40 | # Cache newly verified image statuses.
41 | to_cache = {}
42 | for idx, url in enumerate(to_verify.keys()):
43 | cache_key = cache_prefix + url
44 |         if verified[idx]:
45 |             status = verified[idx].status_code
46 |         else:
47 |             # Response didn't arrive in time. Try again later.
48 |             status = -1
49 | to_cache[cache_key] = status
50 |
51 | thirty_minutes = 60 * 30
52 | twenty_four_hours_seconds = 60 * 60 * 24
53 | pipe = redis.pipeline()
54 | if len(to_cache) > 0:
55 | pipe.mset(to_cache)
56 | for key, status in to_cache.items():
57 | # Cache successful links for a day, and broken links for 120 days.
58 | if status == 200:
59 | pipe.expire(key, twenty_four_hours_seconds)
60 | elif status == -1:
61 | # Content provider failed to respond; try again in a short interval
62 | pipe.expire(key, thirty_minutes)
63 | else:
64 | pipe.expire(key, twenty_four_hours_seconds * 120)
65 | pipe.execute()
66 |
67 | # Merge newly verified results with cached statuses
68 | for idx, url in enumerate(to_verify):
69 | cache_idx = to_verify[url]
70 | if verified[idx] is not None:
71 | cached_statuses[cache_idx] = verified[idx].status_code
72 | else:
73 | cached_statuses[cache_idx] = -1
74 |
75 | # Create a new dead link mask
76 | new_mask = [1] * len(results)
77 | # Delete broken images from the search results response.
78 | for idx, _ in enumerate(cached_statuses):
79 | del_idx = len(cached_statuses) - idx - 1
80 | status = cached_statuses[del_idx]
81 | if status == 429 or status == 403:
82 | log.warning(
83 | 'Image validation failed due to rate limiting or blocking. '
84 |                 'Affected URL: {}'.format(image_urls[del_idx])
85 | )
86 | elif status != 200:
87 | log.info(
88 | 'Deleting broken image with ID {} from results.'
89 | .format(results[del_idx]['identifier'])
90 | )
91 | del results[del_idx]
92 | new_mask[del_idx] = 0
93 |
94 | # Merge and cache the new mask
95 | mask = get_query_mask(query_hash)
96 | if mask:
97 | new_mask = mask[:start_slice] + new_mask
98 | save_query_mask(query_hash, new_mask)
99 |
100 | end_time = time.time()
101 |     log.info('Validated images in {}s'.format(end_time - start_time))
102 |
103 |
104 | def _validation_failure(request, exception):
105 | log.warning('Failed to validate image! Reason: {}'.format(exception))
106 |
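A toy illustration of the mask bookkeeping above, with invented statuses (a mask entry of 1 keeps the result at that position; 0 marks it dead):

```
results = [{'identifier': 'a'}, {'identifier': 'b'}, {'identifier': 'c'}]
statuses = [200, 404, 200]  # the middle link is broken

mask = [1 if status == 200 else 0 for status in statuses]
live = [r for r, keep in zip(results, mask) if keep]
assert mask == [1, 0, 1]
assert [r['identifier'] for r in live] == ['a', 'c']
```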
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/views/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/cccatalog/api/views/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/api/views/link_views.py:
--------------------------------------------------------------------------------
1 | from django.http import HttpResponsePermanentRedirect
2 | from cccatalog.api.models import ShortenedLink
3 | from rest_framework.generics import GenericAPIView
4 | from rest_framework.views import APIView
5 | from rest_framework.decorators import throttle_classes
6 | from cccatalog.api.utils.throttle import PostRequestThrottler
7 | from cccatalog.api.serializers.link_serializers import ShortenedLinkSerializer
8 |
9 | from cccatalog import settings
10 | from rest_framework.response import Response
11 | from rest_framework import serializers
12 | from drf_yasg.utils import swagger_auto_schema
13 |
14 |
15 | class _LinkCreatedResponse(serializers.Serializer):
16 | shortened_url = serializers.URLField()
17 |
18 |
19 | class CreateShortenedLink(GenericAPIView):
20 | serializer_class = ShortenedLinkSerializer
21 | swagger_schema = None
22 |
23 | @throttle_classes([PostRequestThrottler])
24 | def post(self, request, format=None):
25 | """ Create a shortened URL. Only domains within the CC Catalog platform
26 | will be accepted. The `full_url` must be a whitelisted endpoint."""
27 | full_url = request.data['full_url']
28 | serialized = ShortenedLinkSerializer(data={'full_url': full_url})
29 | if not serialized.is_valid():
30 | return Response(
31 | status=400,
32 | data=serialized.errors
33 | )
34 |
35 | try:
36 | existing_path = ShortenedLink \
37 | .objects \
38 | .get(full_url=full_url) \
39 | .shortened_path
40 | shortened_url = settings.ROOT_SHORTENING_URL + '/' + existing_path
41 | except ShortenedLink.DoesNotExist:
42 | shortened_path = serialized.save()
43 | shortened_url = settings.ROOT_SHORTENING_URL + '/' + shortened_path
44 |
45 | return Response(
46 | status=200,
47 | data={
48 | 'shortened_url': shortened_url
49 | }
50 | )
51 |
52 |
53 | class ResolveShortenedLink(APIView):
54 | swagger_schema = None
55 |
56 | def get(self, request, path, format=None):
57 | """
58 | Given a shortened URL path, such as 'zb3k0', resolve the full URL
59 | and redirect the caller.
60 | """
61 | try:
62 | link_instance = ShortenedLink.objects.get(shortened_path=path)
63 | except ShortenedLink.DoesNotExist:
64 | return Response(
65 | status=404,
66 | data='Not Found'
67 | )
68 | full_url = link_instance.full_url
69 | return HttpResponsePermanentRedirect(full_url)
70 |
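A hypothetical round trip against a local development server (the host, port, and example list URL are assumptions; the `/link` route matches the load-testing script elsewhere in this repository):

```
import requests

# Create a short link; only whitelisted CC Catalog URLs are accepted.
created = requests.post(
    'http://localhost:8000/link',
    data={'full_url': 'http://localhost:8000/list/some-list-slug'},
)
print(created.json()['shortened_url'])
# Resolving the shortened path later replies with a 301 permanent
# redirect to the original URL (see ResolveShortenedLink above).
```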
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/custom_auto_schema.py:
--------------------------------------------------------------------------------
1 | from drf_yasg import openapi
2 | from drf_yasg.utils import (
3 | filter_none, force_real_str, force_serializer_instance, get_consumes,
4 | get_produces, guess_response_status, merge_params, no_body,
5 | param_list_to_odict
6 | )
7 | from drf_yasg.inspectors import SwaggerAutoSchema
8 |
9 |
10 | class CustomAutoSchema(SwaggerAutoSchema):
11 |
12 | def get_operation(self, operation_keys=None):
13 | operation_keys = operation_keys or self.operation_keys
14 |
15 | consumes = self.get_consumes()
16 | produces = self.get_produces()
17 |
18 | body = self.get_request_body_parameters(consumes)
19 | query = self.get_query_parameters()
20 | parameters = body + query
21 | parameters = filter_none(parameters)
22 | parameters = self.add_manual_parameters(parameters)
23 |
24 | operation_id = self.get_operation_id(operation_keys)
25 | summary, description = self.get_summary_and_description()
26 | security = self.get_security()
27 | assert security is None or isinstance(security, list), \
28 | "security must be a list of security requirement objects"
29 | deprecated = self.is_deprecated()
30 | tags = self.get_tags(operation_keys)
31 |
32 | responses = self.get_responses()
33 |
34 | return openapi.Operation(
35 | operation_id=operation_id,
36 | description=force_real_str(description),
37 | summary=force_real_str(summary),
38 | responses=responses,
39 | parameters=parameters,
40 | consumes=consumes,
41 | produces=produces,
42 | tags=tags,
43 | security=security,
44 | deprecated=deprecated,
45 | **{'x-code-samples': self.overrides.get('code_examples')}
46 | )
47 |
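The `x-code-samples` vendor extension above is populated through `swagger_auto_schema`'s extra overrides; a minimal sketch on a hypothetical function-based view:

```
from drf_yasg.utils import swagger_auto_schema
from rest_framework.decorators import api_view
from rest_framework.response import Response


@swagger_auto_schema(
    method='get',
    # Extra keyword arguments land in self.overrides, where
    # CustomAutoSchema reads 'code_examples'.
    code_examples=[
        {'lang': 'Shell',
         'source': "curl 'https://api.example.com/image/search?q=cats'"},
    ],
)
@api_view(['GET'])
def search_docs_example(request):
    return Response({})
```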
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/scripts/api_load_testing/locustfile.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import uuid
4 | from locust import HttpLocust, TaskSet, task
5 |
6 |
7 | class BrowseResults(TaskSet):
8 | @task(30)
9 | def view_image(self):
10 | if self.parent.results:
11 | image_id = random.choice(self.parent.results)['id']
12 | self.client.get("/image/{}".format(image_id), name="/image/[id]")
13 |
14 | @task(10)
15 | def favorite_images(self):
16 |
17 | if self.parent.results:
18 | list_length = random.choice([2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 9])
19 | selected_images = self.parent.results[0:list_length]
20 | ids = [image['id'] for image in selected_images]
21 | self.client.post("/list",
22 | {"title": "Load test" + str(ids), "images": ids})
23 |
24 | @task(10)
25 | def shorten_link(self):
26 | _unique = str(uuid.uuid4())
27 | image_link = "http://api-dev.creativecommons.engineering/list/{}"\
28 | .format(_unique)
29 | self.client.post("/link", {"full_url": image_link})
30 |
31 |
32 | class UserBehavior(TaskSet):
33 | tasks = {BrowseResults: 8}
34 |
35 | def __init__(self, parent):
36 | self.results = None
37 | self.query = None
38 | with open("./common_english_words.txt", "r") as f:
39 | self.common_words = f.read().splitlines()
40 | super().__init__(parent)
41 |
42 | @task(1000)
43 | def search(self):
44 | query_length = random.choice([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 5])
45 | query = [random.choice(self.common_words) for _ in range(query_length)]
46 | query = ','.join(query)
47 | self.query = query
48 | response = self.client.get(
49 | "/image/search?q={}".format(query),
50 | name="/image/search?q=[keywords]"
51 | )
52 | self.results = json.loads(response.content.decode("utf-8"))['results']
53 |
54 |
55 | class SearchUser(HttpLocust):
56 | task_set = UserBehavior
57 | min_wait = 3000
58 | max_wait = 9000
59 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/scripts/migration/migrate_lists.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import requests
3 | import json
4 | import logging as log
5 | """
6 | Tools for migrating legacy lists from CC Search Beta to the CC Catalog platform.
7 | """
8 |
9 |
10 | def import_lists_to_catalog(parsed_lists):
11 | success = 0
12 | errors = []
13 |     for _list in parsed_lists.values():
14 |         # Each value holds the list's title, owner email, and image IDs.
15 | payload = {
16 | 'title': _list['title'],
17 | 'images': _list['images']
18 | }
19 | response = requests.post(
20 | 'http://api.creativecommons.engineering/list',
21 | data=payload
22 | )
23 | if 300 > response.status_code >= 200:
24 | json_response = json.loads(response.text)
25 | new_url = json_response['url']
26 | success += 1
27 | print(_list['email'], new_url, _list['title'], sep='||')
28 | else:
29 | # A handful of lists from the legacy application are empty, which
30 | # isn't accepted in the new API. Skip over them and log it.
31 | errors.append((_list['title'], response.text))
32 | continue
33 | log.info('Migrated {} lists successfully'.format(success))
34 | if errors:
35 | log.error("The following errors occurred:")
36 | for error in errors:
37 | log.error(error)
38 |
39 |
40 | if __name__ == '__main__':
41 | with open('csvs/prod/lists.csv', 'r') as lists, \
42 | open('csvs/prod/list_images.csv', 'r') as list_images, \
43 | open('csvs/prod/users.csv', 'r') as users:
44 | lists = csv.DictReader(lists)
45 | list_images = csv.DictReader(list_images)
46 | users = csv.DictReader(users)
47 |
48 | # Compile all of the data required to migrate the lists and find the
49 | # emails of their owners.
50 | users_dict = {row['id']: row['email'] for row in users}
51 | lists_dict = {}
52 | for row in lists:
53 | if row['owner_id'] == '':
54 | continue
55 | lists_dict[row['id']] = {
56 | 'email': users_dict[row['owner_id']],
57 | 'title': row['title'],
58 | 'images': []
59 | }
60 | for row in list_images:
61 | if row['list_id'] in lists_dict:
62 | lists_dict[row['list_id']]['images'].append(row['image_id'])
63 |
64 | # Use the API to migrate the lists.
65 | import_lists_to_catalog(lists_dict)
66 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/scripts/thumbnail_load_test/locustfile.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import gevent.queue
3 | import gevent.pool
4 | import grequests
5 | import statistics
6 | import json
7 | import datetime
8 | from locust import HttpLocust, TaskSet, task, between
9 | from collections import defaultdict
10 | """
11 | Swarm the API server with async requests for thumbnails. Requires `url_dump.csv`
12 | in the same directory as the script. It is intentionally omitted from source
13 | control.
14 |
15 | The format of the csv is:
16 | url,provider
17 | https://example.com,exampleprovider
18 | http://secondexample.com,secondprovider
19 | . . .
20 |
21 | To prepare the server for testing:
22 | - Ensure that the hardware allocation matches production.
23 | - Disable referer origin limiting in the imageproxy server.
24 | - Empty the S3 thumbnail cache bucket.
25 |
26 | To run the test:
27 | `locust`
28 | Open the web interface and start a test with the desired number of workers.
29 | Watch the console for updates on the progress of the test and the number of
30 | successful vs failed thumbnails.
31 |
32 | Optionally rerun the test after the cache has been warmed up.
33 | """
34 | PROXY_URL = "https://api-dev.creativecommons.engineering/t/600/"
35 |
36 | url_queue = gevent.queue.Queue()
37 | provider_counts = defaultdict(int)
38 | url_provider = {}
39 | thumb_statuses = defaultdict(int)
40 | statuses_by_provider = {}
41 | response_times = []
42 |
43 |
44 | with open('url_dump.csv') as urls_csv:
45 | reader = csv.reader(urls_csv)
46 | for row in reader:
47 | if row[0] == 'url':
48 | continue
49 | url = row[0]
50 | provider = row[1]
51 | url_queue.put((url, provider))
52 | url_provider[url] = provider
53 | provider_counts[provider] += 1
54 |
55 |
56 | def print_current_stats():
57 | """
58 | Re-compute and print current thumbnail statistics.
59 | """
60 | mean_response_time = statistics.mean(response_times)
61 | failed = 0
62 | successful = 0
63 | for status in thumb_statuses:
64 | num_statuses = thumb_statuses[status]
65 | if status >= 300 and status != 404:
66 | failed += num_statuses
67 | else:
68 | successful += num_statuses
69 |
70 | out = {
71 | 'timestamp': str(datetime.datetime.now()),
72 | 'mean_response_time': mean_response_time,
73 | 'successful': successful,
74 | 'failed': failed,
75 | 'statuses': thumb_statuses,
76 | 'provider_statuses': statuses_by_provider
77 | }
78 | print(json.dumps(out))
79 |
80 |
81 | def record_stats(responses, providers):
82 | for idx, resp in enumerate(responses):
83 | response_times.append(resp.elapsed.total_seconds())
84 | thumb_statuses[resp.status_code] += 1
85 | provider = providers[idx]
86 | if provider not in statuses_by_provider:
87 | statuses_by_provider[provider] = defaultdict(int)
88 | statuses_by_provider[provider][resp.status_code] += 1
89 |
90 |
91 | class ThumbTask(TaskSet):
92 | @task
93 | def load_thumbs(self):
94 | reqs = []
95 | providers = []
96 | for _ in range(20):
97 | base_url, provider = url_queue.get()
98 | providers.append(provider)
99 | proxied_url = f'{PROXY_URL}{base_url}'
100 | reqs.append(grequests.get(proxied_url))
101 | thumb_responses = grequests.map(reqs)
102 | record_stats(thumb_responses, providers)
103 | print_current_stats()
104 |
105 |
106 | class ThumbLocust(HttpLocust):
107 | """
108 | Load a page's worth of thumbnails every 3 to 6 seconds.
109 | """
110 | wait_time = between(3, 6)
111 | task_set = ThumbTask
112 |
--------------------------------------------------------------------------------
/cccatalog-api/cccatalog/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for cccatalog project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.0/howto/deployment/wsgi/
8 | """
9 | from gevent import monkey; monkey.patch_all()
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 | from wsgi_basic_auth import BasicAuth
14 |
15 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cccatalog.settings")
16 |
17 | application = get_wsgi_application()
18 | application = BasicAuth(application)
19 |
--------------------------------------------------------------------------------
/cccatalog-api/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | from gevent import monkey
5 | monkey.patch_all()
6 |
7 | if __name__ == "__main__":
8 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cccatalog.settings")
9 | try:
10 | from django.core.management import execute_from_command_line
11 | except ImportError as exc:
12 | raise ImportError(
13 | "Couldn't import Django. Are you sure it's installed and "
14 | "available on your PYTHONPATH environment variable? Did you "
15 | "forget to activate a virtual environment?"
16 | ) from exc
17 | execute_from_command_line(sys.argv)
18 |
--------------------------------------------------------------------------------
/cccatalog-api/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | DJANGO_SETTINGS_MODULE = cccatalog.settings
3 |
--------------------------------------------------------------------------------
/cccatalog-api/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | while [[ "$(curl --insecure -s -o /dev/null -w '%{http_code}' http://es:9200/)" != "200" ]]
6 | do
7 | echo "Waiting for Elasticsearch connection..."
8 | sleep 2
9 | done
10 |
11 | exec "$@"
12 |
--------------------------------------------------------------------------------
/cccatalog-api/test/README:
--------------------------------------------------------------------------------
1 | 1. Set environment variable INTEGRATION_TEST_URL to the instance you would like to test. Defaults to localhost.
2 | 2. Run `pytest -s`
3 |
--------------------------------------------------------------------------------
/cccatalog-api/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/cccatalog-api/test/__init__.py
--------------------------------------------------------------------------------
/cccatalog-api/test/api_live_search_qa.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 |
4 | """
5 | Tests to run against a live instance of CC Search with a significant (10M+)
6 | number of records. Quality of search rankings can be affected by the number of
7 | documents in the search index, so toy examples with five or six documents
8 | do not accurately model relevance at scale.
9 | """
10 |
11 | API_URL = 'https://api-dev.creativecommons.engineering'
12 |
13 |
14 | def _phrase_in_tags(tags, term):
15 | for tag in tags:
16 | if 'name' in tag:
17 | if tag['name'] == term:
18 | return True
19 | return False
20 |
21 |
22 | def _phrase_in_title(title, term):
23 | return term in title
24 |
25 |
26 | def test_phrase_relevance():
27 | """
28 | If I search for "home office", the top results ought to have the phrase
29 | 'home office' in the tags or title.
30 | """
31 | search_term = 'home office'
32 | response = requests.get(
33 | API_URL + '/image/search?q={}'.format(search_term),
34 | verify=False
35 | )
36 | assert response.status_code == 200
37 | parsed = json.loads(response.text)
38 | first_result = parsed['results'][0]
39 | assert (
40 | _phrase_in_tags(first_result['tags'], search_term) or
41 | _phrase_in_title(first_result['title'], search_term)
42 | )
43 |
--------------------------------------------------------------------------------
/cccatalog-api/test/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Local environments don't have valid certificates; suppress this warning.
3 | export PYTHONWARNINGS="ignore:Unverified HTTPS request"
4 | export INTEGRATION_TEST_URL="http://localhost:8000"
5 | DJANGO_SETTINGS_MODULE='cccatalog.settings' PYTHONPATH=. DJANGO_SECRET_KEY='ny#b__$f6ry4wy8oxre97&-68u_0lk3gw(z=d40_dxey3zw0v1' DJANGO_DATABASE_NAME='openledger' DJANGO_DATABASE_USER='deploy' DJANGO_DATABASE_PASSWORD='deploy' DJANGO_DATABASE_HOST='localhost' REDIS_HOST='localhost' pytest -s --disable-pytest-warnings test/v1_integration_test.py
6 | succeeded=$?
7 | if [ $succeeded != 0 ]; then
8 | echo 'Tests failed. Full system logs: '
9 | docker-compose logs
10 | fi
11 | exit $succeeded
12 |
--------------------------------------------------------------------------------
/cccatalog-api/test/search_qa_test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pprint
3 | import json
4 | import pytest
5 | from enum import Enum
6 | from .api_live_integration_test import API_URL
7 |
8 | """
9 | Perform some basic tests to ensure that search rankings work as anticipated.
10 | """
11 |
12 |
13 | class QAScores(Enum):
14 | TARGET = 1
15 | LESS_RELEVANT = 2
16 | NOT_RELEVANT = 3
17 |
18 |
19 | @pytest.mark.skip(reason="This test is nondeterministic")
20 | def test_phrase_relevance():
21 | res = requests.get(
22 | "{}/image/search?q=home office&filter_dead=false&qa=true"
23 | .format(API_URL)
24 | )
25 | parsed = json.loads(res.text)
26 | pprint.pprint(parsed)
27 | assert int(parsed['results'][0]['id']) == QAScores.TARGET.value
28 | assert int(parsed['results'][1]['id']) < QAScores.NOT_RELEVANT.value
29 | assert int(parsed['results'][-1]['id']) != QAScores.NOT_RELEVANT.value
30 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | db:
4 | image: postgres:10.3-alpine
5 | ports:
6 | - "5432:5432"
7 | environment:
8 | POSTGRES_DB: "openledger"
9 | POSTGRES_USER: "deploy"
10 | POSTGRES_PASSWORD: "deploy"
11 | POSTGRES_HOST: "0.0.0.0"
12 | healthcheck:
13 | test: "pg_isready -U deploy -d openledger"
14 |
15 | thumbs:
16 | image: willnorris/imageproxy
17 | ports:
18 | - "8222:8222"
19 | command: ["-addr", "0.0.0.0:8222"]
20 |
21 | upstream_db:
22 | image: postgres:10.3-alpine
23 | ports:
24 | - "5433:5432"
25 | environment:
26 | POSTGRES_DB: "openledger"
27 | POSTGRES_USER: "deploy"
28 | POSTGRES_PASSWORD: "deploy"
29 | POSTGRES_HOST: "0.0.0.0"
30 | healthcheck:
31 | test: "pg_isready -U deploy -d openledger"
32 |
33 | es:
34 | image: docker.elastic.co/elasticsearch/elasticsearch:7.1.0
35 | ports:
36 | - "9200:9200"
37 | environment:
38 | # disable XPack
39 | # https://www.elastic.co/guide/en/elasticsearch/reference/5.3/docker.html#_security_note
40 | - xpack.security.enabled=false
41 | - discovery.type=single-node
42 | healthcheck:
43 | test: ["CMD-SHELL", "curl -si -XGET 'localhost:9200/_cluster/health?pretty' | grep -qE 'yellow|green'"]
44 | interval: 10s
45 | timeout: 60s
46 | retries: 10
47 | ulimits:
48 | nofile:
49 | soft: 65536
50 | hard: 65536
51 |
52 | web:
53 | build: ./cccatalog-api/
54 | image: cccatalog_api
55 | command: python manage.py runserver 0.0.0.0:8000
56 | container_name: cccatalog-api_web_1
57 | volumes:
58 | - ./cccatalog-api:/cccatalog-api
59 | ports:
60 | - "8000:8000"
61 | - "4444:4444"
62 | depends_on:
63 | - db
64 | - es
65 | environment:
66 | - DJANGO_DATABASE_NAME=openledger
67 | - DJANGO_DATABASE_USER=deploy
68 | - DJANGO_DATABASE_PASSWORD=deploy
69 | - DJANGO_DATABASE_HOST=db
70 | - UPSTREAM_DATABASE_HOST=upstream_db
71 | - PYTHONUNBUFFERED=0
72 | - DJANGO_DEBUG_ENABLED=True
73 | - ELASTICSEARCH_URL=es
74 | - ELASTICSEARCH_PORT=9200
75 | - DISABLE_GLOBAL_THROTTLING=True
76 | - ROOT_SHORTENING_URL=localhost:8000
77 | - THUMBNAIL_PROXY_URL=http://thumbs:8222
78 | - DJANGO_SECRET_KEY=ny#b__$$f6ry4wy8oxre97&-68u_0lk3gw(z=d40_dxey3zw0v1
79 | - AWS_SECRET_ACCESS_KEY
80 | - AWS_ACCESS_KEY_ID
81 | stdin_open: true
82 | tty: true
83 |
84 | cache:
85 | image: redis:4.0.10
86 | container_name: cccatalog-api_cache_1
87 | ports:
88 | - "6379:6379"
89 |
90 | ingestion-server:
91 | build: ./ingestion_server/
92 | command: bash -c 'sleep 20 && supervisord -c config/supervisord.conf'
93 | ports:
94 | - "8001:8001"
95 | depends_on:
96 | - db
97 | - es
98 | - indexer-worker
99 | volumes:
100 | - ./ingestion_server:/ingestion-server
101 | environment:
102 | PYTHONUNBUFFERED: "0"
103 | ELASTICSEARCH_URL: 'es'
104 | ELASTICSEARCH_PORT: "9200"
105 | DATABASE_HOST: 'db'
106 | DATABASE_USER: 'deploy'
107 | DATABASE_PASSWORD: 'deploy'
108 | DATABASE_NAME: 'openledger'
109 | DATABASE_PORT: '5432'
110 | UPSTREAM_DB_HOST: 'upstream_db'
111 | UPSTREAM_DB_PORT: 5432
112 | DB_BUFFER_SIZE: '100000'
113 | COPY_TABLES: 'image'
114 | SYNCER_POLL_INTERVAL: '60'
115 | stdin_open: true
116 | tty: true
117 |
118 | indexer-worker:
119 | build:
120 | context: ./ingestion_server/
121 | dockerfile: Dockerfile-worker
122 | container_name: indexer-worker
123 | ports:
124 | - "8002:8002"
125 | depends_on:
126 | - db
127 | - es
128 | volumes:
129 | - ./ingestion_server:/ingestion-server
130 | environment:
131 | PYTHONUNBUFFERED: "0"
132 | ELASTICSEARCH_URL: 'es'
133 | ELASTICSEARCH_PORT: "9200"
134 | DATABASE_HOST: 'db'
135 | DATABASE_USER: 'deploy'
136 | DATABASE_PASSWORD: 'deploy'
137 | DATABASE_NAME: 'openledger'
138 | DATABASE_PORT: '5432'
139 | UPSTREAM_DB_HOST: 'upstream_db'
140 | UPSTREAM_DB_PORT: 5432
141 | DB_BUFFER_SIZE: '100000'
142 | COPY_TABLES: 'image'
143 | SYNCER_POLL_INTERVAL: '60'
144 | stdin_open: true
145 | tty: true
146 |
147 | analytics:
148 | build: ./analytics/
149 | image: analytics
150 | container_name: cccatalog-api_analytics_1
151 | ports:
152 | - "8090:8090"
153 | environment:
154 | DATABASE_CONN: 'postgres+psycopg2://deploy:deploy@db/openledger'
155 |
--------------------------------------------------------------------------------
/ingestion_server/.dockerignore:
--------------------------------------------------------------------------------
1 | venv
2 | venv2
3 | es-venv
4 |
--------------------------------------------------------------------------------
/ingestion_server/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | ENV PYTHONUNBUFFERED 1
4 |
5 | RUN groupadd --system supervisord && useradd --system --gid supervisord supervisord
6 |
7 | RUN apt-get update \
8 | && apt-get install -y supervisor \
9 | && mkdir -p /var/log/supervisord/ \
10 | && chown -R supervisord:supervisord /var/log/supervisord
11 |
12 | # Install Python dependency management tools
13 | RUN pip install --upgrade pip \
14 | && pip install --upgrade setuptools \
15 | && pip install --upgrade pipenv
16 |
17 | # Copy all files into the container
18 | COPY . /ingestion_server/
19 | WORKDIR /ingestion_server
20 | RUN chown -R supervisord:supervisord /ingestion_server
21 | ENV PYTHONPATH=$PYTHONPATH:/ingestion_server/
22 |
23 | # Install the dependencies system-wide
24 | # TODO: Use build args to avoid installing dev dependencies in production
25 | RUN pipenv install --deploy --system --dev
26 | USER supervisord
27 | EXPOSE 8001
28 | CMD ["supervisord", "-c", "/ingestion_server/config/supervisord.conf"]
29 |
--------------------------------------------------------------------------------
/ingestion_server/Dockerfile-worker:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | ENV PYTHONUNBUFFERED 1
4 |
5 | # Install Python dependency management tools
6 | RUN pip install --upgrade pip \
7 | && pip install --upgrade setuptools \
8 | && pip install --upgrade pipenv
9 |
10 | # Copy all files into the container
11 | COPY . /ingestion_server/
12 | WORKDIR /ingestion_server
13 | ENV PYTHONPATH=$PYTHONPATH:/ingestion_server/
14 |
15 | RUN pipenv install --deploy --system --dev
16 | EXPOSE 8002
17 | CMD gunicorn indexer_worker:api -b 0.0.0.0:8002 --reload --access-logfile '-' --error-logfile '-' --chdir ./ingestion_server/
18 |
--------------------------------------------------------------------------------
/ingestion_server/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 | remote-pdb = "*"
8 | ipython = "*"
9 | pipdeptree = "*"
10 | pycodestyle = "*"
11 |
12 | [packages]
13 | aws-requests-auth = "*"
14 | bottle = "*"
15 | elasticsearch-dsl = "==7.0.0"
16 | falcon = "*"
17 | gunicorn = "*"
18 | psycopg2-binary = "*"
19 | PyYAML = "*"
20 | boto3 = "*"
21 | filelock = "*"
22 | pytest = "*"
23 | tldextract = "*"
24 |
--------------------------------------------------------------------------------
/ingestion_server/README.md:
--------------------------------------------------------------------------------
1 | # Ingestion Server
2 |
3 | ## Introduction
4 | Ingestion Server is a small private API for copying data from an upstream source and loading it into the CC Catalog API. This is a two-step process:
5 | 1. The data is copied from the upstream CC Catalog database and into the downstream API database.
6 | 2. Data from the downstream API database gets indexed in Elasticsearch.
7 |
8 | For example, let's say that I want to download and index all new images.
9 | `http POST ingestion.private:8001/task <<<'{"model": "image", "action": "INGEST_UPSTREAM"}'`
10 |
11 | Performance is dependent on the size of the target Elasticsearch cluster, database throughput, and bandwidth available to the ingestion server. The primary bottleneck is indexing to Elasticsearch.
12 |
13 | ## How Indexing Works
14 | 
15 |
16 | ## Safety and security considerations
17 | The server has been designed to fail gracefully in the event of network interruptions, full disks, etc. If a task fails to complete successfully, the whole process is rolled back with no impact on production.
18 |
19 | The server is designed to be run in a private network only. You must not expose the private Ingestion Server API to the public internet.
20 |
21 | ## Running the tests
22 | This runs a simulated environment in Docker containers and ensures that ingestion is working properly.
23 | ```
24 | virtualenv venv
25 | source venv/bin/activate
26 | python test/integration_tests.py
27 | ```
28 | Set `ENABLE_DETAILED_LOGS` to `True` if more information is needed about the failing test.
29 |
30 | ## Configuration
31 | All configuration is performed through environment variables.
32 |
33 | #### Required
34 | * **COPY_TABLES**: A comma-separated list of database tables that should be replicated to Elasticsearch. **Example**: image,text
35 |
36 | * ELASTICSEARCH_URL
37 | * ELASTICSEARCH_PORT
38 | * DATABASE_HOST
39 | * DATABASE_USER
40 | * DATABASE_PASSWORD
41 | * DATABASE_NAME
42 | * DATABASE_PORT
43 |
44 | #### Optional
45 | * **DB_BUFFER_SIZE**: The number of rows to load from the database at once while replicating. **Default**: 100000
46 |
47 | To access a cluster on AWS, define these additional environment variables.
48 | * AWS_ACCESS_KEY_ID
49 | * AWS_SECRET_ACCESS_KEY
50 | * AWS_REGION
51 |
52 | ## Mapping database tables to Elasticsearch
53 | In order to synchronize a given table to Elasticsearch, the following requirements must be met:
54 | * The database table must have an autoincrementing integer primary key named `id`.
55 | * A `SyncableDocType` must be defined in `es_syncer/elasticsearch_models`. The `SyncableDocType` must implement the static method `database_row_to_elasticsearch_doc`.
56 | * The table name must be mapped to the corresponding Elasticsearch `SyncableDocType` in the `database_table_to_elasticsearch_model` map.
57 |
58 | Example from `es_syncer/elasticsearch_models.py`:
59 | ```
60 | class Image(SyncableDocType):
61 | title = Text(analyzer="english")
62 | identifier = Text(index="not_analyzed")
63 | creator = Text()
64 | creator_url = Text(index="not_analyzed")
65 | tags = Text(multi=True)
66 | created_on = Date()
67 | url = Text(index="not_analyzed")
68 | thumbnail = Text(index="not_analyzed")
69 | provider = Text(index="not_analyzed")
70 | source = Text(index="not_analyzed")
71 | license = Text(index="not_analyzed")
72 |     license_version = Text(index="not_analyzed")
73 | foreign_landing_url = Text(index="not_analyzed")
74 | meta_data = Nested()
75 |
76 | class Meta:
77 | index = 'image'
78 |
79 | @staticmethod
80 | def database_row_to_elasticsearch_doc(row, schema):
81 | return Image(
82 | pg_id=row[schema['id']],
83 | title=row[schema['title']],
84 | identifier=row[schema['identifier']],
85 | creator=row[schema['creator']],
86 | creator_url=row[schema['creator_url']],
87 | tags=row[schema['tags_list']],
88 | created_on=row[schema['created_on']],
89 | url=row[schema['url']],
90 | thumbnail=row[schema['thumbnail']],
91 | provider=row[schema['provider']],
92 | source=row[schema['source']],
93 | license=row[schema['license']],
94 | license_version=row[schema['license_version']],
95 | foreign_landing_url=row[schema['foreign_landing_url']],
96 | meta_data=row[schema['meta_data']],
97 | )
98 |
99 |
100 | # Table name -> Elasticsearch model
101 | database_table_to_elasticsearch_model = {
102 | 'image': Image
103 | }
104 | ```
105 |
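Returning to the task API from the introduction: the same `POST /task` call can be issued from Python. A minimal sketch using `requests` (the host name is illustrative and error handling is omitted); the `status_check` URL and `percent_completed` field come from the task endpoints in `ingestion_server/api.py`:

```
import requests

# Schedule an ingestion task (host and port are illustrative).
resp = requests.post(
    'http://ingestion.private:8001/task',
    json={'model': 'image', 'action': 'INGEST_UPSTREAM'},
)
task = resp.json()

# Poll the status endpoint returned by the scheduler.
status = requests.get(task['status_check']).json()
print(status['percent_completed'])
```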
--------------------------------------------------------------------------------
/ingestion_server/config/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | logfile=/var/log/supervisord/supervisord.log
3 | childlogdir=/var/log/supervisord/
4 | logfile_maxbytes=50MB
5 | logfile_backups=5
6 | loglevel=info
7 | pidfile=/tmp/supervisord.pid
8 | nodaemon=true
9 |
10 | [program:ingestion-server]
11 | directory=/ingestion_server
12 | command=/bin/bash -c 'gunicorn api:api -b 0.0.0.0:8001 --reload --chdir ./ingestion_server/ --timeout 120'
13 | user=supervisord
14 | autostart=true
15 | autorestart=true
16 | startretries=9999999999
17 | stdout_logfile=/dev/stdout
18 | stdout_logfile_maxbytes=0
19 | stderr_logfile=/dev/stdout
20 | stderr_logfile_maxbytes=0
21 |
--------------------------------------------------------------------------------
/ingestion_server/howitworks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/howitworks.png
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/ingestion_server/__init__.py
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/api.py:
--------------------------------------------------------------------------------
1 | import falcon
2 | import logging
3 | import sys
4 | import json
5 | import uuid
6 | import time
7 | from urllib.parse import urlparse
8 | from multiprocessing import Value, Process
9 | from ingestion_server.tasks import TaskTracker, Task, TaskTypes
10 | from ingestion_server.state import worker_finished, clear_state
11 | import ingestion_server.indexer as indexer
12 |
13 | """
14 | A small RPC API server for scheduling ingestion of upstream data and
15 | Elasticsearch indexing tasks.
16 | """
17 |
18 |
19 | MODEL = 'model'
20 | ACTION = 'action'
21 | CALLBACK_URL = 'callback_url'
22 | SINCE_DATE = 'since_date'
23 |
24 |
25 | class TaskResource:
26 | def __init__(self, tracker: TaskTracker):
27 | self.tracker = tracker
28 |
29 | @staticmethod
30 | def _get_base_url(req):
31 | parsed = urlparse(req.url)
32 | return parsed.scheme + '://' + parsed.netloc
33 |
34 | @staticmethod
35 | def _validate_create_task(request):
36 | """
37 | Validate an index creation task.
38 | :return: None if valid else a string containing an error message.
39 | """
40 | if request == b'':
41 | return "Expected JSON request body but found nothing."
42 | request = json.loads(request.decode('utf-8'))
43 | if MODEL not in request:
44 | return "No model supplied in request body."
45 | if ACTION not in request:
46 | return "No action supplied in request body."
47 | if request[ACTION] not in [x.name for x in TaskTypes]:
48 | return "Invalid action."
49 |         if 'UPDATE' in request[ACTION] and SINCE_DATE not in request:
50 | return "Received UPDATE request but no since_date."
51 |
52 | return None
53 |
54 | def on_post(self, req, resp):
55 | """ Create a task. """
56 | raw_body = req.stream.read()
57 | request_error = self._validate_create_task(raw_body)
58 | if request_error:
59 | logging.warning(
60 | 'Invalid request made. Reason: {}'.format(request_error)
61 | )
62 | resp.status = falcon.HTTP_400
63 | resp.media = {
64 | 'message': request_error
65 | }
66 | return
67 | body = json.loads(raw_body.decode('utf-8'))
68 | model = body[MODEL]
69 | action = body[ACTION]
70 | callback_url = None
71 | if CALLBACK_URL in body:
72 | callback_url = body[CALLBACK_URL]
73 | since_date = body[SINCE_DATE] if SINCE_DATE in body else None
74 | task_id = str(uuid.uuid4())
75 | # Inject shared memory
76 | progress = Value('d', 0.0)
77 | finish_time = Value('d', 0.0)
78 | task = Task(
79 | model=model,
80 | task_type=TaskTypes[action],
81 | since_date=since_date,
82 | progress=progress,
83 | task_id=task_id,
84 | finish_time=finish_time,
85 | callback_url=callback_url
86 | )
87 | task.start()
88 | task_id = self.tracker \
89 | .add_task(task, task_id, action, progress, finish_time)
90 | base_url = self._get_base_url(req)
91 | status_url = base_url + '/task/{}'.format(task_id)
92 | # Give the task a moment to start so we can detect immediate failure.
93 | # TODO: Use IPC to detect if the job launched successfully instead
94 | # of giving it 100ms to crash. This is prone to race conditions.
95 | time.sleep(0.1)
96 | if task.is_alive():
97 | resp.status = falcon.HTTP_202
98 | resp.media = {
99 | 'message': 'Successfully scheduled task',
100 | 'task_id': task_id,
101 | 'status_check': status_url
102 | }
103 | return
104 | else:
105 | resp.status = falcon.HTTP_500
106 | resp.media = {
107 | 'message': 'Failed to schedule task due to an internal server '
108 | 'error. Check scheduler logs.'
109 | }
110 | return
111 |
112 | def on_get(self, req, resp):
113 | """ List all indexing tasks. """
114 | resp.media = self.tracker.list_task_statuses()
115 |
116 |
117 | class TaskStatus:
118 | def __init__(self, tracker: TaskTracker):
119 | self.tracker = tracker
120 |
121 | def on_get(self, req, resp, task_id):
122 | """ Check the status of a single task."""
123 | task = self.tracker.id_task[task_id]
124 | active = task.is_alive()
125 |
126 | percent_completed = self.tracker.id_progress[task_id].value
127 | resp.media = {
128 | 'active': active,
129 | 'percent_completed': percent_completed,
130 | 'error': percent_completed < 100 and not active
131 | }
132 |
133 |
134 | class WorkerFinishedResource:
135 | """
136 | For notifying ingestion server that an indexing worker has finished its
137 | task.
138 | """
139 | def on_post(self, req, resp):
140 | target_index = worker_finished(str(req.remote_addr))
141 | if target_index:
142 | logging.info(
143 | 'All indexer workers finished! Attempting to promote index '
144 | f'{target_index}'
145 | )
146 | f = indexer.TableIndexer.go_live
147 | p = Process(target=f, args=(target_index, 'image'))
148 | p.start()
149 |
150 |
151 | class StateResource:
152 | def on_delete(self, req, resp):
153 | """
154 | Forget about the last scheduled indexing job.
155 | """
156 | clear_state()
157 |
158 |
159 | def create_api(log=True):
160 | """ Create an instance of the Falcon API server. """
161 | if log:
162 | root = logging.getLogger()
163 | root.setLevel(logging.DEBUG)
164 | handler = logging.StreamHandler(sys.stdout)
165 | handler.setLevel(logging.INFO)
166 | formatter = logging.Formatter(
167 | '%(asctime)s %(levelname)s %(filename)s:%(lineno)d - %(message)s'
168 | )
169 | handler.setFormatter(formatter)
170 | root.addHandler(handler)
171 |
172 | _api = falcon.API()
173 | task_tracker = TaskTracker()
174 | task_resource = TaskResource(task_tracker)
175 | get_task_status = TaskStatus(task_tracker)
176 | _api.add_route('/task', task_resource)
177 | _api.add_route('/task/{task_id}', get_task_status)
178 | _api.add_route('/worker_finished', WorkerFinishedResource())
179 | _api.add_route('/state', StateResource())
180 |
181 | return _api
182 |
183 |
184 | api = create_api()
185 |
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/authority.py:
--------------------------------------------------------------------------------
1 | from enum import Enum, auto
2 | """
3 | Authority is a ranking from 0 to 100 (with 0 being least authoritative)
4 | indicating the pedigree of an image. Some examples of things that could impact
5 | authority:
6 | - The reputation of the website that posted an image
7 | - The popularity of the uploader on a social media site in terms of number of
8 | followers
9 | - Whether the uploader has uploaded images that have previously been flagged for
10 | copyright infringement.
11 | - etc.
12 |
13 | The authority can be set from the catalog layer through the meta_data field
14 | or through the ingestion layer. As of now, we are only factoring in the
15 | reputation of the website as a static hand-picked list based on experience
16 | and search result quality, with the intention to add more sophisticated and
17 | tailored measures of authority later on.
18 |
19 | Also note that this is just one factor in rankings, and the magnitude of the
20 | boost can be adjusted at search-time.
21 | """
22 |
23 |
24 | class AuthorityTypes(Enum):
25 | CURATED = auto()
26 | SOCIAL_MEDIA = auto()
27 | DEFAULT = auto()
28 |
29 |
30 | # We want to boost curated collections where each image has been vetted for
31 | # cultural significance.
32 | boost = {
33 | AuthorityTypes.CURATED: 90,
34 | AuthorityTypes.SOCIAL_MEDIA: 80,
35 | AuthorityTypes.DEFAULT: 85
36 | }
37 | authority_types = {
38 | 'flickr': AuthorityTypes.SOCIAL_MEDIA,
39 | 'behance': AuthorityTypes.SOCIAL_MEDIA,
40 | 'thingiverse': AuthorityTypes.SOCIAL_MEDIA,
41 | 'sketchfab': AuthorityTypes.SOCIAL_MEDIA,
42 | 'deviantart': AuthorityTypes.SOCIAL_MEDIA,
43 | 'thorvaldsensmuseum': AuthorityTypes.CURATED,
44 | 'svgsilh': AuthorityTypes.CURATED,
45 | 'smithsonian': AuthorityTypes.CURATED,
46 | 'rijksmuseum': AuthorityTypes.CURATED,
47 | 'museumsvictoria': AuthorityTypes.CURATED,
48 | 'met': AuthorityTypes.CURATED,
49 | 'mccordsmuseum': AuthorityTypes.CURATED,
50 | 'digitaltmuseum': AuthorityTypes.CURATED,
51 | 'clevelandmuseum': AuthorityTypes.CURATED,
52 | 'brooklynmuseum': AuthorityTypes.CURATED
53 | }
54 |
55 |
56 | def get_authority_boost(source):
57 | authority_boost = None
58 | if source in authority_types:
59 | authority_type = authority_types[source]
60 | if authority_type in boost:
61 | authority_boost = boost[authority_type]
62 | else:
63 | authority_boost = boost[AuthorityTypes.DEFAULT]
64 | return authority_boost
65 |
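One subtlety in get_authority_boost: a source missing from authority_types returns None rather than the DEFAULT boost, so DEFAULT only applies to a source explicitly mapped to an AuthorityTypes member that is absent from the boost dict (currently none are). A few illustrative calls, with the expected values read off the tables above ('example' is a made-up unlisted source):

    from ingestion_server.authority import get_authority_boost

    assert get_authority_boost('met') == 90        # CURATED
    assert get_authority_boost('flickr') == 80     # SOCIAL_MEDIA
    assert get_authority_boost('example') is None  # unlisted source: no boost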
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/categorize.py:
--------------------------------------------------------------------------------
1 | from enum import Enum, auto
2 |
3 | """
4 | https://github.com/creativecommons/cccatalog-api/issues/340
5 |
6 | Attempt to figure out the image type (illustration, vector, photograph, or
7 | digitized artwork) based on its source and file extension.
8 | """
9 |
10 |
11 | class Category(Enum):
12 | PHOTOGRAPH = auto()
13 | DIGITIZED_ARTWORK = auto()
14 | ILLUSTRATION = auto()
15 |
16 |
17 | # Map each provider to a list of categories.
18 | source_category = {
19 | '__default': [],
20 | 'thorvaldsenmuseum': [Category.DIGITIZED_ARTWORK],
21 | 'svgsilh': [Category.ILLUSTRATION],
22 | 'phylopic': [Category.ILLUSTRATION],
23 | 'floraon': [Category.PHOTOGRAPH],
24 | 'animaldiversity': [Category.PHOTOGRAPH],
25 | 'WoRMS': [Category.PHOTOGRAPH],
26 | 'clevelandmuseum': [Category.DIGITIZED_ARTWORK],
27 | 'CAPL': [Category.PHOTOGRAPH],
28 | 'sciencemuseum': [Category.PHOTOGRAPH],
29 | 'rijksmuseum': [Category.DIGITIZED_ARTWORK],
30 | 'museumsvictoria': [Category.DIGITIZED_ARTWORK],
31 | 'met': [Category.DIGITIZED_ARTWORK],
32 | 'mccordmuseum': [Category.DIGITIZED_ARTWORK],
33 | 'digitaltmuseum': [Category.DIGITIZED_ARTWORK],
34 | 'deviantart': [Category.DIGITIZED_ARTWORK],
35 | 'brooklynmuseum': [Category.DIGITIZED_ARTWORK]
36 | }
37 |
38 |
39 | def get_categories(extension, source):
40 | if extension and extension.lower() == 'svg':
41 | categories = [Category.ILLUSTRATION]
42 | elif source in source_category:
43 | categories = source_category[source]
44 | else:
45 | categories = source_category['__default']
46 | return [x.name for x in categories]
47 |
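Because the SVG check runs before the source lookup, an SVG from a curated source is still categorized as an illustration, and unknown sources fall through to the empty '__default' list. Expected behavior, read off the mapping above ('example' is a made-up unlisted source):

    from ingestion_server.categorize import get_categories

    assert get_categories('svg', 'met') == ['ILLUSTRATION']      # extension wins
    assert get_categories('jpg', 'met') == ['DIGITIZED_ARTWORK']
    assert get_categories('jpg', 'floraon') == ['PHOTOGRAPH']
    assert get_categories('jpg', 'example') == []                # __default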
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/distributed_reindex_scheduler.py:
--------------------------------------------------------------------------------
1 | """
2 | Allocate hardware for performing a distributed index by spawning several
3 | indexer_worker instances on multiple machines. Then, partition the work across
4 | each worker, notifying each worker which partition to reindex through an HTTP
5 | request.
6 |
7 | Once the reindexing job is finished, each worker notifies Ingestion Server
8 | and shuts down its own instance.
9 | """
10 | import math
11 | import requests
12 | import logging as log
13 | import os
14 | import time
15 | import boto3
16 | import socket
17 | from ingestion_server.state import register_indexing_job
18 |
19 |
20 | client = boto3.client('ec2', region_name=os.getenv('AWS_REGION', 'us-east-1'))
21 |
22 |
23 | def schedule_distributed_index(db_conn, target_index):
24 | workers = _prepare_workers()
25 | registered = register_indexing_job(workers, target_index)
26 | if registered:
27 | _assign_work(db_conn, workers, target_index)
28 |
29 |
30 | def _assign_work(db_conn, workers, target_index):
31 | est_records_query = 'SELECT id FROM image ORDER BY id DESC LIMIT 1'
32 | with db_conn.cursor() as cur:
33 | cur.execute(est_records_query)
34 | estimated_records = cur.fetchone()[0]
35 | records_per_worker = math.floor(estimated_records / len(workers))
36 |
37 | worker_url_template = 'http://{}:8002'
38 | # Wait for the workers to start.
39 | for worker in workers:
40 | worker_url = worker_url_template.format(worker)
41 | succeeded = _wait_for_healthcheck(f'{worker_url}/healthcheck')
42 | if not succeeded:
43 | return False
44 | for idx, worker in enumerate(workers):
45 | worker_url = worker_url_template.format(worker)
46 | params = {
47 | 'start_id': idx * records_per_worker,
48 | 'end_id': (1 + idx) * records_per_worker,
49 | 'target_index': target_index
50 | }
51 | log.info(f'Assigning job {params} to {worker_url}')
52 | requests.post(worker_url + '/indexing_task', json=params)
53 |
54 |
55 | def _prepare_workers():
56 | """
57 | Get a list of internal URLs bound to each indexing worker. If the worker is
58 | stopped, start the worker.
59 |
60 | :return: A list of private URLs pointing to each available indexing worker
61 | """
62 | environment = os.getenv('ENVIRONMENT', 'local')
63 | if environment == 'local':
64 | return [socket.gethostbyname('indexer-worker')]
65 | instance_filters = [
66 | {
67 | 'Name': 'tag:Name',
68 | 'Values': ['indexer-worker-' + environment + '*']
69 | },
70 | {
71 | 'Name': 'instance-state-name',
72 | 'Values': ['stopped', 'running']
73 | }
74 | ]
75 | response = client.describe_instances(Filters=instance_filters)
76 | servers = []
77 | ids = []
78 | for reservation in response['Reservations']:
79 | instance = reservation['Instances'][0]
80 | server = instance['PrivateIpAddress']
81 | _id = instance['InstanceId']
82 | servers.append(server)
83 | ids.append(_id)
84 | log.info('Selected worker instances {}'.format(servers))
85 | client.start_instances(InstanceIds=ids)
86 | return servers
87 |
88 |
89 | def _wait_for_healthcheck(endpoint, attempts=60, wait=5):
90 | """
91 | Wait for the instance at `endpoint` to become healthy before assigning work.
92 |
93 | :param endpoint: The URL to test
94 | :param attempts: Number of attempts at reaching healthcheck
95 | :param wait: Amount of time to wait between each attempt
96 | :return: True if the healthcheck succeeded
97 | """
98 | num_attempts = 0
99 | healthcheck_passed = False
100 | while not healthcheck_passed and num_attempts < attempts:
101 | try:
102 | log.info(f'Checking {endpoint}...')
103 | response = requests.get(endpoint, timeout=3)
104 | if response.status_code == 200:
105 | healthcheck_passed = True
106 | break
107 | except requests.exceptions.RequestException:
108 | pass
109 | time.sleep(wait)
110 | num_attempts += 1
111 | if num_attempts >= attempts or not healthcheck_passed:
112 | log.error(f'Timed out waiting for {endpoint}.')
113 | return False
114 | else:
115 | log.info(f'{endpoint} passed healthcheck')
116 | return True
117 |
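_assign_work estimates the record count from the highest image ID and splits the ID space evenly. Because the worker-side query uses BETWEEN, which is inclusive on both ends, adjacent partitions share their boundary ID, and any IDs above len(workers) * records_per_worker are not covered by any partition. A worked example of the arithmetic, assuming a max ID of 1,000,000 and four hypothetical workers:

    import math

    estimated_records = 1_000_000  # assumed highest image ID
    workers = ['10.0.0.1', '10.0.0.2', '10.0.0.3', '10.0.0.4']
    records_per_worker = math.floor(estimated_records / len(workers))  # 250000

    for idx, worker in enumerate(workers):
        start_id = idx * records_per_worker
        end_id = (1 + idx) * records_per_worker
        print('{}: ids {}..{}'.format(worker, start_id, end_id))
    # 10.0.0.1: ids 0..250000
    # 10.0.0.2: ids 250000..500000, and so on. Note the boundary IDs
    # (250000, 500000, 750000) each land in two partitions.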
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/indexer_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | A single worker responsible for indexing a subset of the records stored in the
3 | database.
4 |
5 | Accept an HTTP request specifying a range of image IDs to reindex. After the
6 | data has been indexed, notify Ingestion Server and stop the instance.
7 | """
8 | import falcon
9 | import sys
10 | import logging as log
11 | import os
12 | import boto3
13 | import requests
14 | from multiprocessing import Value, Process
15 | from psycopg2.sql import SQL
16 | from ingestion_server.indexer import elasticsearch_connect, TableIndexer
17 |
18 |
19 | ec2_client = boto3.client(
20 | 'ec2',
21 | region_name=os.getenv('AWS_REGION', 'us-east-1'),
22 | aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID', None),
23 | aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY', None)
24 | )
25 |
26 |
27 | class IndexingJobResource:
28 | def on_post(self, req, resp):
29 | j = req.media
30 | start_id = j['start_id']
31 | end_id = j['end_id']
32 | target_index = j['target_index']
33 | notify_url = f'http://{req.remote_addr}:8001/worker_finished'
34 | log.info(f'Received indexing request for records {start_id}-{end_id}')
35 | _execute_indexing_task(target_index, start_id, end_id, notify_url)
36 | resp.status = falcon.HTTP_201
37 |
38 |
39 | class HealthcheckResource:
40 | def on_get(self, req, resp):
41 | resp.status = falcon.HTTP_200
42 |
43 |
44 | def _execute_indexing_task(target_index, start_id, end_id, notify_url):
45 | table = 'image'
46 | elasticsearch = elasticsearch_connect()
47 | progress = Value('d', 0.0)
48 | finish_time = Value('d', 0.0)
49 | exists_in_table = \
50 | 'exists(SELECT 1 FROM {table} ' \
51 | 'WHERE identifier = image.identifier) as "{name}"'
52 | exists_in_deleted_table = exists_in_table.format(
53 | table='api_deletedimage', name='deleted'
54 | )
55 | exists_in_mature_table = exists_in_table.format(
56 | table='api_matureimage', name='mature'
57 | )
58 |
59 | query = SQL(f'''
60 | SELECT *,
61 | {exists_in_deleted_table}, {exists_in_mature_table}
62 | FROM image
63 | WHERE id BETWEEN {start_id} AND {end_id}
64 | ''')
65 | log.info('Querying {}'.format(query))
66 | indexer = TableIndexer(
67 | elasticsearch, table, progress, finish_time
68 | )
69 | p = Process(
70 | target=_launch_reindex,
71 | args=(table, target_index, query, indexer, notify_url)
72 | )
73 | p.start()
74 | log.info('Started indexing task')
75 |
76 |
77 | def _launch_reindex(table, target_index, query, indexer, notify_url):
78 | try:
79 | indexer.replicate(table, target_index, query)
80 | except Exception:
81 | log.error("Indexing error occurred: ", exc_info=True)
82 |
83 | log.info(f'Notifying {notify_url}')
84 | requests.post(notify_url)
85 | _self_destruct()
86 | return
87 |
88 |
89 | def _self_destruct():
90 | """
91 | Stop this EC2 instance once the task is finished.
92 | """
93 | if os.getenv('ENVIRONMENT', 'local') == 'local':
94 | log.info(
95 | 'Skipping self destruction because worker is in local environment'
96 | )
97 | return
98 | # Get the instance ID from the EC2 metadata service.
99 | endpoint = 'http://169.254.169.254/latest/meta-data/instance-id'
100 | response = requests.get(endpoint)
101 | instance_id = response.content.decode('utf8')
102 | log.info('Shutting self down')
103 | ec2_client.stop_instances(InstanceIds=[instance_id])
104 |
105 |
106 | root = log.getLogger()
107 | root.setLevel(log.DEBUG)
108 | handler = log.StreamHandler(sys.stdout)
109 | handler.setLevel(log.INFO)
110 | formatter = log.Formatter(
111 | '%(asctime)s %(levelname)s %(filename)s:%(lineno)d - %(message)s'
112 | )
113 | handler.setFormatter(formatter)
114 | root.addHandler(handler)
115 | api = falcon.API()
116 | api.add_route('/indexing_task', IndexingJobResource())
117 | api.add_route('/healthcheck', HealthcheckResource())
118 |
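The worker's HTTP contract mirrors the scheduler's _assign_work call: a POST to /indexing_task carrying the three keys read by IndexingJobResource.on_post. A sketch of a manual request (the worker address and index name here are hypothetical; in production the address is a private EC2 IP):

    import requests

    worker_url = 'http://indexer-worker:8002'
    params = {
        'start_id': 0,
        'end_id': 250000,
        'target_index': 'image-new',  # hypothetical index name
    }
    resp = requests.post(worker_url + '/indexing_task', json=params)
    assert resp.status_code == 201  # falcon.HTTP_201: task launched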
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/qa.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | import random
3 | from enum import Enum
4 | from ingestion_server.elasticsearch_models import Image
5 |
6 |
7 | class QAScores(Enum):
8 | TARGET = 1
9 | LESS_RELEVANT = 2
10 | NOT_RELEVANT = 3
11 |
12 |
13 | def create_search_qa_index():
14 | test_idx = 'search-qa'
15 | _phrase_relevance(test_idx)
16 |
17 |
18 | def test_image(title, tags, creator, relevance):
19 | _id = random.randint(0, 1000000000)
20 | sample_url = 'https://example.com/'
21 | img = Image(
22 | _id=_id,
23 | id=_id,
24 | title=title,
25 | identifier=relevance,
26 | creator=creator,
27 | creator_url=sample_url,
28 | tags=tags,
29 | created_on=None,
30 | url=sample_url,
31 | thumbnail='',
32 | provider='test',
33 | source=sample_url,
34 | license='by',
35 | license_version='3.0',
36 | foreign_landing_url=sample_url,
37 | metadata=None,
38 | view_count=0
39 | )
40 | return img
41 |
42 |
43 | def _phrase_relevance(index):
44 | less_relevant1 = test_image(
45 | 'A picture of my office',
46 | [{'name': 'office'}],
47 | 'Alice Foo',
48 | QAScores.LESS_RELEVANT.value
49 | )
50 | less_relevant1.save(index=index)
51 |
52 | less_relevant2 = test_image(
53 | 'My office in my home',
54 | [{'name': 'office'}, {'name': 'home'}],
55 | 'Gordon',
56 | QAScores.LESS_RELEVANT.value
57 | )
58 | less_relevant2.save(index=index)
59 |
60 | not_relevant = test_image(
61 | 'Mastiff', [{'name': 'dog'}], 'Liam', QAScores.NOT_RELEVANT.value
62 | )
63 | not_relevant.save(index=index)
64 |
65 | # This should be the top result.
66 | target_tags = [
67 | {'name': 'home office'},
68 | {'name': 'noise'},
69 | {'name': 'clutter'}
70 | ]
71 | target = test_image(
72 | 'My home office', target_tags, 'John Fooson', QAScores.TARGET.value
73 | )
74 | target.save(index=index)
75 |
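The fixture only loads documents; checking the ranking then means searching the 'search-qa' index and comparing the relevance label that test_image stashed in the identifier field. A hypothetical verification using the elasticsearch_dsl search API that Image inherits; this sketch assumes a default elasticsearch_dsl connection is configured and that the index has been refreshed since the saves:

    from ingestion_server.elasticsearch_models import Image
    from ingestion_server.qa import QAScores, create_search_qa_index

    create_search_qa_index()
    search = Image.search(index='search-qa').query('match', title='home office')
    top_hit = search.execute()[0]
    # test_image stored the QAScores value in `identifier`.
    assert int(top_hit.identifier) == QAScores.TARGET.value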
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/state.py:
--------------------------------------------------------------------------------
1 | import shelve
2 | import datetime
3 | import enum
4 | import logging as log
5 | from filelock import FileLock
6 | """
7 | Indexing is distributed across multiple independent hosts. We don't want to
8 | "go live" in production with the newly indexed data until all of the indexing
9 | workers have finished their tasks. To that end, we need to track the state of
10 | each worker, and be notified when the job has finished.
11 |
12 | State is persisted to the disk using shelve. Concurrent writes aren't allowed,
13 | so all operations need to acquire a lock.
14 | """
15 |
16 |
17 | class WorkerStatus(enum.Enum):
18 | RUNNING = 0
19 | FINISHED = 1
20 |
21 |
22 | def register_indexing_job(worker_ips, target_index):
23 | """
24 | Track the hosts that are running indexing jobs. Only one indexing job can
25 | run at a time.
26 |
27 | :param worker_ips: A list of private IP addresses corresponding to the pool
28 | of relevant indexer-worker instances.
29 | :param target_index: The name of the Elasticsearch index that will be
30 | promoted to production after indexing is complete
31 | :return: Return True if scheduling succeeds
32 | """
33 | with FileLock('lock'), shelve.open('db', writeback=True) as db:
34 | # Wipe last job out if it has finished.
35 | indexing_in_progress = False
36 | if 'worker_statuses' in db:
37 | for worker in db['worker_statuses']:
38 | if db['worker_statuses'][worker] == WorkerStatus.RUNNING:
39 | indexing_in_progress = True
40 | if indexing_in_progress:
41 | log.error(
42 | 'Failed to schedule indexing job; another one is running.'
43 | )
44 | return False
45 |
46 | # Register the workers.
47 | worker_statuses = {}
48 | for worker_url in worker_ips:
49 | worker_statuses[worker_url] = WorkerStatus.RUNNING
50 | db['worker_statuses'] = worker_statuses
51 | db['start_time'] = datetime.datetime.now()
52 | db['target_index'] = target_index
53 | return True
54 |
55 |
56 | def worker_finished(worker_ip):
57 | """
58 | The scheduler received a notification indicating an indexing worker has
59 | finished its task.
60 | :param worker_ip: The private IP of the worker.
61 | :return: The target index if all workers are finished, else False.
62 | """
63 | with FileLock('lock'), shelve.open('db', writeback=True) as db:
64 | try:
65 | _ = db['worker_statuses'][worker_ip]
66 | db['worker_statuses'][worker_ip] = WorkerStatus.FINISHED
67 | log.info(f'Received worker_finished signal from {worker_ip}')
68 | except KeyError:
69 | log.error(
70 | 'An indexer worker notified us it finished its task, but '
71 | 'we are not tracking it.'
72 | )
73 | for worker_key in db['worker_statuses']:
74 | if db['worker_statuses'][worker_key] == WorkerStatus.RUNNING:
75 | log.info(f'{worker_key} is still indexing')
76 | return False
77 | return db['target_index']
78 |
79 |
80 | def clear_state():
81 | """
82 | Forget about all running index jobs. Use with care.
83 | """
84 | with FileLock('lock'), shelve.open('db', writeback=True) as db:
85 | for key in db:
86 | log.info('Deleting ' + str(db[key]))
87 | del db[key]
88 | log.info('Cleared indexing state.')
89 |
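The lifecycle is: register_indexing_job marks every worker RUNNING; each worker_finished call flips one worker to FINISHED and returns False until the last one, which returns the target index so the caller can promote it. A sketch of that sequence (worker IPs and index name invented; note that these functions create 'lock' and 'db' files in the working directory):

    from ingestion_server.state import (
        register_indexing_job, worker_finished, clear_state
    )

    workers = ['10.0.0.1', '10.0.0.2']
    assert register_indexing_job(workers, 'image-new')
    # Scheduling a second job while one is running is refused.
    assert not register_indexing_job(workers, 'image-other')

    assert worker_finished('10.0.0.1') is False        # one worker still busy
    assert worker_finished('10.0.0.2') == 'image-new'  # all done: promote

    clear_state()  # forget the job entirely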
--------------------------------------------------------------------------------
/ingestion_server/ingestion_server/tasks.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import datetime as dt
3 | import requests
4 | from enum import Enum
5 | from multiprocessing import Process
6 | from ingestion_server.indexer import elasticsearch_connect, TableIndexer
7 | from ingestion_server.ingest import reload_upstream
8 |
9 | """ Simple in-memory tracking of executed tasks. """
10 |
11 |
12 | class TaskTypes(Enum):
13 | # Completely reindex all data for a given model.
14 | REINDEX = 0
15 | # Reindex updates to a model from the database since a certain date.
16 | UPDATE_INDEX = 1
17 | # Download the latest copy of the data from the upstream database, then
18 | # completely reindex the newly imported data.
19 | INGEST_UPSTREAM = 2
20 | # Create indices in Elasticsearch for QA tests.
21 | # This is not intended for production use, but can be safely executed in a
22 | # production environment without consequence.
23 | LOAD_TEST_DATA = 3
24 |
25 |
26 | class TaskTracker:
27 | def __init__(self):
28 | self.id_task = {}
29 | self.id_action = {}
30 | self.id_progress = {}
31 | self.id_start_time = {}
32 | self.id_finish_time = {}
33 |
34 | def add_task(self, task, task_id, action, progress, finish_time):
35 | self._prune_old_tasks()
36 | self.id_task[task_id] = task
37 | self.id_action[task_id] = action
38 | self.id_progress[task_id] = progress
39 | self.id_start_time[task_id] = dt.datetime.utcnow().timestamp()
40 | self.id_finish_time[task_id] = finish_time
41 | return task_id
42 |
43 | def _prune_old_tasks(self):
44 | pass
45 |
46 | def list_task_statuses(self):
47 | self._prune_old_tasks()
48 | results = []
49 | for _id, task in self.id_task.items():
50 | percent_completed = self.id_progress[_id].value
51 | active = task.is_alive()
52 | start_time = self.id_start_time[_id]
53 | finish_time = self.id_finish_time[_id].value
54 | results.append({
55 | 'task_id': _id,
56 | 'active': active,
57 | 'action': self.id_action[_id],
58 | 'progress': percent_completed,
59 | 'error': percent_completed < 100 and not active,
60 | 'start_time': start_time,
61 | 'finish_time': finish_time
62 | })
63 | sorted_results = sorted(
64 | results,
65 | key=lambda x: x['finish_time']
66 | )
67 |
68 | to_utc = dt.datetime.utcfromtimestamp
69 |
70 | def render_date(x):
71 | return to_utc(x) if x != 0.0 else None
72 |
73 | # Convert date to a readable format
74 | for idx, task in enumerate(sorted_results):
75 | start_time = task['start_time']
76 | finish_time = task['finish_time']
77 | sorted_results[idx]['start_time'] = str(render_date(start_time))
78 | sorted_results[idx]['finish_time'] = str(render_date(finish_time))
79 |
80 | return sorted_results
81 |
82 |
83 | class Task(Process):
84 | def __init__(self, model, task_type, since_date, progress, task_id,
85 | finish_time, callback_url):
86 | Process.__init__(self)
87 | self.model = model
88 | self.task_type = task_type
89 | self.since_date = since_date
90 | self.progress = progress
91 | self.task_id = task_id
92 | self.finish_time = finish_time
93 | self.callback_url = callback_url
94 |
95 | def run(self):
96 | # Map task types to actions.
97 | elasticsearch = elasticsearch_connect()
98 | indexer = TableIndexer(
99 | elasticsearch, self.model, self.progress, self.finish_time
100 | )
101 | if self.task_type == TaskTypes.REINDEX:
102 | indexer.reindex(self.model)
103 | elif self.task_type == TaskTypes.UPDATE_INDEX:
104 | indexer.update(self.model, self.since_date)
105 | elif self.task_type == TaskTypes.INGEST_UPSTREAM:
106 | reload_upstream(self.model)
107 | indexer.reindex(self.model)
108 | elif self.task_type == TaskTypes.LOAD_TEST_DATA:
109 | indexer.load_test_data()
110 | logging.info('Task {} exited.'.format(self.task_id))
111 | if self.callback_url:
112 | try:
113 | requests.post(self.callback_url)
114 | except requests.exceptions.RequestException as e:
115 | logging.error('Failed to send callback!')
116 | logging.error(e)
117 |
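TaskTracker is a set of plain dicts keyed by task ID; progress and finish_time cross the process boundary as multiprocessing.Value doubles, which is why server.py passes them both to the Task and to add_task. A sketch of that wiring (the task ID scheme here is arbitrary, and actually running the task requires a reachable Elasticsearch):

    import uuid
    from multiprocessing import Value
    from ingestion_server.tasks import Task, TaskTracker, TaskTypes

    tracker = TaskTracker()
    progress = Value('d', 0.0)     # percent complete, written by the indexer
    finish_time = Value('d', 0.0)  # epoch timestamp, 0.0 while running

    task_id = str(uuid.uuid4())
    task = Task(
        model='image', task_type=TaskTypes.REINDEX, since_date=None,
        progress=progress, task_id=task_id, finish_time=finish_time,
        callback_url=None
    )
    task.start()  # forks the worker process
    tracker.add_task(task, task_id, action='REINDEX',
                     progress=progress, finish_time=finish_time)
    print(tracker.list_task_statuses())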
--------------------------------------------------------------------------------
/ingestion_server/publish_release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage: ./publish_release.sh [VERSION]
3 | docker build -t creativecommons/ingestion_server:$1 .
4 | docker build -f Dockerfile-worker -t creativecommons/indexer_worker:$1 .
5 | docker push creativecommons/ingestion_server:$1
6 | docker push creativecommons/indexer_worker:$1
7 |
--------------------------------------------------------------------------------
/ingestion_server/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/ingestion_server/test/__init__.py
--------------------------------------------------------------------------------
/ingestion_server/test/generate_integration_test_docker_compose.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import yaml
3 | import datetime
4 | import os
5 | import sys
6 | import traceback
7 | import textwrap
8 |
9 | """
10 | Parses docker-compose file and generates an integration-test-docker-compose.yml.
11 | The generated file is written to the same directory this script resides in.
12 |
13 | Q: Why didn't you just use multiple docker-compose files and inheritance?
14 |
15 | A: If you are running the development docker-compose file already, launching
16 | an inherited elasticsearch/postgres service will result in the containers
17 | being destroyed and recreated. Using this approach ensures that:
18 | 1) Running tests doesn't interfere with your development environment.
19 | 2) The file stays up-to-date without manual copy-pasting.
20 | 3) We don't blow up running containers on Travis CI.
21 | """
22 |
23 | this_dir = os.path.dirname(os.path.realpath(__file__))
24 | outname = this_dir + '/integration-test-docker-compose.yml'
25 | parent_docker_compose = this_dir + '/../../docker-compose.yml'
26 |
27 | with open(parent_docker_compose, 'r') as docker_compose_file:
28 | docker_compose = yaml.safe_load(docker_compose_file)
29 | try:
30 | db = docker_compose['services']['db']
31 | es = docker_compose['services']['es']
32 | ingestion_server = docker_compose['services']['ingestion-server']
33 | upstream_db = docker_compose['services']['upstream_db']
34 | # Delete services we're not testing.
35 | desired_services = {'es', 'db', 'ingestion-server', 'upstream_db'}
36 | for service in dict(docker_compose['services']):
37 | if service not in desired_services:
38 | del docker_compose['services'][service]
39 | del docker_compose['services']['es']['healthcheck']
40 |
41 | # Expose alternate ports. Use the same internal port defined in the
42 | # original docker-compose file.
43 | upstream_db_port = upstream_db['ports'][0].split(':')[1]
44 | upstream_db['ports'][0] = '59999' + ':' + upstream_db_port
45 | db['ports'][0] = '60000' + ':' + db['ports'][0].split(':')[1]
46 | es['ports'][0] = '60001' + ':' + es['ports'][0].split(':')[1]
47 | ingestion_api_port = ingestion_server['ports'][0].split(':')[1]
48 | ingestion_server['ports'][0] = '60002' + ':' + ingestion_api_port
49 |
50 | # Configure ingestion server to point to integration containers.
51 | upstream_name = 'integration-upstream'
52 | ingestion_server['environment']['DATABASE_HOST'] = 'integration-db'
53 | ingestion_server['environment']['ELASTICSEARCH_URL'] = 'integration-es'
54 | ingestion_server['environment']['UPSTREAM_DB_HOST'] = upstream_name
55 | ingestion_server['depends_on'] = ['integration-es', 'integration-db']
56 | ingestion_server['build'] = '../'
57 |
58 | # Create a volume for the mock data
59 | db['volumes'] = ['./mock_data:/mock_data']
60 | upstream_db['volumes'] = ['./mock_data:/mock_data']
61 |
62 | # Rename the services and update ports.
63 | for service in dict(docker_compose['services']):
64 | if service in desired_services:
65 | del docker_compose['services'][service]
66 | docker_compose['services']['integration-db'] = db
67 | docker_compose['services']['integration-es'] = es
68 | docker_compose['services']['integration-ingestion'] = ingestion_server
69 | docker_compose['services']['integration-upstream'] = upstream_db
70 |
71 |
72 | # Start the document with a warning message
73 | warning_message = '\n'.join(textwrap.wrap(
74 | 'This docker-compose file was generated from '
75 | + parent_docker_compose + '. Do not modify this file directly. '
76 | 'Your changes will be overwritten. Last update: '
77 | + str(datetime.datetime.now()), width=79,
78 | initial_indent='# ', subsequent_indent='# ')) + '\n\n'
79 |
80 | with open(outname, 'w') as integration_docker_compose:
81 | integration_docker_compose.truncate()
82 | integration_docker_compose.write(warning_message)
83 | yaml.dump(docker_compose, integration_docker_compose,
84 | default_flow_style=False)
85 |
86 | except KeyError as e:
87 | print(traceback.format_exc())
88 | print('Failed to parse docker-compose.yml due to missing key. No file'
89 | ' was written to disk. Missing key: ' + str(e))
90 | sys.exit(1)
91 | except Exception as e:
92 | print(traceback.format_exc())
93 | print('Failed to generate', outname, 'due to exception:', e)
94 | sys.exit(1)
95 |
--------------------------------------------------------------------------------
/ingestion_server/test/integration-test-docker-compose.yml:
--------------------------------------------------------------------------------
1 | # This docker-compose file was generated from /home/alden/code/cccatalog-
2 | # api/ingestion_server/test/../../docker-compose.yml. Do not modify this file
3 | # directly. Your changes will be overwritten. Last update: 2019-01-09
4 | # 11:36:00.858884
5 |
6 | services:
7 | integration-db:
8 | environment:
9 | POSTGRES_DB: openledger
10 | POSTGRES_HOST: 0.0.0.0
11 | POSTGRES_PASSWORD: deploy
12 | POSTGRES_USER: deploy
13 | healthcheck:
14 | test: pg_isready -U deploy -d openledger
15 | image: postgres:10.3-alpine
16 | ports:
17 | - 60000:5432
18 | volumes:
19 | - ./mock_data:/mock_data
20 | integration-es:
21 | environment:
22 | - xpack.security.enabled=false
23 | image: docker.elastic.co/elasticsearch/elasticsearch:6.2.4
24 | ports:
25 | - 60001:9200
26 | ulimits:
27 | nofile:
28 | hard: 65536
29 | soft: 65536
30 | integration-ingestion:
31 | build: ../
32 | command: bash -c 'sleep 20 && supervisord -c config/supervisord.conf'
33 | depends_on:
34 | - integration-es
35 | - integration-db
36 | environment:
37 | COPY_TABLES: image
38 | DATABASE_HOST: integration-db
39 | DATABASE_NAME: openledger
40 | DATABASE_PASSWORD: deploy
41 | DATABASE_PORT: '5432'
42 | DATABASE_USER: deploy
43 | DB_BUFFER_SIZE: '100000'
44 | ELASTICSEARCH_PORT: '9200'
45 | ELASTICSEARCH_URL: integration-es
46 | PYTHONUNBUFFERED: '0'
47 | SYNCER_POLL_INTERVAL: '60'
48 | UPSTREAM_DB_HOST: integration-upstream
49 | UPSTREAM_DB_PORT: 5432
50 | ports:
51 | - 60002:8001
52 | stdin_open: true
53 | tty: true
54 | volumes:
55 | - ./ingestion_server:/ingestion-server
56 | integration-upstream:
57 | environment:
58 | POSTGRES_DB: openledger
59 | POSTGRES_HOST: 0.0.0.0
60 | POSTGRES_PASSWORD: deploy
61 | POSTGRES_USER: deploy
62 | healthcheck:
63 | test: pg_isready -U deploy -d openledger
64 | image: postgres:10.3-alpine
65 | ports:
66 | - 59999:5432
67 | volumes:
68 | - ./mock_data:/mock_data
69 | version: '3'
70 |
--------------------------------------------------------------------------------
/ingestion_server/test/mock_data/no_constraints_schema.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- PostgreSQL database dump
3 | --
4 |
5 | -- Dumped from database version 10.3
6 | -- Dumped by pg_dump version 10.3 (Debian 10.3-1.pgdg90+1)
7 |
8 | SET statement_timeout = 0;
9 | SET lock_timeout = 0;
10 | SET idle_in_transaction_session_timeout = 0;
11 | SET client_encoding = 'UTF8';
12 | SET standard_conforming_strings = on;
13 | SET check_function_bodies = false;
14 | SET client_min_messages = warning;
15 | SET row_security = off;
16 |
17 | SET default_tablespace = '';
18 |
19 | SET default_with_oids = false;
20 |
21 | --
22 | -- Name: image; Type: TABLE; Schema: public; Owner: deploy
23 | --
24 |
25 | CREATE TABLE public.image (
26 | id integer NOT NULL,
27 | created_on timestamp with time zone NOT NULL,
28 | updated_on timestamp with time zone NOT NULL,
29 | identifier character varying(255),
30 | perceptual_hash character varying(255),
31 | provider character varying(80),
32 | source character varying(80),
33 | foreign_identifier character varying(1000),
34 | foreign_landing_url character varying(1000),
35 | url character varying(1000) NOT NULL,
36 | thumbnail character varying(1000),
37 | width integer,
38 | height integer,
39 | filesize integer,
40 | license character varying(50) NOT NULL,
41 | license_version character varying(25),
42 | creator character varying(2000),
43 | creator_url character varying(2000),
44 | title character varying(2000),
45 | tags_list character varying(255)[],
46 | last_synced_with_source timestamp with time zone,
47 | removed_from_source boolean NOT NULL,
48 | meta_data jsonb,
49 | view_count integer NOT NULL,
50 | tags jsonb NOT NULL,
51 | watermarked boolean NOT NULL
52 | );
53 |
54 |
55 | ALTER TABLE public.image OWNER TO deploy;
56 |
57 | --
58 | -- Name: image_id_seq; Type: SEQUENCE; Schema: public; Owner: deploy
59 | --
60 |
61 | CREATE SEQUENCE public.image_id_seq
62 | AS integer
63 | START WITH 1
64 | INCREMENT BY 1
65 | NO MINVALUE
66 | NO MAXVALUE
67 | CACHE 1;
68 |
69 |
70 | ALTER TABLE public.image_id_seq OWNER TO deploy;
71 |
72 | --
73 | -- Name: image_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: deploy
74 | --
75 |
76 | ALTER SEQUENCE public.image_id_seq OWNED BY public.image.id;
77 |
78 |
79 | --
80 | -- Name: image id; Type: DEFAULT; Schema: public; Owner: deploy
81 | --
82 |
83 | ALTER TABLE ONLY public.image ALTER COLUMN id SET DEFAULT nextval('public.image_id_seq'::regclass);
84 |
85 |
86 |
87 | --
88 | -- Name: image image_pkey; Type: CONSTRAINT; Schema: public; Owner: deploy
89 | --
90 |
91 | ALTER TABLE ONLY public.image
92 | ADD CONSTRAINT image_pkey PRIMARY KEY (id);
93 |
94 |
95 | --
96 | -- Name: image_foreign_identifier_4c72d3ee_like; Type: INDEX; Schema: public; Owner: deploy
97 | --
98 |
99 | CREATE INDEX image_foreign_identifier_4c72d3ee_like ON public.image USING btree (foreign_identifier varchar_pattern_ops);
100 |
101 |
102 | --
103 | -- Name: image_identifier_d102a6e0_like; Type: INDEX; Schema: public; Owner: deploy
104 | --
105 |
106 | CREATE INDEX image_identifier_d102a6e0_like ON public.image USING btree (identifier varchar_pattern_ops);
107 |
108 |
109 | --
110 | -- Name: image_last_synced_with_source_187adf09; Type: INDEX; Schema: public; Owner: deploy
111 | --
112 |
113 | CREATE INDEX image_last_synced_with_source_187adf09 ON public.image USING btree (last_synced_with_source);
114 |
115 |
116 | --
117 | -- Name: image_perceptual_hash_0d126a7a; Type: INDEX; Schema: public; Owner: deploy
118 | --
119 |
120 | CREATE INDEX image_perceptual_hash_0d126a7a ON public.image USING btree (perceptual_hash);
121 |
122 |
123 | --
124 | -- Name: image_perceptual_hash_0d126a7a_like; Type: INDEX; Schema: public; Owner: deploy
125 | --
126 |
127 | CREATE INDEX image_perceptual_hash_0d126a7a_like ON public.image USING btree (perceptual_hash varchar_pattern_ops);
128 |
129 |
130 | --
131 | -- Name: image_provider_7d11f847; Type: INDEX; Schema: public; Owner: deploy
132 | --
133 |
134 | CREATE INDEX image_provider_7d11f847 ON public.image USING btree (provider);
135 |
136 |
137 | --
138 | -- Name: image_provider_7d11f847_like; Type: INDEX; Schema: public; Owner: deploy
139 | --
140 |
141 | CREATE INDEX image_provider_7d11f847_like ON public.image USING btree (provider varchar_pattern_ops);
142 |
143 |
144 | --
145 | -- Name: image_source_d5a89e97; Type: INDEX; Schema: public; Owner: deploy
146 | --
147 |
148 | CREATE INDEX image_source_d5a89e97 ON public.image USING btree (source);
149 |
150 |
151 | --
152 | -- Name: image_source_d5a89e97_like; Type: INDEX; Schema: public; Owner: deploy
153 | --
154 |
155 | CREATE INDEX image_source_d5a89e97_like ON public.image USING btree (source varchar_pattern_ops);
156 |
157 |
158 | --
159 | -- Name: image_url_c6aabda2_like; Type: INDEX; Schema: public; Owner: deploy
160 | --
161 |
162 | CREATE INDEX image_url_c6aabda2_like ON public.image USING btree (url varchar_pattern_ops);
163 |
164 |
165 | --
166 | -- PostgreSQL database dump complete
167 | --
168 |
169 |
--------------------------------------------------------------------------------
/ingestion_server/test/mock_data/schema.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- PostgreSQL database dump
3 | --
4 |
5 | -- Dumped from database version 10.3
6 | -- Dumped by pg_dump version 10.3 (Debian 10.3-1.pgdg90+1)
7 |
8 | SET statement_timeout = 0;
9 | SET lock_timeout = 0;
10 | SET idle_in_transaction_session_timeout = 0;
11 | SET client_encoding = 'UTF8';
12 | SET standard_conforming_strings = on;
13 | SET check_function_bodies = false;
14 | SET client_min_messages = warning;
15 | SET row_security = off;
16 |
17 | SET default_tablespace = '';
18 |
19 | SET default_with_oids = false;
20 |
21 | --
22 | -- Name: image; Type: TABLE; Schema: public; Owner: deploy
23 | --
24 |
25 | CREATE TABLE public.image (
26 | id integer NOT NULL,
27 | created_on timestamp with time zone NOT NULL,
28 | updated_on timestamp with time zone NOT NULL,
29 | identifier character varying(255),
30 | perceptual_hash character varying(255),
31 | provider character varying(80),
32 | source character varying(80),
33 | foreign_identifier character varying(1000),
34 | foreign_landing_url character varying(1000),
35 | url character varying(1000) NOT NULL,
36 | thumbnail character varying(1000),
37 | width integer,
38 | height integer,
39 | filesize integer,
40 | license character varying(50) NOT NULL,
41 | license_version character varying(25),
42 | creator character varying(2000),
43 | creator_url character varying(2000),
44 | title character varying(2000),
45 | tags_list character varying(255)[],
46 | last_synced_with_source timestamp with time zone,
47 | removed_from_source boolean NOT NULL,
48 | meta_data jsonb,
49 | view_count integer NOT NULL,
50 | tags jsonb NOT NULL,
51 | watermarked boolean NOT NULL
52 | );
53 |
54 |
55 | ALTER TABLE public.image OWNER TO deploy;
56 |
57 | --
58 | -- Name: image_id_seq; Type: SEQUENCE; Schema: public; Owner: deploy
59 | --
60 |
61 | CREATE SEQUENCE public.image_id_seq
62 | AS integer
63 | START WITH 1
64 | INCREMENT BY 1
65 | NO MINVALUE
66 | NO MAXVALUE
67 | CACHE 1;
68 |
69 |
70 | ALTER TABLE public.image_id_seq OWNER TO deploy;
71 |
72 | --
73 | -- Name: image_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: deploy
74 | --
75 |
76 | ALTER SEQUENCE public.image_id_seq OWNED BY public.image.id;
77 |
78 |
79 | --
80 | -- Name: image id; Type: DEFAULT; Schema: public; Owner: deploy
81 | --
82 |
83 | ALTER TABLE ONLY public.image ALTER COLUMN id SET DEFAULT nextval('public.image_id_seq'::regclass);
84 |
85 |
86 | --
87 | -- Name: image image_foreign_identifier_key; Type: CONSTRAINT; Schema: public; Owner: deploy
88 | --
89 |
90 | ALTER TABLE ONLY public.image
91 | ADD CONSTRAINT image_foreign_identifier_key UNIQUE (foreign_identifier);
92 |
93 |
94 | --
95 | -- Name: image image_identifier_key; Type: CONSTRAINT; Schema: public; Owner: deploy
96 | --
97 |
98 | ALTER TABLE ONLY public.image
99 | ADD CONSTRAINT image_identifier_key UNIQUE (identifier);
100 |
101 |
102 | --
103 | -- Name: image image_pkey; Type: CONSTRAINT; Schema: public; Owner: deploy
104 | --
105 |
106 | ALTER TABLE ONLY public.image
107 | ADD CONSTRAINT image_pkey PRIMARY KEY (id);
108 |
109 |
110 | --
111 | -- Name: image image_url_key; Type: CONSTRAINT; Schema: public; Owner: deploy
112 | --
113 |
114 | ALTER TABLE ONLY public.image
115 | ADD CONSTRAINT image_url_key UNIQUE (url);
116 |
117 |
118 | --
119 | -- Name: image_foreign_identifier_4c72d3ee_like; Type: INDEX; Schema: public; Owner: deploy
120 | --
121 |
122 | CREATE INDEX image_foreign_identifier_4c72d3ee_like ON public.image USING btree (foreign_identifier varchar_pattern_ops);
123 |
124 |
125 | --
126 | -- Name: image_identifier_d102a6e0_like; Type: INDEX; Schema: public; Owner: deploy
127 | --
128 |
129 | CREATE INDEX image_identifier_d102a6e0_like ON public.image USING btree (identifier varchar_pattern_ops);
130 |
131 |
132 | --
133 | -- Name: image_last_synced_with_source_187adf09; Type: INDEX; Schema: public; Owner: deploy
134 | --
135 |
136 | CREATE INDEX image_last_synced_with_source_187adf09 ON public.image USING btree (last_synced_with_source);
137 |
138 |
139 | --
140 | -- Name: image_perceptual_hash_0d126a7a; Type: INDEX; Schema: public; Owner: deploy
141 | --
142 |
143 | CREATE INDEX image_perceptual_hash_0d126a7a ON public.image USING btree (perceptual_hash);
144 |
145 |
146 | --
147 | -- Name: image_perceptual_hash_0d126a7a_like; Type: INDEX; Schema: public; Owner: deploy
148 | --
149 |
150 | CREATE INDEX image_perceptual_hash_0d126a7a_like ON public.image USING btree (perceptual_hash varchar_pattern_ops);
151 |
152 |
153 | --
154 | -- Name: image_provider_7d11f847; Type: INDEX; Schema: public; Owner: deploy
155 | --
156 |
157 | CREATE INDEX image_provider_7d11f847 ON public.image USING btree (provider);
158 |
159 |
160 | --
161 | -- Name: image_provider_7d11f847_like; Type: INDEX; Schema: public; Owner: deploy
162 | --
163 |
164 | CREATE INDEX image_provider_7d11f847_like ON public.image USING btree (provider varchar_pattern_ops);
165 |
166 |
167 | --
168 | -- Name: image_source_d5a89e97; Type: INDEX; Schema: public; Owner: deploy
169 | --
170 |
171 | CREATE INDEX image_source_d5a89e97 ON public.image USING btree (source);
172 |
173 |
174 | --
175 | -- Name: image_source_d5a89e97_like; Type: INDEX; Schema: public; Owner: deploy
176 | --
177 |
178 | CREATE INDEX image_source_d5a89e97_like ON public.image USING btree (source varchar_pattern_ops);
179 |
180 |
181 | --
182 | -- Name: image_url_c6aabda2_like; Type: INDEX; Schema: public; Owner: deploy
183 | --
184 |
185 | CREATE INDEX image_url_c6aabda2_like ON public.image USING btree (url varchar_pattern_ops);
186 |
187 |
188 | --
189 | -- PostgreSQL database dump complete
190 | --
191 |
192 |
--------------------------------------------------------------------------------
/ingestion_server/test/mock_data/update_mocks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Fetches mock data from a running postgres database.
4 |
5 | export PGPASSWORD="deploy"
6 | # Dump schema
7 | pg_dump -s -h localhost -U deploy -d openledger -t 'image' > schema.sql
8 | # Remove search path (so we can refer to the public schema implicitly)
9 | sed -i -e '/search_path/d' schema.sql
10 | # Select some images and save to CSV
11 | psql -h localhost -U deploy -d openledger -c "\\copy (select * from image where meta_data is not null limit 1000) to './mocked_images.csv' with CSV"
12 | exit 0
13 |
--------------------------------------------------------------------------------
/ingestion_server/test/unit_tests.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import datetime
3 | from uuid import uuid4
4 | from psycopg2.extras import Json
5 | from ingestion_server.cleanup import CleanupFunctions
6 | from ingestion_server.elasticsearch_models import Image
7 |
8 |
9 | def create_mock_image(override=None):
10 | """
11 | Produce a mock image. Override default fields by passing in a dict with the
12 | desired keys and values.
13 |
14 | For example, to make an image with a custom title and default everything
15 | else:
16 | >>> create_mock_image({'title': 'My title'})
17 | :return: An Image Elasticsearch document built from the merged test data.
18 | """
19 | test_popularity = {
20 | 'views': 50,
21 | 'likes': 3,
22 | 'comments': 1
23 | }
24 | license_url = 'https://creativecommons.org/licenses/by/2.0/fr/legalcode'
25 | meta_data = {
26 | 'popularity_metrics': test_popularity,
27 | 'license_url': license_url
28 | }
29 | test_data = {
30 | 'id': 0,
31 | 'title': 'Unit test title',
32 | 'identifier': str(uuid4()),
33 | 'creator': 'Eric Idle',
34 | 'creator_url': 'https://creativecommons.org',
35 | 'tags': [{'name': 'test', 'accuracy': 0.9}],
36 | 'created_on': datetime.datetime.now(),
37 | 'url': 'https://creativecommons.org',
38 | 'thumbnail': 'https://creativecommons.org',
39 | 'provider': 'test',
40 | 'source': 'test',
41 | 'license': 'cc-by',
42 | 'license_version': '4.0',
43 | 'foreign_landing_url': 'https://creativecommons.org',
44 | 'view_count': 0,
45 | 'height': 500,
46 | 'width': 500,
47 | 'mature': False,
48 | 'meta_data': meta_data
49 | }
50 | if override:
51 | for k, v in override.items():
52 | test_data[k] = v
53 | schema = {}
54 | row = []
55 | idx = 0
56 | for k, v in test_data.items():
57 | schema[k] = idx
58 | row.append(v)
59 | idx += 1
60 | return Image.database_row_to_elasticsearch_doc(row, schema)
61 |
62 |
63 | class TestImage:
64 | @staticmethod
65 | def test_size():
66 | small = create_mock_image({'height': 600, 'width': 300})
67 | assert small.size == Image.ImageSizes.SMALL.name.lower()
68 | huge = create_mock_image({'height': 4096, 'width': 4096})
69 | assert huge.size == Image.ImageSizes.LARGE.name.lower()
70 |
71 | @staticmethod
72 | def test_aspect_ratio():
73 | square = create_mock_image({'height': 300, 'width': 300})
74 | assert square.aspect_ratio == Image.AspectRatios.SQUARE.name.lower()
75 | tall = create_mock_image({'height': 500, 'width': 200})
76 | assert tall.aspect_ratio == Image.AspectRatios.TALL.name.lower()
77 | wide = create_mock_image({'height': 200, 'width': 500})
78 | assert wide.aspect_ratio == Image.AspectRatios.WIDE.name.lower()
79 |
80 | @staticmethod
81 | def test_extension():
82 | no_extension = create_mock_image({
83 | 'url': 'https://creativecommons.org/hello'
84 | })
85 | assert no_extension.extension is None
86 | jpg = create_mock_image({
87 | 'url': 'https://creativecommons.org/hello.jpg'
88 | })
89 | assert jpg.extension == 'jpg'
90 |
91 | @staticmethod
92 | def test_mature_metadata():
93 | # Received upstream indication the work is mature
94 | meta = {
95 | 'mature': True
96 | }
97 | mature_metadata = create_mock_image({'meta_data': meta})
98 | assert mature_metadata['mature']
99 |
100 | @staticmethod
101 | def test_mature_api():
102 | # Manually flagged work as mature ourselves
103 | mature_work = create_mock_image({'mature': True})
104 | assert mature_work['mature']
105 |
106 | @staticmethod
107 | def test_default_maturity():
108 | # Default to not flagged
109 | sfw = create_mock_image()
110 | assert not sfw['mature']
111 |
112 |
113 | class TestCleanup:
114 | @staticmethod
115 | def test_tag_blacklist():
116 | tags = [
117 | {
118 | 'name': 'cc0'
119 | },
120 | {
121 | 'name': ' cc0'
122 | },
123 | {
124 | 'name': 'valid',
125 | 'accuracy': 0.99
126 | },
127 | {
128 | 'name': 'valid_no_accuracy'
129 | },
130 | {
131 | 'name': 'garbage:=metacrap',
132 | }
133 | ]
134 | result = str(CleanupFunctions.cleanup_tags(tags))
135 | expected = str(Json([
136 | {'name': 'valid', 'accuracy': 0.99},
137 | {'name': 'valid_no_accuracy'}
138 | ]))
139 |
140 | assert result == expected
141 |
142 | @staticmethod
143 | def test_tag_no_update():
144 | tags = [
145 | {
146 | 'name': 'valid',
147 | 'accuracy': 0.92
148 | }
149 | ]
150 | result = CleanupFunctions.cleanup_tags(tags)
151 | assert result is None
152 |
153 | @staticmethod
154 | def test_accuracy_filter():
155 | tags = [
156 | {
157 | 'name': 'inaccurate',
158 | 'accuracy': 0.5
159 | },
160 | {
161 | 'name': 'accurate',
162 | 'accuracy': 0.999
163 | }
164 | ]
165 | result = str(CleanupFunctions.cleanup_tags(tags))
166 | expected = str(Json([{'name': 'accurate', 'accuracy': 0.999}]))
167 | assert result == expected
168 |
169 | @staticmethod
170 | def test_url_protocol_fix():
171 | bad_url = 'flickr.com'
172 | tls_support_cache = {}
173 | result = CleanupFunctions.cleanup_url(bad_url, tls_support_cache)
174 | expected = "'https://flickr.com'"
175 |
176 | bad_http = 'neverssl.com'
177 | result_http = CleanupFunctions.cleanup_url(bad_http, tls_support_cache)
178 | expected_http = "'http://neverssl.com'"
179 | assert result == expected
180 | assert result_http == expected_http
181 |
182 | @staticmethod
183 | def test_rank_feature_verify():
184 | img = create_mock_image({'standardized_popularity': 200})
185 | assert img.standardized_popularity == 100
186 | img2 = create_mock_image({'standardized_popularity': 0})
187 | assert img2.standardized_popularity is None
188 |
--------------------------------------------------------------------------------
/initialization.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cc-archive/cccatalog-api/731074ee543d50edac9aacb9aa0362d03a6bec41/initialization.PNG
--------------------------------------------------------------------------------
/load_sample_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | CCCAPI_CONTAINER_NAME="${CCCAPI_CONTAINER_NAME:-cccatalog-api_web_1}"
4 | ANALYTICS_CONTAINER_NAME="${ANALYTICS_CONTAINER_NAME:-cccatalog-api_analytics_1}"
5 | # Set up API database and upstream
6 | docker exec -i $CCCAPI_CONTAINER_NAME /bin/bash -c 'python3 manage.py migrate --noinput'
7 | # Create a user for integration testing.
8 | docker exec -i $CCCAPI_CONTAINER_NAME /bin/bash <<'EOF'
9 | python3 manage.py shell -c "from django.contrib.auth.models import User
10 | user = User.objects.create_user('continuous_integration', 'test@test.test', 'deploydeploy')
11 | user.save()
12 | "
13 | EOF
14 | # Migrate analytics
15 | docker exec -i $ANALYTICS_CONTAINER_NAME /bin/bash -c 'PYTHONPATH=. pipenv run alembic upgrade head'
16 | PGPASSWORD=deploy pg_dump -s -t image -U deploy -d openledger -h localhost -p 5432 | PGPASSWORD=deploy psql -U deploy -d openledger -p 5433 -h localhost
17 | # Load sample data
18 | PGPASSWORD=deploy psql -U deploy -d openledger -h localhost -p 5432 -c "INSERT INTO content_provider (created_on, provider_identifier, provider_name, domain_name, filter_content) VALUES (now(), 'flickr', 'Flickr', 'https://www.flickr.com', false), (now(), 'behance', 'Behance', 'https://www.behance.net', false);"
19 | PGPASSWORD=deploy psql -U deploy -d openledger -h localhost -p 5433 <