├── requirements.dev.txt ├── .gitignore ├── pyproject.toml ├── schema ├── schema.sh ├── notesreview.users.json ├── notesreview.notes.json └── schema.json ├── api ├── models │ ├── comment.py │ └── note.py ├── grammars │ └── users.lark ├── auth.py └── query.py ├── requirements.txt ├── Makefile ├── config.py ├── .github └── workflows │ └── lint.yml ├── scripts ├── iteration.py ├── indices.py ├── delete.py ├── import.py └── update.py ├── README.md ├── blueprints ├── status.py ├── auth.py └── search.py └── app.py /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | black==25.9.0 2 | isort==6.1.0 3 | flake8==7.3.0 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | venv/ 3 | 4 | __pycache__/ 5 | __init__.py 6 | 7 | scripts/*.txt 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | 4 | [tool.isort] 5 | line_length = 79 6 | profile = "black" 7 | -------------------------------------------------------------------------------- /schema/schema.sh: -------------------------------------------------------------------------------- 1 | jq -n 'reduce inputs as $s (.; .[input_filename|rtrimstr(".json")] += $s)' [!schema]*.json > schema.json 2 | -------------------------------------------------------------------------------- /api/models/comment.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Comment: 7 | date: datetime.datetime 8 | action: str 9 | uid: int 10 | user: str 11 | text: str 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lark==1.3.0 2 | lxml==6.0.2 3 | motor==3.7.1 4 | orjson==3.11.3 5 | pyjwt[crypto]==2.10.1 6 | pymongo==4.15.2 7 | python-dateutil==2.9.0.post0 8 | python-dotenv==1.1.1 9 | requests==2.32.5 10 | sanic[ext]==25.3.0 11 | tqdm==4.67.1 12 | -------------------------------------------------------------------------------- /api/models/note.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | from typing import List, Tuple 4 | 5 | from .comment import Comment 6 | 7 | 8 | @dataclass 9 | class Note: 10 | _id: int 11 | coordinates: Tuple[float, float] 12 | status: str 13 | updated_at: datetime.datetime 14 | comments: List[Comment] 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LINT_FILES = app.py config.py api/ blueprints/ scripts/ 2 | 3 | install: 4 | pip install -r requirements.txt 5 | 6 | install-dev: 7 | pip install -r requirements.dev.txt 8 | 9 | lint: 10 | flake8 --count --extend-ignore=E501 --show-source --statistics $(LINT_FILES) 11 | 12 | format: 13 | isort --check-only $(LINT_FILES) 14 | black --check --diff --skip-string-normalization $(LINT_FILES) 15 | 16 | download: 17 | curl -o notes.osn.bz2 https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/notes/planet-notes-latest.osn.bz2 && bzip2 -d notes.osn.bz2 18 | 
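The `Comment` and `Note` dataclasses above mirror the fields enforced by the validation schemas in `schema/`. As a minimal sketch (not part of the repository; `note_from_document` is a hypothetical helper), a raw document from the `notesreview.notes` collection could be mapped onto these models like this:

```python
# Hypothetical helper, shown for illustration only: the API itself passes the
# raw MongoDB documents straight through to the JSON response.
from api.models.comment import Comment
from api.models.note import Note


def note_from_document(document: dict) -> Note:
    return Note(
        _id=document['_id'],
        # Stored as [longitude, latitude], see scripts/import.py
        coordinates=tuple(document['coordinates']),
        status=document['status'],
        updated_at=document['updated_at'],
        comments=[
            Comment(
                date=comment['date'],
                action=comment['action'],
                uid=comment.get('uid'),    # absent for anonymous comments
                user=comment.get('user'),  # absent for anonymous comments
                text=comment.get('text', ''),  # omitted when there is no text
            )
            for comment in document['comments']
        ],
    )
```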
-------------------------------------------------------------------------------- /api/grammars/users.lark: -------------------------------------------------------------------------------- 1 | start: exp (_sep " "? exp)* 2 | 3 | exp: not | user 4 | not: ("NOT " | "-") user 5 | ?user: "\""? name "\""? 6 | 7 | _sep: "," | ";" 8 | ?name: WORD+ 9 | 10 | // Allow any characters except invalid ranges as defined in 11 | // https://github.com/openstreetmap/openstreetmap-website/blob/9aa3e7a/app/validators/characters_validator.rb#L2-L3 12 | // by using negative lookaheads (https://stackoverflow.com/questions/1687620/regex-match-everything-but-a-specific-pattern) 13 | // 14 | // Additionally " is not allowed as well 15 | WORD: /(?![\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\ufffe\uffff\/;\.,\?%#\"])./+ 16 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | 7 | # fmt: off 8 | config = dict( 9 | DEFAULT_LIMIT=50, 10 | MAX_LIMIT=250, 11 | ROOT_PATH=os.path.dirname(os.path.realpath(__file__)), 12 | DB_USER=os.environ.get('DB_USER'), 13 | DB_PASSWORD=os.environ.get('DB_PASSWORD'), 14 | DB_HOST=os.environ.get('DB_HOST'), 15 | OPENSTREETMAP_OAUTH_JWKS_URI=os.environ.get('OPENSTREETMAP_OAUTH_JWKS_URI'), 16 | OPENSTREETMAP_OAUTH_CLIENT_ID=os.environ.get('OPENSTREETMAP_OAUTH_CLIENT_ID'), 17 | CORS_ORIGINS='*', 18 | CORS_ALWAYS_SEND=False, 19 | ) 20 | # fmt: on 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Set up Python 11 | uses: actions/setup-python@v5 12 | with: 13 | python-version: '3.x' 14 | cache: 'pip' 15 | 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | if [ -f requirements.txt ]; then make install; fi 20 | - name: Install dependencies for development 21 | run: | 22 | if [ -f requirements.dev.txt ]; then make install-dev; fi 23 | 24 | - name: Lint 25 | run: | 26 | make lint 27 | - name: Format 28 | run: | 29 | make format 30 | -------------------------------------------------------------------------------- /schema/notesreview.users.json: -------------------------------------------------------------------------------- 1 | { 2 | "$jsonSchema": { 3 | "bsonType": "object", 4 | "required": [ 5 | "_id", 6 | "created_at", 7 | "last_validated_at", 8 | "token", 9 | "user" 10 | ], 11 | "properties": { 12 | "_id": { 13 | "bsonType": "int", 14 | "description": "must be an int and is required" 15 | }, 16 | "created_at": { 17 | "bsonType": "date", 18 | "description": "must be a date and is required" 19 | }, 20 | "last_validated_at": { 21 | "bsonType": "date", 22 | "description": "must be a date and is required" 23 | }, 24 | "token": { 25 | "bsonType": [ 26 | "string", 27 | "null" 28 | ], 29 | "description": "must be a string or null and is required" 30 | }, 31 | "user": { 32 | "bsonType": "string", 33 | "description": "must be a string and is required" 34 | } 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scripts/iteration.py: -------------------------------------------------------------------------------- 1 | # The previous iteration 
method used too much memory; the following resources describe how to reduce it: 2 | # https://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files 3 | # https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory 4 | # https://web.archive.org/web/20210309115224/http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 5 | def fast_iter(context, func, *args, **kwargs): 6 | """ 7 | http://lxml.de/parsing.html#modifying-the-tree 8 | Based on Liza Daly's fast_iter 9 | http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 10 | See also http://effbot.org/zone/element-iterparse.htm 11 | """ 12 | for event, element in context: 13 | func(element, *args, **kwargs) 14 | # It's safe to call clear() here because no descendants will be accessed 15 | element.clear() 16 | # Also eliminate now-empty references from the root node to the element 17 | for ancestor in element.xpath('ancestor-or-self::*'): 18 | while ancestor.getprevious() is not None: 19 | del ancestor.getparent()[0] 20 | del context 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # notesreview-api 2 | > API and backend for [notesreview](https://github.com/ENT8R/notesreview) 3 | 4 | ## Scripts 5 | 6 | #### `delete.py` 7 | ```sh 8 | # Deletes all notes that are not included in the notes dump 9 | python scripts/delete.py notes.osn 10 | ``` 11 | 12 | --- 13 | 14 | #### `import.py` 15 | ```sh 16 | # Imports all notes from the notes dump 17 | python scripts/import.py notes.osn 18 | ``` 19 | 20 | --- 21 | 22 | #### `indices.py` 23 | ```sh 24 | # Creates all necessary indices for the database 25 | python scripts/indices.py 26 | ``` 27 | --- 28 | 29 | #### `update.py` 30 | ```sh 31 | # Updates the database by querying the OSM Notes API 32 | # in order to receive the latest notes 33 | # since a given date of the last check 34 | python scripts/update.py 35 | ``` 36 | 37 | ## Notes Dump 38 | 39 | ##### Download 40 | ```sh 41 | # Download and extract the notes dump 42 | # (hosted on https://planet.openstreetmap.org/ or any other mirror) 43 | 44 | # ${URL} needs to be replaced with the location of the notes dump 45 | curl -L -o notes.osn.bz2 ${URL} && pbzip2 -d notes.osn.bz2 46 | ``` 47 | 48 | ##### XML Structure 49 | The structure of the notes dump follows this scheme: 50 | ```xml 51 | <osm-notes> 52 | <note id="${id}" lat="${latitude}" lon="${longitude}" created_at="${created_at}" closed_at="${closed_at|optional}"> 53 | <comment action="${action}" timestamp="${timestamp}" uid="${uid|optional}" user="${user|optional}"> 54 | ${comment|optional}</comment> 55 | </note> 56 | </osm-notes> 57 | ``` -------------------------------------------------------------------------------- /blueprints/status.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sanic import Blueprint 4 | from sanic.response import json 5 | from sanic_ext import openapi 6 | 7 | from config import config 8 | 9 | blueprint = Blueprint('Status', url_prefix='/status') 10 | 11 | 12 | @blueprint.route('/') 13 | @openapi.description( 14 | 'Status information about the database and update frequency' 15 | ) 16 | @openapi.response( 17 | 200, 18 | { 19 | 'application/json': openapi.Object( 20 | properties={ 21 | 'last_import': openapi.DateTime(), 22 | 'last_sync': openapi.DateTime(), 23 | 'last_update': openapi.DateTime(), 24 | } 25 | ), 26 | }, 27 | 'The response is an object with the currently available status information', 28 | ) 29 | async def status(request): 30 | last_import = None 31 | last_sync = None 32 | last_update = None 33 | 34 | with ( 35 | open( 36 | os.path.join(config['ROOT_PATH'], 
'scripts', 'LAST_IMPORT.txt') 37 | ) as file1, 38 | open( 39 | os.path.join(config['ROOT_PATH'], 'scripts', 'LAST_SYNC.txt') 40 | ) as file2, 41 | open( 42 | os.path.join(config['ROOT_PATH'], 'scripts', 'LAST_UPDATE.txt') 43 | ) as file3, 44 | ): 45 | last_import = file1.read().strip() 46 | last_sync = file2.read().strip() 47 | last_update = file3.read().strip() 48 | 49 | return json( 50 | { 51 | 'last_import': last_import, 52 | 'last_sync': last_sync, 53 | 'last_update': last_update, 54 | } 55 | ) 56 | -------------------------------------------------------------------------------- /scripts/indices.py: -------------------------------------------------------------------------------- 1 | # import json 2 | import os 3 | 4 | import pymongo 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | client = pymongo.MongoClient( 10 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@{os.environ.get("DB_HOST")}:27017/?authSource=notesreview' 11 | ) 12 | db = client.notesreview 13 | 14 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 15 | RUN_IN_BACKGROUND = False 16 | 17 | # Apply validation schemas (requires the collection to exist) 18 | # TODO: This operation requires admin access 19 | # with open(os.path.join(DIRECTORY, '..', 'schema', 'schema.json')) as schema: 20 | # schema = json.load(schema) 21 | # db.command({ 22 | # 'collMod': 'notes', 23 | # 'validator': schema['notesreview.notes'] 24 | # }) 25 | 26 | # Create indices used for faster queries 27 | db.notes.create_index( 28 | [('updated_at', pymongo.DESCENDING)], 29 | name='updated_at', 30 | background=RUN_IN_BACKGROUND, 31 | ) 32 | db.notes.create_index( 33 | [('comments.0.date', pymongo.DESCENDING)], 34 | name='created_at', 35 | background=RUN_IN_BACKGROUND, 36 | ) 37 | db.notes.create_index( 38 | [('coordinates', pymongo.GEOSPHERE)], 39 | name='coordinates', 40 | background=RUN_IN_BACKGROUND, 41 | ) 42 | db.notes.create_index('status', name='status', background=RUN_IN_BACKGROUND) 43 | db.notes.create_index( 44 | 'comments.0.user', name='author', background=RUN_IN_BACKGROUND 45 | ) 46 | db.notes.create_index( 47 | 'comments.user', name='user', background=RUN_IN_BACKGROUND 48 | ) 49 | db.notes.create_index( 50 | [('comments.text', pymongo.TEXT)], 51 | default_language='none', 52 | name='text', 53 | background=RUN_IN_BACKGROUND, 54 | ) 55 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from jwt import PyJWKClient 4 | from motor.motor_asyncio import AsyncIOMotorClient 5 | from sanic import Blueprint, Sanic 6 | 7 | from api.auth import attach_uid 8 | from blueprints.auth import blueprint as auth 9 | # from blueprints.notes import blueprint as notes  # disabled: blueprints/notes.py does not exist and the blueprint was never registered 10 | from blueprints.search import blueprint as search 11 | from blueprints.status import blueprint as status 12 | from config import config 13 | 14 | app = Sanic(__name__) 15 | app.config.update(config) 16 | 17 | app.ext.openapi.describe( 18 | 'notesreview-api', 19 | version='0.1.0', 20 | description=dedent( 21 | """\ 22 | # Information 23 | This API is still subject to change. In particular, the behavior of the `query` parameter is likely to change, 24 | as its capabilities are currently limited. 
25 | """ 26 | ), 27 | ) 28 | app.ext.openapi.add_security_scheme( 29 | 'token', 30 | 'http', 31 | scheme='bearer', 32 | bearer_format='JWT', 33 | description='OpenID Connect Token issued by OpenStreetMap', 34 | ) 35 | 36 | 37 | @app.before_server_start 38 | async def setup(app, loop): 39 | client = AsyncIOMotorClient( 40 | f'mongodb://{app.config.DB_USER}:{app.config.DB_PASSWORD}@{app.config.DB_HOST}:27017?authSource=notesreview', 41 | io_loop=loop, 42 | ) 43 | jwks_client = PyJWKClient(app.config.OPENSTREETMAP_OAUTH_JWKS_URI) 44 | 45 | app.ctx.client = client 46 | app.ctx.db = client.notesreview 47 | app.ctx.jwks_client = jwks_client 48 | 49 | 50 | @app.before_server_stop 51 | async def shutdown(app, loop): 52 | app.ctx.client.close() 53 | 54 | 55 | app.blueprint(Blueprint.group(auth, status, search, url_prefix='/api')) 56 | 57 | app.register_middleware(attach_uid, 'request') 58 | -------------------------------------------------------------------------------- /schema/notesreview.notes.json: -------------------------------------------------------------------------------- 1 | { 2 | "$jsonSchema": { 3 | "required": [ "_id", "coordinates", "status", "updated_at", "comments" ], 4 | "properties": { 5 | "_id": { 6 | "bsonType": "int", 7 | "description": "must be an int and is required" 8 | }, 9 | "coordinates": { 10 | "bsonType": "array", 11 | "description": "must be an array and is required" 12 | }, 13 | "status": { 14 | "enum": [ "open", "closed" ], 15 | "description": "can only be one of the enum values and is required" 16 | }, 17 | "updated_at": { 18 | "bsonType": "date", 19 | "description": "must be a date and is required" 20 | }, 21 | "comments": { 22 | "bsonType": "array", 23 | "description": "must be an array of objects and is required", 24 | "items": { 25 | "bsonType": "object", 26 | "required": [ 27 | "date", 28 | "action" 29 | ], 30 | "properties": { 31 | "date": { 32 | "bsonType": "date", 33 | "description": "must be a date and is required" 34 | }, 35 | "action": { 36 | "enum": [ "opened", "commented", "closed", "reopened", "hidden" ], 37 | "description": "can only be one of the enum values and is required" 38 | }, 39 | "text": { 40 | "bsonType": "string", 41 | "description": "must be a string and is not required" 42 | }, 43 | "uid": { 44 | "bsonType": "int", 45 | "description": "must be an int and is not required" 46 | }, 47 | "user": { 48 | "bsonType": "string", 49 | "description": "must be a string and is not required" 50 | } 51 | } 52 | } 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /blueprints/auth.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import jwt 4 | from sanic import Blueprint, Sanic 5 | from sanic.response import text 6 | from sanic_ext import openapi 7 | 8 | from api.auth import decode_token, protected 9 | 10 | blueprint = Blueprint('Authentication', url_prefix='/auth') 11 | 12 | 13 | @blueprint.get('/login') 14 | @openapi.description('Login with a valid OpenID Connect Token (JWT)') 15 | @openapi.secured('token') 16 | @openapi.response( 17 | 200, 18 | { 19 | 'text/plain': openapi.String(), 20 | }, 21 | 'OK', 22 | ) 23 | @openapi.response( 24 | 401, 25 | { 26 | 'text/plain': openapi.String(), 27 | }, 28 | 'Invalid token or unauthorized', 29 | ) 30 | async def login(request): 31 | token = request.token 32 | info = None 33 | 34 | if token is None: 35 | return text('No token provided', 401) 36 | 37 | try: 38 | info = decode_token(token) 
39 | except jwt.exceptions.InvalidTokenError: 40 | return text('The provided token is invalid', 401) 41 | 42 | # Do not proceed if the token does not contain the required information 43 | if info is None or 'sub' not in info or 'preferred_username' not in info: 44 | return text( 45 | 'The provided token can not be used for authentication', 401 46 | ) 47 | 48 | # Store or update the token and the user id in the database 49 | uid = int(info['sub']) 50 | username = info['preferred_username'] 51 | timestamp = datetime.datetime.now(datetime.timezone.utc) 52 | 53 | await Sanic.get_app().ctx.db.users.update_one( 54 | { 55 | '_id': uid, 56 | }, 57 | { 58 | '$setOnInsert': { 59 | '_id': uid, 60 | 'created_at': timestamp, 61 | }, 62 | '$set': { 63 | 'token': token, 64 | 'user': username, 65 | 'last_validated_at': timestamp, 66 | }, 67 | }, 68 | upsert=True, 69 | ) 70 | 71 | return text('OK', 200) 72 | 73 | 74 | @blueprint.get('/logout') 75 | @openapi.description('Logout with a valid OpenID Connect Token (JWT)') 76 | @openapi.secured('token') 77 | @openapi.response( 78 | 200, 79 | { 80 | 'text/plain': openapi.String(), 81 | }, 82 | 'OK', 83 | ) 84 | @protected 85 | async def logout(request): 86 | await Sanic.get_app().ctx.db.users.update_one( 87 | { 88 | '_id': request.ctx.uid, 89 | }, 90 | {'$set': {'token': None}}, 91 | ) 92 | return text('OK', 200) 93 | -------------------------------------------------------------------------------- /api/auth.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import jwt 4 | from sanic import HTTPResponse, Sanic 5 | from sanic.request import Request 6 | from sanic.response import text 7 | 8 | from config import config 9 | 10 | 11 | def protected(wrapped): 12 | def decorator(f): 13 | @wraps(f) 14 | async def decorated_function(request, *args, **kwargs): 15 | # Call the request handler only if there is a known uid for the 16 | # token which is already attached to the request context through 17 | # the middleware below before every request 18 | if request.ctx.uid is None: 19 | return text('You are unauthorized', 401) 20 | else: 21 | response = await f(request, *args, **kwargs) 22 | return response 23 | 24 | return decorated_function 25 | 26 | return decorator(wrapped) 27 | 28 | 29 | def decode_token(token): 30 | signing_key = Sanic.get_app().ctx.jwks_client.get_signing_key_from_jwt( 31 | token 32 | ) 33 | return jwt.decode( 34 | token, 35 | signing_key, 36 | audience=config['OPENSTREETMAP_OAUTH_CLIENT_ID'], 37 | options={'verify_exp': False}, 38 | algorithms=['RS256'], 39 | ) 40 | 41 | 42 | async def is_authenticated(request): 43 | token = request.token 44 | if token is None: 45 | return False 46 | 47 | info = None 48 | try: 49 | info = decode_token(token) 50 | except jwt.exceptions.InvalidTokenError: 51 | return False 52 | 53 | if info is None or 'sub' not in info: 54 | return False 55 | 56 | return ( 57 | await Sanic.get_app().ctx.db.users.find_one({'_id': int(info['sub'])}) 58 | is not None 59 | ) 60 | 61 | 62 | async def attach_uid(request): 63 | request.ctx.uid = None 64 | 65 | token = request.token 66 | if token is None: 67 | return 68 | 69 | # Validate the JWT and extract the user id (sub claim) 70 | info = None 71 | try: 72 | info = decode_token(token) 73 | except jwt.exceptions.InvalidTokenError: 74 | return 75 | 76 | # Do not attach a uid if there is no information after decoding the token 77 | if info is None or 'sub' not in info: 78 | return 79 | 80 | # Check if the user exists 
(logged in before) and is currently using this token 81 | uid = int(info['sub']) 82 | user = await Sanic.get_app().ctx.db.users.find_one({'_id': uid}) 83 | if user is None or user['token'] != token: 84 | return 85 | 86 | # Finally attach the uid to the request context 87 | request.ctx.uid = uid 88 | -------------------------------------------------------------------------------- /schema/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "notesreview.notes": { 3 | "$jsonSchema": { 4 | "required": [ 5 | "_id", 6 | "coordinates", 7 | "status", 8 | "updated_at", 9 | "comments" 10 | ], 11 | "properties": { 12 | "_id": { 13 | "bsonType": "int", 14 | "description": "must be an int and is required" 15 | }, 16 | "coordinates": { 17 | "bsonType": "array", 18 | "description": "must be an array and is required" 19 | }, 20 | "status": { 21 | "enum": [ 22 | "open", 23 | "closed" 24 | ], 25 | "description": "can only be one of the enum values and is required" 26 | }, 27 | "updated_at": { 28 | "bsonType": "date", 29 | "description": "must be a date and is required" 30 | }, 31 | "comments": { 32 | "bsonType": "array", 33 | "description": "must be an array of objects and is required", 34 | "items": { 35 | "bsonType": "object", 36 | "required": [ 37 | "date", 38 | "action" 39 | ], 40 | "properties": { 41 | "date": { 42 | "bsonType": "date", 43 | "description": "must be a date and is required" 44 | }, 45 | "action": { 46 | "enum": [ 47 | "opened", 48 | "commented", 49 | "closed", 50 | "reopened", 51 | "hidden" 52 | ], 53 | "description": "can only be one of the enum values and is required" 54 | }, 55 | "text": { 56 | "bsonType": "string", 57 | "description": "must be a string and is not required" 58 | }, 59 | "uid": { 60 | "bsonType": "int", 61 | "description": "must be an int and is not required" 62 | }, 63 | "user": { 64 | "bsonType": "string", 65 | "description": "must be a string and is not required" 66 | } 67 | } 68 | } 69 | } 70 | } 71 | } 72 | }, 73 | "notesreview.users": { 74 | "$jsonSchema": { 75 | "bsonType": "object", 76 | "required": [ 77 | "_id", 78 | "created_at", 79 | "last_validated_at", 80 | "token", 81 | "user" 82 | ], 83 | "properties": { 84 | "_id": { 85 | "bsonType": "int", 86 | "description": "must be an int and is required" 87 | }, 88 | "created_at": { 89 | "bsonType": "date", 90 | "description": "must be a date and is required" 91 | }, 92 | "last_validated_at": { 93 | "bsonType": "date", 94 | "description": "must be a date and is required" 95 | }, 96 | "token": { 97 | "bsonType": [ 98 | "string", 99 | "null" 100 | ], 101 | "description": "must be a string or null and is required" 102 | }, 103 | "user": { 104 | "bsonType": "string", 105 | "description": "must be a string and is required" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /scripts/delete.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import iteration 5 | from dotenv import load_dotenv 6 | from lxml import etree 7 | from pymongo import MongoClient 8 | from tqdm import tqdm 9 | 10 | load_dotenv() 11 | 12 | client = MongoClient( 13 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 14 | tz_aware=True, 15 | ) 16 | collection = client.notesreview.notes 17 | 18 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 19 | 20 | 21 | # Find all ids of the 
notes which are included in the current notes dump 22 | def ids(file): 23 | ids = set() 24 | last_id = 0 25 | 26 | def process_element(element): 27 | nonlocal last_id 28 | 29 | attributes = element.attrib 30 | id = int(attributes['id']) 31 | last_id = id 32 | ids.add(id) 33 | 34 | iteration.fast_iter( 35 | tqdm(etree.iterparse(file, tag='note', events=('end',))), 36 | process_element, 37 | ) 38 | return ids, last_id 39 | 40 | 41 | # Delete (or only print the ids of) all notes that are stored in the database but not included in the set of ids 42 | def delete(ids_in_dump, last_id, delete): 43 | ids_in_db = set() 44 | # Iterate over all documents with an id lower than the last id of the notes dump 45 | for note in tqdm( 46 | collection.find({}, {'_id': True}).max([('_id', last_id)]).hint('_id_') 47 | ): 48 | if note['_id'] not in ids_in_dump: 49 | # Add the id to the set if the note is in the database but not the dump 50 | ids_in_db.add(note['_id']) 51 | tqdm.write(str(note['_id'])) 52 | else: 53 | # Remove the id if the note is in the database and the dump 54 | ids_in_dump.remove(note['_id']) 55 | 56 | # ids_in_dump(_but_not_in_db) contains all notes that are in the dump but not in the database, 57 | # ids_in_db(_but_not_in_dump) contains all notes that are in the database but not in the dump 58 | tqdm.write( 59 | f'There are currently {len(ids_in_dump)} notes that are in the dump but not in the database' 60 | ) 61 | tqdm.write( 62 | f'There are currently {len(ids_in_db)} notes that are in the database but not in the dump' 63 | ) 64 | 65 | if delete: 66 | # Delete all notes that are currently in the database but not in the dump 67 | result = collection.delete_many( 68 | {'_id': {'$in': list(ids_in_db)}}, hint='_id_' 69 | ) 70 | tqdm.write( 71 | f'Deleted {result.deleted_count} notes which are not present in the notes dump anymore' 72 | ) 73 | # Use the creation date of the last note in the dump as the timestamp of the last synchronization 74 | last_note = collection.find_one({'_id': last_id}) 75 | last_date = last_note['comments'][0]['date'] 76 | with open(os.path.join(DIRECTORY, 'LAST_SYNC.txt'), 'w') as file: 77 | file.write(last_date.isoformat(timespec='seconds')) 78 | 79 | 80 | parser = argparse.ArgumentParser( 81 | description='Delete notes that are not included in the notes dump.' 
82 | ) 83 | parser.add_argument( 84 | 'file', type=str, help='path to the file which contains the notes dump' 85 | ) 86 | parser.add_argument( 87 | '--delete', 88 | default=False, 89 | action='store_true', 90 | help='confirm deletion of the notes', 91 | ) 92 | args = parser.parse_args() 93 | 94 | delete(*ids(args.file), args.delete) 95 | -------------------------------------------------------------------------------- /scripts/import.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | import textwrap 5 | 6 | import dateutil.parser 7 | import iteration 8 | from dotenv import load_dotenv 9 | from lxml import etree 10 | from pymongo import MongoClient, UpdateOne 11 | from tqdm import tqdm 12 | 13 | load_dotenv() 14 | 15 | client = MongoClient( 16 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 17 | tz_aware=True, 18 | ) 19 | collection = client.notesreview.notes 20 | 21 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | # Parses an XML file containing all notes and inserts them into the database 25 | def insert(file): 26 | # Notes are inserted/updated in batches of 50000 27 | BATCH_SIZE = 50000 28 | 29 | operations = [] 30 | # 0. Deleted 1. Added, 2. Updated, 3. Matched 31 | all_stats = [0, 0, 0, 0] 32 | last_id = 0 33 | 34 | def process_element(element): 35 | nonlocal operations, all_stats, last_id 36 | 37 | try: 38 | attributes = element.attrib 39 | id = int(attributes['id']) 40 | last_id = id 41 | comments = parse(element) 42 | note = { 43 | '_id': id, 44 | 'coordinates': [ 45 | float(attributes['lon']), 46 | float(attributes['lat']), 47 | ], 48 | 'status': 'closed' if 'closed_at' in attributes else 'open', 49 | 'updated_at': comments[-1]['date'], 50 | 'comments': comments, 51 | } 52 | except Exception: 53 | tqdm.write(f'Failed to parse note with the id {id}') 54 | return 55 | 56 | operations.append( 57 | UpdateOne( 58 | {'_id': id}, 59 | { 60 | '$set': { 61 | 'status': note['status'], 62 | 'updated_at': note['updated_at'], 63 | 'comments': note['comments'], 64 | }, 65 | '$setOnInsert': { 66 | 'coordinates': note['coordinates'], 67 | }, 68 | }, 69 | upsert=True, 70 | hint='_id_', 71 | ) 72 | ) 73 | 74 | if len(operations) >= BATCH_SIZE: 75 | stats = write(operations) 76 | all_stats = [sum(x) for x in zip(all_stats, stats)] 77 | operations = [] 78 | 79 | iteration.fast_iter( 80 | tqdm(etree.iterparse(file, tag='note', events=('end',))), 81 | process_element, 82 | ) 83 | 84 | if len(operations) > 0: 85 | stats = write(operations) 86 | all_stats = [sum(x) for x in zip(all_stats, stats)] 87 | operations = [] 88 | 89 | # Use the creation date of the last note in the dump as the timestamp of the last import 90 | last_date = collection.find_one({'_id': last_id})['comments'][0]['date'] 91 | with open(os.path.join(DIRECTORY, 'LAST_IMPORT.txt'), 'w') as file: 92 | file.write(last_date.isoformat(timespec='seconds')) 93 | 94 | tqdm.write( 95 | textwrap.dedent( 96 | f""" 97 | ---------------------------------------- 98 | IMPORT SUMMARY 99 | -------------------- 100 | Deleted {all_stats[0]} notes 101 | Added {all_stats[1]} new notes 102 | Updated {all_stats[2]} already existing notes 103 | Matched {all_stats[3]} notes 104 | ---------------------------------------- 105 | Please make sure to run the update script at least until {last_date.isoformat(timespec='seconds')} 106 | to import all changes between the creation of the 
notes dump and now. 107 | """ 108 | ) 109 | ) 110 | 111 | 112 | # Write operations to the database using the bulk write feature 113 | def write(operations): 114 | result = collection.bulk_write(operations, ordered=False) 115 | if result.bulk_api_result['writeErrors']: 116 | client.events.errors.insert_one( 117 | { 118 | 'type': 'import_error', 119 | 'timestamp': datetime.datetime.now(datetime.timezone.utc), 120 | 'error': result.bulk_api_result['writeErrors'], 121 | } 122 | ) 123 | return [ 124 | result.bulk_api_result['nRemoved'], 125 | result.bulk_api_result['nInserted'], 126 | result.bulk_api_result['nModified'], 127 | result.bulk_api_result['nMatched'], 128 | ] 129 | 130 | 131 | # Parse the comments and extract only the useful information 132 | def parse(note): 133 | comments = [] 134 | for element in note: 135 | attributes = element.attrib 136 | 137 | comment = { 138 | 'date': dateutil.parser.parse(attributes['timestamp']), 139 | 'action': attributes['action'], 140 | 'text': element.text, 141 | } 142 | if 'uid' in attributes: 143 | comment['uid'] = int(attributes['uid']) 144 | if 'user' in attributes: 145 | comment['user'] = attributes['user'] 146 | if not element.text: 147 | del comment['text'] 148 | 149 | comments.append(comment) 150 | return comments 151 | 152 | 153 | parser = argparse.ArgumentParser(description='Import notes from a notes dump.') 154 | parser.add_argument( 155 | 'file', type=str, help='path to the file which contains the notes dump' 156 | ) 157 | args = parser.parse_args() 158 | 159 | insert(args.file) 160 | -------------------------------------------------------------------------------- /blueprints/search.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import orjson 4 | from sanic import Blueprint, Sanic 5 | from sanic.response import json 6 | from sanic_ext import openapi 7 | 8 | from api.models.note import Note 9 | from api.query import Filter, Sort 10 | from config import config 11 | 12 | blueprint = Blueprint('Search', url_prefix='/search') 13 | 14 | 15 | @blueprint.route('/', methods=['GET', 'POST']) 16 | @openapi.description('Search and filter all notes in the database') 17 | @openapi.parameter( 18 | 'query', 19 | openapi.String( 20 | description=dedent( 21 | """\ 22 | A word or sentence which can be found in the comments. 23 | To find an exact occurrence of a word or sentence, wrap it in quotation marks `"{query}"`. 24 | Single words can be excluded from the result by prepending a dash `-` to the word. 25 | Spaces and other delimiters like dots are currently treated as a logical OR, 26 | though this will likely change in the future. 
27 | """ 28 | ), 29 | default=None, 30 | required=False, 31 | ), 32 | ) 33 | @openapi.parameter( 34 | 'bbox', 35 | openapi.String( 36 | description='Two pairs of coordinates (bottom left and upper right corner) specifying a rectangular box in which all results are located', 37 | example='-87.6955,41.8353,-87.5871,41.9170', 38 | default=None, 39 | ), 40 | ) 41 | @openapi.parameter( 42 | 'polygon', 43 | openapi.String( 44 | description='A GeoJSON polygon specifying a region in which all results are located', 45 | default=None, 46 | ), 47 | ) 48 | @openapi.parameter( 49 | 'status', 50 | openapi.String( 51 | description='The current status of the note', 52 | enum=('all', 'open', 'closed'), 53 | default='all', 54 | ), 55 | ) 56 | @openapi.parameter( 57 | 'anonymous', 58 | openapi.String( 59 | description='Whether anonymous notes should be included in the results, hidden from them, or be the only ones returned', 60 | enum=('include', 'hide', 'only'), 61 | default='include', 62 | ), 63 | ) 64 | @openapi.parameter( 65 | 'author', 66 | openapi.String( 67 | description='Name of the user who opened the note; searching for multiple users is possible by separating them with a comma', 68 | default=None, 69 | ), 70 | ) 71 | @openapi.parameter( 72 | 'user', 73 | openapi.String( 74 | description='Name of any user who commented on the note; searching for multiple users is possible by separating them with a comma', 75 | default=None, 76 | ), 77 | ) 78 | @openapi.parameter( 79 | 'after', 80 | openapi.DateTime( 81 | description='Only return notes updated or created after this date', 82 | default=None, 83 | example='2020-03-13T10:20:24', 84 | ), 85 | ) 86 | @openapi.parameter( 87 | 'before', 88 | openapi.DateTime( 89 | description='Only return notes updated or created before this date', 90 | default=None, 91 | example='2020-05-11T07:10:45', 92 | ), 93 | ) 94 | @openapi.parameter( 95 | 'comments', 96 | openapi.Integer( 97 | description='Filters the amount of comments on a note (the opening comment is not counted)', 98 | minimum=0, 99 | default=None, 100 | ), 101 | ) 102 | @openapi.parameter( 103 | 'commented', 104 | openapi.String( 105 | description='Whether commented notes should be included in the results, hidden from them, or be the only ones returned', 106 | enum=('include', 'hide', 'only'), 107 | default='include', 108 | ), 109 | ) 110 | @openapi.parameter( 111 | 'sort_by', 112 | openapi.String( 113 | description='Sort notes either by no criteria, the date of the last update or their creation date', 114 | enum=('none', 'updated_at', 'created_at'), 115 | default='updated_at', 116 | ), 117 | ) 118 | @openapi.parameter( 119 | 'order', 120 | openapi.String( 121 | description='Sort notes either in ascending or descending order', 122 | enum=('descending', 'desc', 'ascending', 'asc'), 123 | default='descending', 124 | ), 125 | ) 126 | @openapi.parameter( 127 | 'limit', 128 | openapi.Integer( 129 | description='Limit the amount of notes to return', 130 | minimum=1, 131 | maximum=config['MAX_LIMIT'], 132 | default=config['DEFAULT_LIMIT'], 133 | ), 134 | ) 135 | @openapi.response( 136 | 200, 137 | {'application/json': openapi.Array(items=Note, uniqueItems=True)}, 138 | 'The response is an array containing the notes with the requested information', 139 | ) 140 | @openapi.response( 141 | 400, 142 | { 143 | 'application/json': openapi.Object( 144 | properties={'error': openapi.String()} 145 | ) 146 | }, 147 | 'In case one of the parameters is invalid, the response contains the error message', 148 | ) 149 | async def index(request): 150 | try: 151 | args = None 152 | if request.method == 'GET': 153 | 
args = request.args 154 | elif request.method == 'POST': 155 | args = request.json 156 | sort, filter, limit = parse(args) 157 | except ValueError as error: 158 | return json({'error': str(error)}, status=400) 159 | 160 | return await find(sort, filter, limit) 161 | 162 | 163 | def parse(data): 164 | sort = ( 165 | Sort() 166 | .by(data.get('sort_by', 'updated_at')) 167 | .order(data.get('order', 'descending')) 168 | .build() 169 | ) 170 | filter = ( 171 | Filter(sort) 172 | .query(data.get('query')) 173 | .bbox(data.get('bbox')) 174 | .polygon(data.get('polygon')) 175 | .status(data.get('status')) 176 | .anonymous(data.get('anonymous')) 177 | .author(data.get('author')) 178 | .user(data.get('user')) 179 | .after(data.get('after', None)) 180 | .before(data.get('before', None)) 181 | .comments(data.get('comments', None)) 182 | .commented(data.get('commented')) 183 | .build() 184 | ) 185 | limit = data.get('limit', config['DEFAULT_LIMIT']) 186 | 187 | return sort, filter, limit 188 | 189 | 190 | async def find(sort, filter, limit): 191 | # Apply the default limit in case the argument could not be parsed (e.g. for limit=NaN or a null value) 192 | try: 193 | limit = int(limit) 194 | except (ValueError, TypeError): 195 | limit = config['DEFAULT_LIMIT'] 196 | 197 | if limit > config['MAX_LIMIT']: 198 | return json( 199 | {'error': f"Limit must not be higher than {config['MAX_LIMIT']}."}, 200 | status=400, 201 | ) 202 | 203 | # Prevent a limit of 0 from being treated as no limit at all 204 | if limit == 0: 205 | limit = config['DEFAULT_LIMIT'] 206 | 207 | cursor = Sanic.get_app().ctx.db.notes.find(filter).limit(limit) 208 | # Queries are faster without an explicit sort order, so only sort when one is requested 209 | if sort[0] is not None: 210 | cursor = cursor.sort(*sort) 211 | 212 | result = [] 213 | async for document in cursor: 214 | result.append(document) 215 | return json(result, dumps=orjson.dumps, option=orjson.OPT_NAIVE_UTC) 216 | -------------------------------------------------------------------------------- /scripts/update.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import math 4 | import os 5 | import textwrap 6 | import urllib.parse 7 | 8 | import dateutil.parser 9 | import requests 10 | from dotenv import load_dotenv 11 | from pymongo import DeleteOne, InsertOne, MongoClient, UpdateOne 12 | 13 | load_dotenv() 14 | 15 | client = MongoClient( 16 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 17 | tz_aware=True, 18 | ) 19 | collection = client.notesreview.notes 20 | 21 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | # Fills up the database by iterating over the OSM Notes API 25 | # The current implementation is based on the last update of a note: 26 | # all notes between now and a given date (the date of the last update) are imported into the database 27 | def update(limit=100): 28 | # This variable is used in the while loop to ensure only notes of a specific timespan are fetched 29 | upper_bound = datetime.datetime.now(datetime.timezone.utc) 30 | # The start time of this function is used at the end to update the timestamp of the last update 31 | update_start_time = upper_bound 32 | with open(os.path.join(DIRECTORY, 'LAST_UPDATE.txt')) as file: 33 | last_update = dateutil.parser.parse(file.read()) 34 | 35 | # Estimate a useful limit, assuming a new note action every 15 seconds 36 | diff = (upper_bound - last_update).total_seconds() 37 | useful_limit 
= math.ceil(diff * (1 / 15)) 38 | useful_limit = min(10000, useful_limit) 39 | 40 | # 0. Deleted, 1. Added, 2. Updated, 3. Ignored 41 | all_stats = [0, 0, 0, 0] 42 | all_ignored = False 43 | 44 | # Stop either when the stop date (i.e. the date of the last update) is exceeded or when all notes are ignored during insertion 45 | while ( 46 | upper_bound is not None 47 | and upper_bound > last_update 48 | and not all_ignored 49 | ): 50 | url = build_url( 51 | { 52 | 'from': last_update.isoformat(), 53 | 'to': upper_bound.isoformat(), 54 | 'limit': str(limit), 55 | } 56 | ) 57 | response = requests.get(url).json() 58 | features = response['features'] 59 | 60 | stats, oldest = insert(features) 61 | all_stats = [sum(x) for x in zip(all_stats, stats)] 62 | 63 | # Check whether all features were ignored, meaning there are no updates anymore 64 | all_ignored = stats[3] == len(features) 65 | upper_bound = oldest 66 | 67 | print( 68 | textwrap.dedent( 69 | f""" 70 | ---------------------------------------- 71 | UPDATE SUMMARY 72 | -------------------- 73 | Last update: {last_update.isoformat(timespec='seconds')} 74 | End of update: {update_start_time.isoformat(timespec='seconds')} 75 | Time in seconds since last update: {round(diff)} 76 | Expected a useful limit of {useful_limit} while {all_stats[0] + all_stats[1] + all_stats[2]} was actually needed 77 | -------------------- 78 | Deleted {all_stats[0]} notes 79 | Added {all_stats[1]} new notes 80 | Updated {all_stats[2]} already existing notes 81 | Ignored {all_stats[3]} already existing notes 82 | ---------------------------------------- 83 | """ 84 | ) 85 | ) 86 | 87 | with open(os.path.join(DIRECTORY, 'LAST_UPDATE.txt'), 'w') as file: 88 | file.write(update_start_time.isoformat(timespec='seconds')) 89 | # ---------------------------------------- # 90 | 91 | 92 | def build_url(query={}): 93 | defaults = { 94 | 'sort': 'updated_at', 95 | 'closed': '-1', 96 | 'limit': '100', 97 | # The start date needs to be specified because otherwise the value of the 98 | # 'to' parameter has no effect (use the beginning of OpenStreetMap notes) 99 | 'from': dateutil.parser.parse('2013-04-23T00:00:00'), 100 | } 101 | host = 'https://api.openstreetmap.org/api/0.6/notes/search.json' 102 | url = host + '?' 
+ urllib.parse.urlencode({**defaults, **query}) 103 | return url 104 | 105 | 106 | # Parse the comments and extract only the useful information 107 | def parse(comments): 108 | for comment in comments: 109 | if 'date' in comment: 110 | comment['date'] = dateutil.parser.parse(comment['date']) 111 | if 'user_url' in comment: 112 | del comment['user_url'] 113 | if 'html' in comment: 114 | del comment['html'] 115 | if not comment['text']: 116 | del comment['text'] 117 | return comments 118 | 119 | 120 | # Loops through the provided list of notes and: 121 | # - Adds notes if they are unknown 122 | # - Updates notes if there is a different version 123 | # - Ignores notes which are the same 124 | def insert(features): 125 | operations = [] 126 | deleted = 0 127 | inserted = 0 128 | updated = 0 129 | ignored = 0 130 | oldest = None 131 | 132 | for feature in features: 133 | comments = parse(feature['properties']['comments']) 134 | note = { 135 | '_id': feature['properties']['id'], 136 | 'coordinates': feature['geometry']['coordinates'], 137 | 'status': feature['properties']['status'], 138 | 'updated_at': None if len(comments) == 0 else comments[-1]['date'], 139 | 'comments': comments, 140 | } 141 | query = {'_id': note['_id']} 142 | 143 | # If comments are invisible because of account deletion or other reasons, 144 | # a note might not contain any comments at all 145 | # see also https://github.com/openstreetmap/openstreetmap-website/issues/2146 146 | if len(note['comments']) == 0: 147 | # Notes without any comments are basically useless and should be deleted, 148 | # especially as the comments might have been removed by a moderator 149 | # and should not be visible to the public 150 | operations.append(DeleteOne(query)) 151 | deleted += 1 152 | continue 153 | 154 | # Check whether the note is already in the database and proceed with different operations 155 | document = collection.find_one(query) 156 | if document is None: 157 | # Note is not yet in the database, insert it 158 | operations.append(InsertOne(note)) 159 | inserted += 1 160 | elif note == document: 161 | # Note is already stored in the database, the statement is only true if 162 | # "both dictionaries have the same (key, value) pairs (regardless of ordering)" 163 | # See https://docs.python.org/3/library/stdtypes.html#dict 164 | # Note is the same as the one that is already saved, should be ignored 165 | ignored += 1 166 | else: 167 | # Note is different to the one that is already saved, needs to be updated 168 | operations.append( 169 | UpdateOne( 170 | query, 171 | { 172 | '$set': { 173 | 'status': note['status'], 174 | 'updated_at': note['updated_at'], 175 | 'comments': note['comments'], 176 | } 177 | }, 178 | ) 179 | ) 180 | updated += 1 181 | 182 | # Check whether this note is the one with the oldest update date (for the upper bound of the next request) 183 | last_changed = note['comments'][-1]['date'] 184 | # Only update the oldest changed date if the note is either new (i.e. no document exists yet) or has more comments than before. 185 | # This generally means that no comments were hidden and the last changed date is in fact also the last update date. 186 | # And obviously only update the date if it is older than the current oldest date. 
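# For example: if a batch returns notes whose latest comments are dated 12:00, 11:58 and 11:55, and all of them are new or grew in comment count, `oldest` becomes 11:55 and the next request fetches the window between `last_update` and 11:55, walking backwards in time until the whole timespan since the last update is covered.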
187 | if ( 188 | document is None 189 | or len(note['comments']) > len(document['comments']) 190 | ) and (oldest is None or last_changed < oldest): 191 | oldest = last_changed 192 | 193 | if len(operations) != 0: 194 | result = collection.bulk_write(operations, ordered=False) 195 | if result.bulk_api_result['writeErrors']: 196 | client.events.errors.insert_one( 197 | { 198 | 'type': 'update_error', 199 | 'timestamp': datetime.datetime.now(datetime.timezone.utc), 200 | 'error': result.bulk_api_result['writeErrors'], 201 | } 202 | ) 203 | return [deleted, inserted, updated, ignored], oldest 204 | 205 | 206 | parser = argparse.ArgumentParser( 207 | description='Update notes between the last check and now.' 208 | ) 209 | parser.add_argument( 210 | '-l', 211 | '--limit', 212 | type=int, 213 | default=100, 214 | help='set the batch size limit (default: 100)', 215 | ) 216 | args = parser.parse_args() 217 | 218 | update(args.limit) 219 | -------------------------------------------------------------------------------- /api/query.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import dateutil.parser 5 | import lark 6 | import orjson 7 | 8 | 9 | class Sort(object): 10 | def __init__(self): 11 | self.sort = {} 12 | 13 | def build(self): 14 | return self.sort.get('by'), self.sort.get('order') 15 | 16 | def by(self, by): 17 | allowed = ['none', 'updated_at', 'created_at'] 18 | if by not in allowed: 19 | raise ValueError(f'Sort must be one of {allowed}') 20 | 21 | if by == 'none': 22 | pass 23 | elif by == 'updated_at': 24 | self.sort['by'] = 'updated_at' 25 | elif by == 'created_at': 26 | self.sort['by'] = 'comments.0.date' 27 | return self 28 | 29 | def order(self, order): 30 | allowed = ['desc', 'descending', 'asc', 'ascending'] 31 | if order not in allowed: 32 | raise ValueError(f'Order must be one of {allowed}') 33 | 34 | if order in ['asc', 'ascending']: 35 | self.sort['order'] = 1 36 | elif order in ['desc', 'descending']: 37 | self.sort['order'] = -1 38 | return self 39 | 40 | 41 | class Filter(object): 42 | def __init__(self, sort): 43 | self.filter = {} 44 | self.sort = sort 45 | self.users = Users() 46 | 47 | def build(self): 48 | return self.filter 49 | 50 | def query(self, query): 51 | if query is not None: 52 | self.filter['comments.0.text'] = { 53 | '$regex': ( 54 | query.removeprefix('regex:') 55 | if query.startswith('regex:') 56 | else re.escape(query) 57 | ), 58 | '$options': 'i', 59 | } 60 | return self 61 | 62 | def bbox(self, bbox): 63 | if bbox is not None: 64 | bbox = BoundingBox(bbox) 65 | self.filter['coordinates'] = { 66 | '$geoWithin': { 67 | '$box': [ 68 | # bottom left coordinates (longitude, latitude) 69 | [bbox.x1, bbox.y1], 70 | # upper right coordinates (longitude, latitude) 71 | [bbox.x2, bbox.y2], 72 | ] 73 | } 74 | } 75 | return self 76 | 77 | def polygon(self, polygon): 78 | if polygon is not None: 79 | polygon = Polygon(polygon) 80 | self.filter['coordinates'] = { 81 | '$geoWithin': { 82 | '$geometry': { 83 | 'type': polygon.type, 84 | 'coordinates': polygon.coordinates, 85 | } 86 | } 87 | } 88 | return self 89 | 90 | def status(self, status): 91 | if status not in [None, 'all', 'open', 'closed']: 92 | raise ValueError('Status must be one of [all, open, closed]') 93 | 94 | if status not in [None, 'all']: 95 | self.filter['status'] = status 96 | return self 97 | 98 | def anonymous(self, anonymous): 99 | if anonymous not in [None, 'include', 'hide', 'only']: 100 | raise ValueError('Anonymous must 
be one of [include, hide, only]') 101 | 102 | if anonymous is not None: 103 | # Filtering out anonymous notes means that there must be a user who created the note 104 | if anonymous == 'hide': 105 | self.filter['comments.0.user'] = {'$exists': True} 106 | if anonymous == 'only': 107 | self.filter['comments.0.user'] = {'$exists': False} 108 | return self 109 | 110 | def author(self, author): 111 | if author is not None: 112 | include, exclude = self.users.parse(author) 113 | if 'comments.0.user' not in self.filter: 114 | self.filter['comments.0.user'] = {} 115 | self.filter['comments.0.user'].update( 116 | self.clean({'$in': include, '$nin': exclude}) 117 | ) 118 | return self 119 | 120 | def user(self, user): 121 | if user is not None: 122 | include, exclude = self.users.parse(user) 123 | if 'comments.user' not in self.filter: 124 | self.filter['comments.user'] = {} 125 | self.filter['comments.user'].update( 126 | self.clean({'$all': include, '$nin': exclude}) 127 | ) 128 | return self 129 | 130 | def after(self, after): 131 | if after is not None: 132 | key = self.sort[0] 133 | # If results will be unsorted, use the creation date for the comparison 134 | if key is None: 135 | key = 'comments.0.date' 136 | 137 | if key not in self.filter: 138 | self.filter[key] = {} 139 | self.filter[key]['$gt'] = dateutil.parser.parse(after) 140 | return self 141 | 142 | def before(self, before): 143 | if before is not None: 144 | key = self.sort[0] 145 | # If results will be unsorted, use the creation date for the comparison 146 | if key is None: 147 | key = 'comments.0.date' 148 | 149 | if key not in self.filter: 150 | self.filter[key] = {} 151 | self.filter[key]['$lt'] = dateutil.parser.parse(before) 152 | return self 153 | 154 | def comments(self, amount_of_comments): 155 | if amount_of_comments is not None: 156 | # A comment of the note counts as everything after the original comment 157 | self.filter['comments'] = {'$size': int(amount_of_comments) + 1} 158 | return self 159 | 160 | def commented(self, commented): 161 | if commented not in [None, 'include', 'hide', 'only']: 162 | raise ValueError('Commented must be one of [include, hide, only]') 163 | 164 | if commented is not None: 165 | # Filtering out commented notes means that only the original comment exists 166 | if commented == 'hide': 167 | self.filter['comments'] = {'$size': 1} 168 | # Showing only commented notes requires the amount of comments to be greater than 1 169 | # This is not directly allowed (since $size does not accept ranges of values, e.g. 
via $gt), 170 | # so instead show only notes with an amount of comments different from 1 171 | # (notes with 0 comments do not exist) 172 | if commented == 'only': 173 | self.filter['comments'] = {'$not': {'$size': 1}} 174 | return self 175 | 176 | # Remove values that are not defined or empty from a given dictionary 177 | def clean(self, dictionary): 178 | return { 179 | k: v 180 | for k, v in dictionary.items() 181 | if v is not None and (type(v) is list and len(v) > 0) 182 | } 183 | 184 | 185 | class BoundingBox(object): 186 | def __init__(self, bbox): 187 | bbox = [float(x) for x in bbox.split(',')] 188 | if len(bbox) != 4: 189 | raise ValueError( 190 | 'The bounding box does not contain all required coordinates' 191 | ) 192 | 193 | self.x1 = bbox[0] 194 | self.y1 = bbox[1] 195 | self.x2 = bbox[2] 196 | self.y2 = bbox[3] 197 | self.check() 198 | 199 | def check(self): 200 | if self.x1 > self.x2: 201 | raise ValueError( 202 | 'The minimum longitude must be smaller than the maximum longitude' 203 | ) 204 | if self.y1 > self.y2: 205 | raise ValueError( 206 | 'The minimum latitude must be smaller than the maximum latitude' 207 | ) 208 | if self.x1 < -180 or self.y1 < -90 or self.x2 > +180 or self.y2 > +90: 209 | raise ValueError( 210 | 'The bounding box exceeds the size of the world, please specify a smaller bounding box' 211 | ) 212 | 213 | 214 | class Polygon(object): 215 | def __init__(self, polygon): 216 | polygon = orjson.loads(polygon) 217 | if 'type' not in polygon or 'coordinates' not in polygon: 218 | raise ValueError( 219 | 'Polygon does not contain information about type or any coordinates' 220 | ) 221 | 222 | self.type = polygon['type'] 223 | self.coordinates = polygon['coordinates'] 224 | self.check() 225 | 226 | def check(self): 227 | if self.type not in ['Polygon', 'MultiPolygon']: 228 | raise ValueError( 229 | 'The GeoJSON shape must be either a Polygon or a MultiPolygon' 230 | ) 231 | if type(self.coordinates) is not list: 232 | raise ValueError('Coordinates have to be supplied as an array') 233 | 234 | 235 | class Users(object): 236 | def __init__(self): 237 | with open( 238 | os.path.join(os.path.dirname(__file__), 'grammars', 'users.lark') 239 | ) as file: 240 | self.grammar = lark.Lark(file.read()) 241 | 242 | def parse(self, input): 243 | tree = self.grammar.parse(input) 244 | include = [] 245 | exclude = [] 246 | 247 | for node in tree.children: 248 | if isinstance(node.children[0], lark.Token): 249 | include.append(node.children[0].value) 250 | elif ( 251 | isinstance(node.children[0], lark.Tree) 252 | and node.children[0].data == 'not' 253 | ): 254 | exclude.append(node.children[0].children[0].value) 255 | 256 | if len(include) + len(exclude) > 10: 257 | raise ValueError( 258 | 'The amount of users to search for exceeds the limit' 259 | ) 260 | 261 | return include, exclude 262 | --------------------------------------------------------------------------------
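For reference, a minimal sketch of how `Sort` and `Filter` compose into a MongoDB query, mirroring `parse()` in `blueprints/search.py`; the commented result is an assumption derived from the builder logic above, not output produced by the repository:

```python
from api.query import Filter, Sort

sort = Sort().by('created_at').order('descending').build()
# sort == ('comments.0.date', -1)

filter = (
    Filter(sort)
    .status('open')     # only open notes
    .anonymous('hide')  # requires comments.0.user to exist
    .author('ENT8R')    # parsed by the users.lark grammar
    .after('2020-03-13T10:20:24')
    .build()
)
# Roughly equivalent to:
# {
#     'status': 'open',
#     'comments.0.user': {'$exists': True, '$in': ['ENT8R']},
#     'comments.0.date': {'$gt': datetime.datetime(2020, 3, 13, 10, 20, 24)},
# }
```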