├── requirements.dev.txt ├── .gitignore ├── pyproject.toml ├── schema ├── schema.sh ├── notesreview.users.json ├── notesreview.notes.json └── schema.json ├── api ├── models │ ├── comment.py │ └── note.py ├── grammars │ └── users.lark ├── auth.py └── query.py ├── requirements.txt ├── Makefile ├── config.py ├── .github └── workflows │ └── lint.yml ├── scripts ├── iteration.py ├── indices.py ├── delete.py ├── import.py └── update.py ├── README.md ├── blueprints ├── status.py ├── auth.py └── search.py └── app.py /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | black==25.9.0 2 | isort==6.1.0 3 | flake8==7.3.0 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | venv/ 3 | 4 | __pycache__/ 5 | __init__.py 6 | 7 | scripts/*.txt 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | 4 | [tool.isort] 5 | line_length = 79 6 | profile = "black" 7 | -------------------------------------------------------------------------------- /schema/schema.sh: -------------------------------------------------------------------------------- 1 | jq -n 'reduce inputs as $s (.; .[input_filename|rtrimstr(".json")] += $s)' [!schema]*.json > schema.json 2 | -------------------------------------------------------------------------------- /api/models/comment.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Comment: 7 | date: datetime.datetime 8 | action: str 9 | uid: int 10 | user: str 11 | text: str 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lark==1.3.0 2 | lxml==6.0.2 3 | motor==3.7.1 4 | orjson==3.11.3 5 | pyjwt[crypto]==2.10.1 6 | pymongo==4.15.2 7 | python-dateutil==2.9.0.post0 8 | python-dotenv==1.1.1 9 | requests==2.32.5 10 | sanic[ext]==25.3.0 11 | tqdm==4.67.1 12 | -------------------------------------------------------------------------------- /api/models/note.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | from typing import List, Tuple 4 | 5 | from .comment import Comment 6 | 7 | 8 | @dataclass 9 | class Note: 10 | _id: int 11 | coordinates: Tuple[float, float] 12 | status: str 13 | updated_at: datetime.datetime 14 | comments: List[Comment] 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LINT_FILES = app.py config.py api/ blueprints/ scripts/ 2 | 3 | install: 4 | pip install -r requirements.txt 5 | 6 | install-dev: 7 | pip install -r requirements.dev.txt 8 | 9 | lint: 10 | flake8 --count --extend-ignore=E501 --show-source --statistics $(LINT_FILES) 11 | 12 | format: 13 | isort --check-only $(LINT_FILES) 14 | black --check --diff --skip-string-normalization $(LINT_FILES) 15 | 16 | download: 17 | curl -o notes.osn.bz2 https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/notes/planet-notes-latest.osn.bz2 && bzip2 -d notes.osn.bz2 18 | 
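The `Comment` and `Note` dataclasses above mirror the fields enforced by the validation schemas in `schema/`. As a minimal sketch (not part of the repository; `note_from_document` is a hypothetical helper), a raw document from the `notesreview.notes` collection could be mapped onto these models like this:

```python
# Hypothetical helper, shown for illustration only: the API itself passes the
# raw MongoDB documents straight through to the JSON response.
from api.models.comment import Comment
from api.models.note import Note


def note_from_document(document: dict) -> Note:
    return Note(
        _id=document['_id'],
        # Stored as [longitude, latitude], see scripts/import.py
        coordinates=tuple(document['coordinates']),
        status=document['status'],
        updated_at=document['updated_at'],
        comments=[
            Comment(
                date=comment['date'],
                action=comment['action'],
                uid=comment.get('uid'),    # absent for anonymous comments
                user=comment.get('user'),  # absent for anonymous comments
                text=comment.get('text', ''),  # omitted when there is no text
            )
            for comment in document['comments']
        ],
    )
```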
-------------------------------------------------------------------------------- /api/grammars/users.lark: -------------------------------------------------------------------------------- 1 | start: exp (_sep " "? exp)* 2 | 3 | exp: not | user 4 | not: ("NOT " | "-") user 5 | ?user: "\""? name "\""? 6 | 7 | _sep: "," | ";" 8 | ?name: WORD+ 9 | 10 | // Allow any characters except invalid ranges as defined in 11 | // https://github.com/openstreetmap/openstreetmap-website/blob/9aa3e7a/app/validators/characters_validator.rb#L2-L3 12 | // by using negative lookaheads (https://stackoverflow.com/questions/1687620/regex-match-everything-but-a-specific-pattern) 13 | // 14 | // Additionally " is not allowed as well 15 | WORD: /(?![\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\ufffe\uffff\/;\.,\?%#\"])./+ 16 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | 7 | # fmt: off 8 | config = dict( 9 | DEFAULT_LIMIT=50, 10 | MAX_LIMIT=250, 11 | ROOT_PATH=os.path.dirname(os.path.realpath(__file__)), 12 | DB_USER=os.environ.get('DB_USER'), 13 | DB_PASSWORD=os.environ.get('DB_PASSWORD'), 14 | DB_HOST=os.environ.get('DB_HOST'), 15 | OPENSTREETMAP_OAUTH_JWKS_URI=os.environ.get('OPENSTREETMAP_OAUTH_JWKS_URI'), 16 | OPENSTREETMAP_OAUTH_CLIENT_ID=os.environ.get('OPENSTREETMAP_OAUTH_CLIENT_ID'), 17 | CORS_ORIGINS='*', 18 | CORS_ALWAYS_SEND=False, 19 | ) 20 | # fmt: on 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Set up Python 11 | uses: actions/setup-python@v5 12 | with: 13 | python-version: '3.x' 14 | cache: 'pip' 15 | 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | if [ -f requirements.txt ]; then make install; fi 20 | - name: Install dependencies for development 21 | run: | 22 | if [ -f requirements.dev.txt ]; then make install-dev; fi 23 | 24 | - name: Lint 25 | run: | 26 | make lint 27 | - name: Format 28 | run: | 29 | make format 30 | -------------------------------------------------------------------------------- /schema/notesreview.users.json: -------------------------------------------------------------------------------- 1 | { 2 | "$jsonSchema": { 3 | "bsonType": "object", 4 | "required": [ 5 | "_id", 6 | "created_at", 7 | "last_validated_at", 8 | "token", 9 | "user" 10 | ], 11 | "properties": { 12 | "_id": { 13 | "bsonType": "int", 14 | "description": "must be an int and is required" 15 | }, 16 | "created_at": { 17 | "bsonType": "date", 18 | "description": "must be a date and is required" 19 | }, 20 | "last_validated_at": { 21 | "bsonType": "date", 22 | "description": "must be a date and is required" 23 | }, 24 | "token": { 25 | "bsonType": [ 26 | "string", 27 | "null" 28 | ], 29 | "description": "must be a string or null and is required" 30 | }, 31 | "user": { 32 | "bsonType": "string", 33 | "description": "must be a string and is required" 34 | } 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scripts/iteration.py: -------------------------------------------------------------------------------- 1 | # The previous iteration 
method used too much memory; the following resources describe how to reduce it: 2 | # https://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files 3 | # https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory 4 | # https://web.archive.org/web/20210309115224/http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 5 | def fast_iter(context, func, *args, **kwargs): 6 | """ 7 | http://lxml.de/parsing.html#modifying-the-tree 8 | Based on Liza Daly's fast_iter 9 | http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 10 | See also http://effbot.org/zone/element-iterparse.htm 11 | """ 12 | for event, element in context: 13 | func(element, *args, **kwargs) 14 | # It's safe to call clear() here because no descendants will be accessed 15 | element.clear() 16 | # Also eliminate now-empty references from the root node to the element 17 | for ancestor in element.xpath('ancestor-or-self::*'): 18 | while ancestor.getprevious() is not None: 19 | del ancestor.getparent()[0] 20 | del context 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # notesreview-api 2 | > API and backend for [notesreview](https://github.com/ENT8R/notesreview) 3 | 4 | ## Scripts 5 | 6 | #### `delete.py` 7 | ```sh 8 | # Deletes all notes that are not included in the notes dump 9 | python scripts/delete.py notes.osn 10 | ``` 11 | 12 | --- 13 | 14 | #### `import.py` 15 | ```sh 16 | # Imports all notes from the notes dump 17 | python scripts/import.py notes.osn 18 | ``` 19 | 20 | --- 21 | 22 | #### `indices.py` 23 | ```sh 24 | # Creates all necessary indices for the database 25 | python scripts/indices.py 26 | ``` 27 | --- 28 | 29 | #### `update.py` 30 | ```sh 31 | # Updates the database by querying the OSM Notes API 32 | # in order to receive the latest notes 33 | # since a given date of the last check 34 | python scripts/update.py 35 | ``` 36 | 37 | ## Notes Dump 38 | 39 | ##### Download 40 | ```sh 41 | # Download and extract the notes dump 42 | # (hosted on https://planet.openstreetmap.org/ or any other mirror) 43 | 44 | # ${URL} needs to be replaced with the location of the notes dump 45 | curl -L -o notes.osn.bz2 ${URL} && pbzip2 -d notes.osn.bz2 46 | ``` 47 | 48 | ##### XML Structure 49 | The structure of the notes dump follows this scheme: 50 | ```xml 51 | <osm-notes> 52 | <note id="${id}" lat="${latitude}" lon="${longitude}" created_at="${created_at}" closed_at="${closed_at|optional}"> 53 | <comment action="${action}" timestamp="${timestamp}" uid="${uid|optional}" user="${user|optional}"> 54 | ${comment|optional}</comment> 55 | </note> 56 | </osm-notes> 57 | ``` -------------------------------------------------------------------------------- /blueprints/status.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sanic import Blueprint 4 | from sanic.response import json 5 | from sanic_ext import openapi 6 | 7 | from config import config 8 | 9 | blueprint = Blueprint('Status', url_prefix='/status') 10 | 11 | 12 | @blueprint.route('/') 13 | @openapi.description( 14 | 'Status information about the database and update frequency' 15 | ) 16 | @openapi.response( 17 | 200, 18 | { 19 | 'application/json': openapi.Object( 20 | properties={ 21 | 'last_import': openapi.DateTime(), 22 | 'last_sync': openapi.DateTime(), 23 | 'last_update': openapi.DateTime(), 24 | } 25 | ), 26 | }, 27 | 'The response is an object with the currently available status information', 28 | ) 29 | async def status(request): 30 | last_import = None 31 | last_sync = None 32 | last_update = None 33 | 34 | with ( 35 | open( 36 | os.path.join(config['ROOT_PATH'], 
'scripts', 'LAST_IMPORT.txt') 37 | ) as file1, 38 | open( 39 | os.path.join(config['ROOT_PATH'], 'scripts', 'LAST_SYNC.txt') 40 | ) as file2, 41 | open( 42 | os.path.join(config['ROOT_PATH'], 'scripts', 'LAST_UPDATE.txt') 43 | ) as file3, 44 | ): 45 | last_import = file1.read().strip() 46 | last_sync = file2.read().strip() 47 | last_update = file3.read().strip() 48 | 49 | return json( 50 | { 51 | 'last_import': last_import, 52 | 'last_sync': last_sync, 53 | 'last_update': last_update, 54 | } 55 | ) 56 | -------------------------------------------------------------------------------- /scripts/indices.py: -------------------------------------------------------------------------------- 1 | # import json 2 | import os 3 | 4 | import pymongo 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | client = pymongo.MongoClient( 10 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@{os.environ.get("DB_HOST")}:27017/?authSource=notesreview' 11 | ) 12 | db = client.notesreview 13 | 14 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 15 | RUN_IN_BACKGROUND = False 16 | 17 | # Apply validation schemas (requires the collection to exist) 18 | # TODO: This operation requires admin access 19 | # with open(os.path.join(DIRECTORY, '..', 'schema', 'schema.json')) as schema: 20 | # schema = json.load(schema) 21 | # db.command({ 22 | # 'collMod': 'notes', 23 | # 'validator': schema['notesreview.notes'] 24 | # }) 25 | 26 | # Create indices used for faster queries 27 | db.notes.create_index( 28 | [('updated_at', pymongo.DESCENDING)], 29 | name='updated_at', 30 | background=RUN_IN_BACKGROUND, 31 | ) 32 | db.notes.create_index( 33 | [('comments.0.date', pymongo.DESCENDING)], 34 | name='created_at', 35 | background=RUN_IN_BACKGROUND, 36 | ) 37 | db.notes.create_index( 38 | [('coordinates', pymongo.GEOSPHERE)], 39 | name='coordinates', 40 | background=RUN_IN_BACKGROUND, 41 | ) 42 | db.notes.create_index('status', name='status', background=RUN_IN_BACKGROUND) 43 | db.notes.create_index( 44 | 'comments.0.user', name='author', background=RUN_IN_BACKGROUND 45 | ) 46 | db.notes.create_index( 47 | 'comments.user', name='user', background=RUN_IN_BACKGROUND 48 | ) 49 | db.notes.create_index( 50 | [('comments.text', pymongo.TEXT)], 51 | default_language='none', 52 | name='text', 53 | background=RUN_IN_BACKGROUND, 54 | ) 55 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from jwt import PyJWKClient 4 | from motor.motor_asyncio import AsyncIOMotorClient 5 | from sanic import Blueprint, Sanic 6 | 7 | from api.auth import attach_uid 8 | from blueprints.auth import blueprint as auth 9 | # from blueprints.notes import blueprint as notes  # disabled: blueprints/notes.py does not exist and the blueprint was never registered 10 | from blueprints.search import blueprint as search 11 | from blueprints.status import blueprint as status 12 | from config import config 13 | 14 | app = Sanic(__name__) 15 | app.config.update(config) 16 | 17 | app.ext.openapi.describe( 18 | 'notesreview-api', 19 | version='0.1.0', 20 | description=dedent( 21 | """\ 22 | # Information 23 | This API is still subject to change. In particular, the behavior of the `query` parameter is likely to change, 24 | as its capabilities are currently limited. 
25 | """ 26 | ), 27 | ) 28 | app.ext.openapi.add_security_scheme( 29 | 'token', 30 | 'http', 31 | scheme='bearer', 32 | bearer_format='JWT', 33 | description='OpenID Connect Token issued by OpenStreetMap', 34 | ) 35 | 36 | 37 | @app.before_server_start 38 | async def setup(app, loop): 39 | client = AsyncIOMotorClient( 40 | f'mongodb://{app.config.DB_USER}:{app.config.DB_PASSWORD}@{app.config.DB_HOST}:27017?authSource=notesreview', 41 | io_loop=loop, 42 | ) 43 | jwks_client = PyJWKClient(app.config.OPENSTREETMAP_OAUTH_JWKS_URI) 44 | 45 | app.ctx.client = client 46 | app.ctx.db = client.notesreview 47 | app.ctx.jwks_client = jwks_client 48 | 49 | 50 | @app.before_server_stop 51 | async def shutdown(app, loop): 52 | app.ctx.client.close() 53 | 54 | 55 | app.blueprint(Blueprint.group(auth, status, search, url_prefix='/api')) 56 | 57 | app.register_middleware(attach_uid, 'request') 58 | -------------------------------------------------------------------------------- /schema/notesreview.notes.json: -------------------------------------------------------------------------------- 1 | { 2 | "$jsonSchema": { 3 | "required": [ "_id", "coordinates", "status", "updated_at", "comments" ], 4 | "properties": { 5 | "_id": { 6 | "bsonType": "int", 7 | "description": "must be an int and is required" 8 | }, 9 | "coordinates": { 10 | "bsonType": "array", 11 | "description": "must be an array and is required" 12 | }, 13 | "status": { 14 | "enum": [ "open", "closed" ], 15 | "description": "can only be one of the enum values and is required" 16 | }, 17 | "updated_at": { 18 | "bsonType": "date", 19 | "description": "must be a date and is required" 20 | }, 21 | "comments": { 22 | "bsonType": "array", 23 | "description": "must be an array of objects and is required", 24 | "items": { 25 | "bsonType": "object", 26 | "required": [ 27 | "date", 28 | "action" 29 | ], 30 | "properties": { 31 | "date": { 32 | "bsonType": "date", 33 | "description": "must be a date and is required" 34 | }, 35 | "action": { 36 | "enum": [ "opened", "commented", "closed", "reopened", "hidden" ], 37 | "description": "can only be one of the enum values and is required" 38 | }, 39 | "text": { 40 | "bsonType": "string", 41 | "description": "must be a string and is not required" 42 | }, 43 | "uid": { 44 | "bsonType": "int", 45 | "description": "must be an int and is not required" 46 | }, 47 | "user": { 48 | "bsonType": "string", 49 | "description": "must be a string and is not required" 50 | } 51 | } 52 | } 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /blueprints/auth.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import jwt 4 | from sanic import Blueprint, Sanic 5 | from sanic.response import text 6 | from sanic_ext import openapi 7 | 8 | from api.auth import decode_token, protected 9 | 10 | blueprint = Blueprint('Authentication', url_prefix='/auth') 11 | 12 | 13 | @blueprint.get('/login') 14 | @openapi.description('Login with a valid OpenID Connect Token (JWT)') 15 | @openapi.secured('token') 16 | @openapi.response( 17 | 200, 18 | { 19 | 'text/plain': openapi.String(), 20 | }, 21 | 'OK', 22 | ) 23 | @openapi.response( 24 | 401, 25 | { 26 | 'text/plain': openapi.String(), 27 | }, 28 | 'Invalid token or unauthorized', 29 | ) 30 | async def login(request): 31 | token = request.token 32 | info = None 33 | 34 | if token is None: 35 | return text('No token provided', 401) 36 | 37 | try: 38 | info = decode_token(token) 
39 | except jwt.exceptions.InvalidTokenError: 40 | return text('The provided token is invalid', 401) 41 | 42 | # Do not proceed if the token does not contain the required information 43 | if info is None or 'sub' not in info or 'preferred_username' not in info: 44 | return text( 45 | 'The provided token can not be used for authentication', 401 46 | ) 47 | 48 | # Store or update the token and the user id in the database 49 | uid = int(info['sub']) 50 | username = info['preferred_username'] 51 | timestamp = datetime.datetime.now(datetime.timezone.utc) 52 | 53 | await Sanic.get_app().ctx.db.users.update_one( 54 | { 55 | '_id': uid, 56 | }, 57 | { 58 | '$setOnInsert': { 59 | '_id': uid, 60 | 'created_at': timestamp, 61 | }, 62 | '$set': { 63 | 'token': token, 64 | 'user': username, 65 | 'last_validated_at': timestamp, 66 | }, 67 | }, 68 | upsert=True, 69 | ) 70 | 71 | return text('OK', 200) 72 | 73 | 74 | @blueprint.get('/logout') 75 | @openapi.description('Logout with a valid OpenID Connect Token (JWT)') 76 | @openapi.secured('token') 77 | @openapi.response( 78 | 200, 79 | { 80 | 'text/plain': openapi.String(), 81 | }, 82 | 'OK', 83 | ) 84 | @protected 85 | async def logout(request): 86 | await Sanic.get_app().ctx.db.users.update_one( 87 | { 88 | '_id': request.ctx.uid, 89 | }, 90 | {'$set': {'token': None}}, 91 | ) 92 | return text('OK', 200) 93 | -------------------------------------------------------------------------------- /api/auth.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import jwt 4 | from sanic import HTTPResponse, Sanic 5 | from sanic.request import Request 6 | from sanic.response import text 7 | 8 | from config import config 9 | 10 | 11 | def protected(wrapped): 12 | def decorator(f): 13 | @wraps(f) 14 | async def decorated_function(request, *args, **kwargs): 15 | # Call the request handler only if there is a known uid for the 16 | # token which is already attached to the request context through 17 | # the middleware below before every request 18 | if request.ctx.uid is None: 19 | return text('You are unauthorized', 401) 20 | else: 21 | response = await f(request, *args, **kwargs) 22 | return response 23 | 24 | return decorated_function 25 | 26 | return decorator(wrapped) 27 | 28 | 29 | def decode_token(token): 30 | signing_key = Sanic.get_app().ctx.jwks_client.get_signing_key_from_jwt( 31 | token 32 | ) 33 | return jwt.decode( 34 | token, 35 | signing_key, 36 | audience=config['OPENSTREETMAP_OAUTH_CLIENT_ID'], 37 | options={'verify_exp': False}, 38 | algorithms=['RS256'], 39 | ) 40 | 41 | 42 | async def is_authenticated(request): 43 | token = request.token 44 | if token is None: 45 | return False 46 | 47 | info = None 48 | try: 49 | info = decode_token(token) 50 | except jwt.exceptions.InvalidTokenError: 51 | return False 52 | 53 | if info is None or 'sub' not in info: 54 | return False 55 | 56 | return ( 57 | await Sanic.get_app().ctx.db.users.find_one({'_id': int(info['sub'])}) 58 | is not None 59 | ) 60 | 61 | 62 | async def attach_uid(request): 63 | request.ctx.uid = None 64 | 65 | token = request.token 66 | if token is None: 67 | return 68 | 69 | # Validate the JWT and extract the user id (sub claim) 70 | info = None 71 | try: 72 | info = decode_token(token) 73 | except jwt.exceptions.InvalidTokenError: 74 | return 75 | 76 | # Do not attach a uid if there is no information after decoding the token 77 | if info is None or 'sub' not in info: 78 | return 79 | 80 | # Check if the user exists 
(logged in before) and is currently using this token 81 | uid = int(info['sub']) 82 | user = await Sanic.get_app().ctx.db.users.find_one({'_id': uid}) 83 | if user is None or user['token'] != token: 84 | return 85 | 86 | # Finally attach the uid to the request context 87 | request.ctx.uid = uid 88 | -------------------------------------------------------------------------------- /schema/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "notesreview.notes": { 3 | "$jsonSchema": { 4 | "required": [ 5 | "_id", 6 | "coordinates", 7 | "status", 8 | "updated_at", 9 | "comments" 10 | ], 11 | "properties": { 12 | "_id": { 13 | "bsonType": "int", 14 | "description": "must be an int and is required" 15 | }, 16 | "coordinates": { 17 | "bsonType": "array", 18 | "description": "must be an array and is required" 19 | }, 20 | "status": { 21 | "enum": [ 22 | "open", 23 | "closed" 24 | ], 25 | "description": "can only be one of the enum values and is required" 26 | }, 27 | "updated_at": { 28 | "bsonType": "date", 29 | "description": "must be a date and is required" 30 | }, 31 | "comments": { 32 | "bsonType": "array", 33 | "description": "must be an array of objects and is required", 34 | "items": { 35 | "bsonType": "object", 36 | "required": [ 37 | "date", 38 | "action" 39 | ], 40 | "properties": { 41 | "date": { 42 | "bsonType": "date", 43 | "description": "must be a date and is required" 44 | }, 45 | "action": { 46 | "enum": [ 47 | "opened", 48 | "commented", 49 | "closed", 50 | "reopened", 51 | "hidden" 52 | ], 53 | "description": "can only be one of the enum values and is required" 54 | }, 55 | "text": { 56 | "bsonType": "string", 57 | "description": "must be a string and is not required" 58 | }, 59 | "uid": { 60 | "bsonType": "int", 61 | "description": "must be an int and is not required" 62 | }, 63 | "user": { 64 | "bsonType": "string", 65 | "description": "must be a string and is not required" 66 | } 67 | } 68 | } 69 | } 70 | } 71 | } 72 | }, 73 | "notesreview.users": { 74 | "$jsonSchema": { 75 | "bsonType": "object", 76 | "required": [ 77 | "_id", 78 | "created_at", 79 | "last_validated_at", 80 | "token", 81 | "user" 82 | ], 83 | "properties": { 84 | "_id": { 85 | "bsonType": "int", 86 | "description": "must be an int and is required" 87 | }, 88 | "created_at": { 89 | "bsonType": "date", 90 | "description": "must be a date and is required" 91 | }, 92 | "last_validated_at": { 93 | "bsonType": "date", 94 | "description": "must be a date and is required" 95 | }, 96 | "token": { 97 | "bsonType": [ 98 | "string", 99 | "null" 100 | ], 101 | "description": "must be a string or null and is required" 102 | }, 103 | "user": { 104 | "bsonType": "string", 105 | "description": "must be a string and is required" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /scripts/delete.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import iteration 5 | from dotenv import load_dotenv 6 | from lxml import etree 7 | from pymongo import MongoClient 8 | from tqdm import tqdm 9 | 10 | load_dotenv() 11 | 12 | client = MongoClient( 13 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 14 | tz_aware=True, 15 | ) 16 | collection = client.notesreview.notes 17 | 18 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 19 | 20 | 21 | # Find all ids of the 
notes which are included in the current notes dump 22 | def ids(file): 23 | ids = set() 24 | last_id = 0 25 | 26 | def process_element(element): 27 | nonlocal last_id 28 | 29 | attributes = element.attrib 30 | id = int(attributes['id']) 31 | last_id = id 32 | ids.add(id) 33 | 34 | iteration.fast_iter( 35 | tqdm(etree.iterparse(file, tag='note', events=('end',))), 36 | process_element, 37 | ) 38 | return ids, last_id 39 | 40 | 41 | # Delete (or only print the ids of) all notes that are stored in the database but not included in the set of ids 42 | def delete(ids_in_dump, last_id, delete): 43 | ids_in_db = set() 44 | # Iterate over all documents with an id lower than the last id of the notes dump 45 | for note in tqdm( 46 | collection.find({}, {'_id': True}).max([('_id', last_id)]).hint('_id_') 47 | ): 48 | if note['_id'] not in ids_in_dump: 49 | # Add the id to the set if the note is in the database but not the dump 50 | ids_in_db.add(note['_id']) 51 | tqdm.write(str(note['_id'])) 52 | else: 53 | # Remove the id if the note is in the database and the dump 54 | ids_in_dump.remove(note['_id']) 55 | 56 | # ids_in_dump(_but_not_in_db) contains all notes that are in the dump but not in the database, 57 | # ids_in_db(_but_not_in_dump) contains all notes that are in the database but not in the dump 58 | tqdm.write( 59 | f'There are currently {len(ids_in_dump)} notes that are in the dump but not in the database' 60 | ) 61 | tqdm.write( 62 | f'There are currently {len(ids_in_db)} notes that are in the database but not in the dump' 63 | ) 64 | 65 | if delete: 66 | # Delete all notes that are currently in the database but not in the dump 67 | result = collection.delete_many( 68 | {'_id': {'$in': list(ids_in_db)}}, hint='_id_' 69 | ) 70 | tqdm.write( 71 | f'Deleted {result.deleted_count} notes which are not present in the notes dump anymore' 72 | ) 73 | # Use the creation date of the last note in the dump as the timestamp of the last synchronization 74 | last_note = collection.find_one({'_id': last_id}) 75 | last_date = last_note['comments'][0]['date'] 76 | with open(os.path.join(DIRECTORY, 'LAST_SYNC.txt'), 'w') as file: 77 | file.write(last_date.isoformat(timespec='seconds')) 78 | 79 | 80 | parser = argparse.ArgumentParser( 81 | description='Delete notes that are not included in the notes dump.' 
82 | ) 83 | parser.add_argument( 84 | 'file', type=str, help='path to the file which contains the notes dump' 85 | ) 86 | parser.add_argument( 87 | '--delete', 88 | default=False, 89 | action='store_true', 90 | help='confirm deletion of the notes', 91 | ) 92 | args = parser.parse_args() 93 | 94 | delete(*ids(args.file), args.delete) 95 | -------------------------------------------------------------------------------- /scripts/import.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | import textwrap 5 | 6 | import dateutil.parser 7 | import iteration 8 | from dotenv import load_dotenv 9 | from lxml import etree 10 | from pymongo import MongoClient, UpdateOne 11 | from tqdm import tqdm 12 | 13 | load_dotenv() 14 | 15 | client = MongoClient( 16 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 17 | tz_aware=True, 18 | ) 19 | collection = client.notesreview.notes 20 | 21 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | # Parses an XML file containing all notes and inserts them into the database 25 | def insert(file): 26 | # Notes are inserted/updated in batches of 50000 27 | BATCH_SIZE = 50000 28 | 29 | operations = [] 30 | # 0. Deleted 1. Added, 2. Updated, 3. Matched 31 | all_stats = [0, 0, 0, 0] 32 | last_id = 0 33 | 34 | def process_element(element): 35 | nonlocal operations, all_stats, last_id 36 | 37 | try: 38 | attributes = element.attrib 39 | id = int(attributes['id']) 40 | last_id = id 41 | comments = parse(element) 42 | note = { 43 | '_id': id, 44 | 'coordinates': [ 45 | float(attributes['lon']), 46 | float(attributes['lat']), 47 | ], 48 | 'status': 'closed' if 'closed_at' in attributes else 'open', 49 | 'updated_at': comments[-1]['date'], 50 | 'comments': comments, 51 | } 52 | except Exception: 53 | tqdm.write(f'Failed to parse note with the id {id}') 54 | return 55 | 56 | operations.append( 57 | UpdateOne( 58 | {'_id': id}, 59 | { 60 | '$set': { 61 | 'status': note['status'], 62 | 'updated_at': note['updated_at'], 63 | 'comments': note['comments'], 64 | }, 65 | '$setOnInsert': { 66 | 'coordinates': note['coordinates'], 67 | }, 68 | }, 69 | upsert=True, 70 | hint='_id_', 71 | ) 72 | ) 73 | 74 | if len(operations) >= BATCH_SIZE: 75 | stats = write(operations) 76 | all_stats = [sum(x) for x in zip(all_stats, stats)] 77 | operations = [] 78 | 79 | iteration.fast_iter( 80 | tqdm(etree.iterparse(file, tag='note', events=('end',))), 81 | process_element, 82 | ) 83 | 84 | if len(operations) > 0: 85 | stats = write(operations) 86 | all_stats = [sum(x) for x in zip(all_stats, stats)] 87 | operations = [] 88 | 89 | # Use the creation date of the last note in the dump as the timestamp of the last import 90 | last_date = collection.find_one({'_id': last_id})['comments'][0]['date'] 91 | with open(os.path.join(DIRECTORY, 'LAST_IMPORT.txt'), 'w') as file: 92 | file.write(last_date.isoformat(timespec='seconds')) 93 | 94 | tqdm.write( 95 | textwrap.dedent( 96 | f""" 97 | ---------------------------------------- 98 | IMPORT SUMMARY 99 | -------------------- 100 | Deleted {all_stats[0]} notes 101 | Added {all_stats[1]} new notes 102 | Updated {all_stats[2]} already existing notes 103 | Matched {all_stats[3]} notes 104 | ---------------------------------------- 105 | Please make sure to run the update script at least until {last_date.isoformat(timespec='seconds')} 106 | to import all changes between the creation of the 
notes dump and now. 107 | """ 108 | ) 109 | ) 110 | 111 | 112 | # Write operations to the database using the bulk write feature 113 | def write(operations): 114 | result = collection.bulk_write(operations, ordered=False) 115 | if result.bulk_api_result['writeErrors']: 116 | client.events.errors.insert_one( 117 | { 118 | 'type': 'import_error', 119 | 'timestamp': datetime.datetime.now(datetime.timezone.utc), 120 | 'error': result.bulk_api_result['writeErrors'], 121 | } 122 | ) 123 | return [ 124 | result.bulk_api_result['nRemoved'], 125 | result.bulk_api_result['nInserted'], 126 | result.bulk_api_result['nModified'], 127 | result.bulk_api_result['nMatched'], 128 | ] 129 | 130 | 131 | # Parse the comments and extract only the useful information 132 | def parse(note): 133 | comments = [] 134 | for element in note: 135 | attributes = element.attrib 136 | 137 | comment = { 138 | 'date': dateutil.parser.parse(attributes['timestamp']), 139 | 'action': attributes['action'], 140 | 'text': element.text, 141 | } 142 | if 'uid' in attributes: 143 | comment['uid'] = int(attributes['uid']) 144 | if 'user' in attributes: 145 | comment['user'] = attributes['user'] 146 | if not element.text: 147 | del comment['text'] 148 | 149 | comments.append(comment) 150 | return comments 151 | 152 | 153 | parser = argparse.ArgumentParser(description='Import notes from a notes dump.') 154 | parser.add_argument( 155 | 'file', type=str, help='path to the file which contains the notes dump' 156 | ) 157 | args = parser.parse_args() 158 | 159 | insert(args.file) 160 | -------------------------------------------------------------------------------- /blueprints/search.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import orjson 4 | from sanic import Blueprint, Sanic 5 | from sanic.response import json 6 | from sanic_ext import openapi 7 | 8 | from api.models.note import Note 9 | from api.query import Filter, Sort 10 | from config import config 11 | 12 | blueprint = Blueprint('Search', url_prefix='/search') 13 | 14 | 15 | @blueprint.route('/', methods=['GET', 'POST']) 16 | @openapi.description('Search and filter all notes in the database') 17 | @openapi.parameter( 18 | 'query', 19 | openapi.String( 20 | description=dedent( 21 | """\ 22 | A word or sentence which can be found in the comments. 23 | To find an exact occurrence of a word or sentence, wrap it in quotation marks `"{query}"`. 24 | Single words can be excluded from the result by prepending a dash `-` to the word. 25 | Spaces and other delimiters like dots are currently treated as a logical OR, 26 | though this will likely change in the future. 
27 | """ 28 | ), 29 | default=None, 30 | required=False, 31 | ), 32 | ) 33 | @openapi.parameter( 34 | 'bbox', 35 | openapi.String( 36 | description='Two pairs of coordinates (bottom left and upper right corner) specifying a rectangular box in which all results are located', 37 | example='-87.6955,41.8353,-87.5871,41.9170', 38 | default=None, 39 | ), 40 | ) 41 | @openapi.parameter( 42 | 'polygon', 43 | openapi.String( 44 | description='A GeoJSON polygon specifying a region in which all results are located', 45 | default=None, 46 | ), 47 | ) 48 | @openapi.parameter( 49 | 'status', 50 | openapi.String( 51 | description='The current status of the note', 52 | enum=('all', 'open', 'closed'), 53 | default='all', 54 | ), 55 | ) 56 | @openapi.parameter( 57 | 'anonymous', 58 | openapi.String( 59 | description='Whether anonymous notes should be included in the results, hidden from them, or be the only ones returned', 60 | enum=('include', 'hide', 'only'), 61 | default='include', 62 | ), 63 | ) 64 | @openapi.parameter( 65 | 'author', 66 | openapi.String( 67 | description='Name of the user who opened the note; searching for multiple users is possible by separating them with a comma', 68 | default=None, 69 | ), 70 | ) 71 | @openapi.parameter( 72 | 'user', 73 | openapi.String( 74 | description='Name of any user who commented on the note; searching for multiple users is possible by separating them with a comma', 75 | default=None, 76 | ), 77 | ) 78 | @openapi.parameter( 79 | 'after', 80 | openapi.DateTime( 81 | description='Only return notes updated or created after this date', 82 | default=None, 83 | example='2020-03-13T10:20:24', 84 | ), 85 | ) 86 | @openapi.parameter( 87 | 'before', 88 | openapi.DateTime( 89 | description='Only return notes updated or created before this date', 90 | default=None, 91 | example='2020-05-11T07:10:45', 92 | ), 93 | ) 94 | @openapi.parameter( 95 | 'comments', 96 | openapi.Integer( 97 | description='Filters the amount of comments on a note (the opening comment is not counted)', 98 | minimum=0, 99 | default=None, 100 | ), 101 | ) 102 | @openapi.parameter( 103 | 'commented', 104 | openapi.String( 105 | description='Whether commented notes should be included in the results, hidden from them, or be the only ones returned', 106 | enum=('include', 'hide', 'only'), 107 | default='include', 108 | ), 109 | ) 110 | @openapi.parameter( 111 | 'sort_by', 112 | openapi.String( 113 | description='Sort notes either by no criteria, the date of the last update or their creation date', 114 | enum=('none', 'updated_at', 'created_at'), 115 | default='updated_at', 116 | ), 117 | ) 118 | @openapi.parameter( 119 | 'order', 120 | openapi.String( 121 | description='Sort notes either in ascending or descending order', 122 | enum=('descending', 'desc', 'ascending', 'asc'), 123 | default='descending', 124 | ), 125 | ) 126 | @openapi.parameter( 127 | 'limit', 128 | openapi.Integer( 129 | description='Limit the amount of notes to return', 130 | minimum=1, 131 | maximum=config['MAX_LIMIT'], 132 | default=config['DEFAULT_LIMIT'], 133 | ), 134 | ) 135 | @openapi.response( 136 | 200, 137 | {'application/json': openapi.Array(items=Note, uniqueItems=True)}, 138 | 'The response is an array containing the notes with the requested information', 139 | ) 140 | @openapi.response( 141 | 400, 142 | { 143 | 'application/json': openapi.Object( 144 | properties={'error': openapi.String()} 145 | ) 146 | }, 147 | 'In case one of the parameters is invalid, the response contains the error message', 148 | ) 149 | async def index(request): 150 | try: 151 | args = None 152 | if request.method == 'GET': 153 | 
args = request.args 154 | elif request.method == 'POST': 155 | args = request.json 156 | sort, filter, limit = parse(args) 157 | except ValueError as error: 158 | return json({'error': str(error)}, status=400) 159 | 160 | return await find(sort, filter, limit) 161 | 162 | 163 | def parse(data): 164 | sort = ( 165 | Sort() 166 | .by(data.get('sort_by', 'updated_at')) 167 | .order(data.get('order', 'descending')) 168 | .build() 169 | ) 170 | filter = ( 171 | Filter(sort) 172 | .query(data.get('query')) 173 | .bbox(data.get('bbox')) 174 | .polygon(data.get('polygon')) 175 | .status(data.get('status')) 176 | .anonymous(data.get('anonymous')) 177 | .author(data.get('author')) 178 | .user(data.get('user')) 179 | .after(data.get('after', None)) 180 | .before(data.get('before', None)) 181 | .comments(data.get('comments', None)) 182 | .commented(data.get('commented')) 183 | .build() 184 | ) 185 | limit = data.get('limit', config['DEFAULT_LIMIT']) 186 | 187 | return sort, filter, limit 188 | 189 | 190 | async def find(sort, filter, limit): 191 | # Apply the default limit in case the argument could not be parsed (e.g. for limit=NaN or a null value) 192 | try: 193 | limit = int(limit) 194 | except (ValueError, TypeError): 195 | limit = config['DEFAULT_LIMIT'] 196 | 197 | if limit > config['MAX_LIMIT']: 198 | return json( 199 | {'error': f"Limit must not be higher than {config['MAX_LIMIT']}."}, 200 | status=400, 201 | ) 202 | 203 | # Prevent a limit of 0 from being treated as no limit at all 204 | if limit == 0: 205 | limit = config['DEFAULT_LIMIT'] 206 | 207 | cursor = Sanic.get_app().ctx.db.notes.find(filter).limit(limit) 208 | # Queries are faster without an explicit sort order, so only sort when one is requested 209 | if sort[0] is not None: 210 | cursor = cursor.sort(*sort) 211 | 212 | result = [] 213 | async for document in cursor: 214 | result.append(document) 215 | return json(result, dumps=orjson.dumps, option=orjson.OPT_NAIVE_UTC) 216 | -------------------------------------------------------------------------------- /scripts/update.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import math 4 | import os 5 | import textwrap 6 | import urllib.parse 7 | 8 | import dateutil.parser 9 | import requests 10 | from dotenv import load_dotenv 11 | from pymongo import DeleteOne, InsertOne, MongoClient, UpdateOne 12 | 13 | load_dotenv() 14 | 15 | client = MongoClient( 16 | f'mongodb://{os.environ.get("DB_USER")}:{os.environ.get("DB_PASSWORD")}@127.0.0.1:27017/?authSource=notesreview', 17 | tz_aware=True, 18 | ) 19 | collection = client.notesreview.notes 20 | 21 | DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | # Fills up the database by iterating over the OSM Notes API 25 | # The current implementation is based on the last update of a note: 26 | # all notes between now and a given date (the date of the last update) are imported into the database 27 | def update(limit=100): 28 | # This variable is used in the while loop to ensure only notes of a specific timespan are fetched 29 | upper_bound = datetime.datetime.now(datetime.timezone.utc) 30 | # The start time of this function is used at the end to update the timestamp of the last update 31 | update_start_time = upper_bound 32 | with open(os.path.join(DIRECTORY, 'LAST_UPDATE.txt')) as file: 33 | last_update = dateutil.parser.parse(file.read()) 34 | 35 | # Estimate a useful limit, assuming a new note action every 15 seconds 36 | diff = (upper_bound - last_update).total_seconds() 37 | useful_limit 
= math.ceil(diff * (1 / 15)) 38 | useful_limit = min(10000, useful_limit) 39 | 40 | # 0. Deleted, 1. Added, 2. Updated, 3. Ignored 41 | all_stats = [0, 0, 0, 0] 42 | all_ignored = False 43 | 44 | # Stop either when the stop date (i.e. the date of the last update) is exceeded or when all notes are ignored during insertion 45 | while ( 46 | upper_bound is not None 47 | and upper_bound > last_update 48 | and not all_ignored 49 | ): 50 | url = build_url( 51 | { 52 | 'from': last_update.isoformat(), 53 | 'to': upper_bound.isoformat(), 54 | 'limit': str(limit), 55 | } 56 | ) 57 | response = requests.get(url).json() 58 | features = response['features'] 59 | 60 | stats, oldest = insert(features) 61 | all_stats = [sum(x) for x in zip(all_stats, stats)] 62 | 63 | # Check whether all features were ignored, meaning there are no updates anymore 64 | all_ignored = stats[3] == len(features) 65 | upper_bound = oldest 66 | 67 | print( 68 | textwrap.dedent( 69 | f""" 70 | ---------------------------------------- 71 | UPDATE SUMMARY 72 | -------------------- 73 | Last update: {last_update.isoformat(timespec='seconds')} 74 | End of update: {update_start_time.isoformat(timespec='seconds')} 75 | Time in seconds since last update: {round(diff)} 76 | Expected a useful limit of {useful_limit} while {all_stats[0] + all_stats[1] + all_stats[2]} was actually needed 77 | -------------------- 78 | Deleted {all_stats[0]} notes 79 | Added {all_stats[1]} new notes 80 | Updated {all_stats[2]} already existing notes 81 | Ignored {all_stats[3]} already existing notes 82 | ---------------------------------------- 83 | """ 84 | ) 85 | ) 86 | 87 | with open(os.path.join(DIRECTORY, 'LAST_UPDATE.txt'), 'w') as file: 88 | file.write(update_start_time.isoformat(timespec='seconds')) 89 | # ---------------------------------------- # 90 | 91 | 92 | def build_url(query={}): 93 | defaults = { 94 | 'sort': 'updated_at', 95 | 'closed': '-1', 96 | 'limit': '100', 97 | # The start date needs to be specified because otherwise the value of the 98 | # 'to' parameter has no effect (use the beginning of OpenStreetMap notes) 99 | 'from': dateutil.parser.parse('2013-04-23T00:00:00'), 100 | } 101 | host = 'https://api.openstreetmap.org/api/0.6/notes/search.json' 102 | url = host + '?' 
+ urllib.parse.urlencode({**defaults, **query}) 103 | return url 104 | 105 | 106 | # Parse the comments and extract only the useful information 107 | def parse(comments): 108 | for comment in comments: 109 | if 'date' in comment: 110 | comment['date'] = dateutil.parser.parse(comment['date']) 111 | if 'user_url' in comment: 112 | del comment['user_url'] 113 | if 'html' in comment: 114 | del comment['html'] 115 | if not comment['text']: 116 | del comment['text'] 117 | return comments 118 | 119 | 120 | # Loops through the provided list of notes and: 121 | # - Adds notes if they are unknown 122 | # - Updates notes if there is a different version 123 | # - Ignores notes which are the same 124 | def insert(features): 125 | operations = [] 126 | deleted = 0 127 | inserted = 0 128 | updated = 0 129 | ignored = 0 130 | oldest = None 131 | 132 | for feature in features: 133 | comments = parse(feature['properties']['comments']) 134 | note = { 135 | '_id': feature['properties']['id'], 136 | 'coordinates': feature['geometry']['coordinates'], 137 | 'status': feature['properties']['status'], 138 | 'updated_at': None if len(comments) == 0 else comments[-1]['date'], 139 | 'comments': comments, 140 | } 141 | query = {'_id': note['_id']} 142 | 143 | # If comments are invisible because of account deletion or other reasons, 144 | # a note might not contain any comments at all 145 | # see also https://github.com/openstreetmap/openstreetmap-website/issues/2146 146 | if len(note['comments']) == 0: 147 | # Notes without any comments are basically useless and should be deleted, 148 | # especially as the comments might have been removed by a moderator 149 | # and should not be visible to the public 150 | operations.append(DeleteOne(query)) 151 | deleted += 1 152 | continue 153 | 154 | # Check whether the note is already in the database and proceed with different operations 155 | document = collection.find_one(query) 156 | if document is None: 157 | # Note is not yet in the database, insert it 158 | operations.append(InsertOne(note)) 159 | inserted += 1 160 | elif note == document: 161 | # Note is already stored in the database, the statement is only true if 162 | # "both dictionaries have the same (key, value) pairs (regardless of ordering)" 163 | # See https://docs.python.org/3/library/stdtypes.html#dict 164 | # Note is the same as the one that is already saved, should be ignored 165 | ignored += 1 166 | else: 167 | # Note is different to the one that is already saved, needs to be updated 168 | operations.append( 169 | UpdateOne( 170 | query, 171 | { 172 | '$set': { 173 | 'status': note['status'], 174 | 'updated_at': note['updated_at'], 175 | 'comments': note['comments'], 176 | } 177 | }, 178 | ) 179 | ) 180 | updated += 1 181 | 182 | # Check whether this note is the one with the oldest update date (for the upper bound of the next request) 183 | last_changed = note['comments'][-1]['date'] 184 | # Only update the oldest changed date if the note is either new (i.e. no document exists yet) or has more comments than before. 185 | # This generally means that no comments were hidden and the last changed date is in fact also the last update date. 186 | # And obviously only update the date if it is older than the current oldest date. 
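# For example: if a batch returns notes whose latest comments are dated 12:00, 11:58 and 11:55, and all of them are new or grew in comment count, `oldest` becomes 11:55 and the next request fetches the window between `last_update` and 11:55, walking backwards in time until the whole timespan since the last update is covered.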
187 | if ( 188 | document is None 189 | or len(note['comments']) > len(document['comments']) 190 | ) and (oldest is None or last_changed < oldest): 191 | oldest = last_changed 192 | 193 | if len(operations) != 0: 194 | result = collection.bulk_write(operations, ordered=False) 195 | if result.bulk_api_result['writeErrors']: 196 | client.events.errors.insert_one( 197 | { 198 | 'type': 'update_error', 199 | 'timestamp': datetime.datetime.now(datetime.timezone.utc), 200 | 'error': result.bulk_api_result['writeErrors'], 201 | } 202 | ) 203 | return [deleted, inserted, updated, ignored], oldest 204 | 205 | 206 | parser = argparse.ArgumentParser( 207 | description='Update notes between the last check and now.' 208 | ) 209 | parser.add_argument( 210 | '-l', 211 | '--limit', 212 | type=int, 213 | default=100, 214 | help='set the batch size limit (default: 100)', 215 | ) 216 | args = parser.parse_args() 217 | 218 | update(args.limit) 219 | -------------------------------------------------------------------------------- /api/query.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import dateutil.parser 5 | import lark 6 | import orjson 7 | 8 | 9 | class Sort(object): 10 | def __init__(self): 11 | self.sort = {} 12 | 13 | def build(self): 14 | return self.sort.get('by'), self.sort.get('order') 15 | 16 | def by(self, by): 17 | allowed = ['none', 'updated_at', 'created_at'] 18 | if by not in allowed: 19 | raise ValueError(f'Sort must be one of {allowed}') 20 | 21 | if by == 'none': 22 | pass 23 | elif by == 'updated_at': 24 | self.sort['by'] = 'updated_at' 25 | elif by == 'created_at': 26 | self.sort['by'] = 'comments.0.date' 27 | return self 28 | 29 | def order(self, order): 30 | allowed = ['desc', 'descending', 'asc', 'ascending'] 31 | if order not in allowed: 32 | raise ValueError(f'Order must be one of {allowed}') 33 | 34 | if order in ['asc', 'ascending']: 35 | self.sort['order'] = 1 36 | elif order in ['desc', 'descending']: 37 | self.sort['order'] = -1 38 | return self 39 | 40 | 41 | class Filter(object): 42 | def __init__(self, sort): 43 | self.filter = {} 44 | self.sort = sort 45 | self.users = Users() 46 | 47 | def build(self): 48 | return self.filter 49 | 50 | def query(self, query): 51 | if query is not None: 52 | self.filter['comments.0.text'] = { 53 | '$regex': ( 54 | query.removeprefix('regex:') 55 | if query.startswith('regex:') 56 | else re.escape(query) 57 | ), 58 | '$options': 'i', 59 | } 60 | return self 61 | 62 | def bbox(self, bbox): 63 | if bbox is not None: 64 | bbox = BoundingBox(bbox) 65 | self.filter['coordinates'] = { 66 | '$geoWithin': { 67 | '$box': [ 68 | # bottom left coordinates (longitude, latitude) 69 | [bbox.x1, bbox.y1], 70 | # upper right coordinates (longitude, latitude) 71 | [bbox.x2, bbox.y2], 72 | ] 73 | } 74 | } 75 | return self 76 | 77 | def polygon(self, polygon): 78 | if polygon is not None: 79 | polygon = Polygon(polygon) 80 | self.filter['coordinates'] = { 81 | '$geoWithin': { 82 | '$geometry': { 83 | 'type': polygon.type, 84 | 'coordinates': polygon.coordinates, 85 | } 86 | } 87 | } 88 | return self 89 | 90 | def status(self, status): 91 | if status not in [None, 'all', 'open', 'closed']: 92 | raise ValueError('Status must be one of [all, open, closed]') 93 | 94 | if status not in [None, 'all']: 95 | self.filter['status'] = status 96 | return self 97 | 98 | def anonymous(self, anonymous): 99 | if anonymous not in [None, 'include', 'hide', 'only']: 100 | raise ValueError('Anonymous must 
be one of [include, hide, only]') 101 | 102 | if anonymous is not None: 103 | # Filtering out anonymous notes means that there must be a user who created the note 104 | if anonymous == 'hide': 105 | self.filter['comments.0.user'] = {'$exists': True} 106 | if anonymous == 'only': 107 | self.filter['comments.0.user'] = {'$exists': False} 108 | return self 109 | 110 | def author(self, author): 111 | if author is not None: 112 | include, exclude = self.users.parse(author) 113 | if 'comments.0.user' not in self.filter: 114 | self.filter['comments.0.user'] = {} 115 | self.filter['comments.0.user'].update( 116 | self.clean({'$in': include, '$nin': exclude}) 117 | ) 118 | return self 119 | 120 | def user(self, user): 121 | if user is not None: 122 | include, exclude = self.users.parse(user) 123 | if 'comments.user' not in self.filter: 124 | self.filter['comments.user'] = {} 125 | self.filter['comments.user'].update( 126 | self.clean({'$all': include, '$nin': exclude}) 127 | ) 128 | return self 129 | 130 | def after(self, after): 131 | if after is not None: 132 | key = self.sort[0] 133 | # If results will be unsorted, use the creation date for the comparison 134 | if key is None: 135 | key = 'comments.0.date' 136 | 137 | if key not in self.filter: 138 | self.filter[key] = {} 139 | self.filter[key]['$gt'] = dateutil.parser.parse(after) 140 | return self 141 | 142 | def before(self, before): 143 | if before is not None: 144 | key = self.sort[0] 145 | # If results will be unsorted, use the creation date for the comparison 146 | if key is None: 147 | key = 'comments.0.date' 148 | 149 | if key not in self.filter: 150 | self.filter[key] = {} 151 | self.filter[key]['$lt'] = dateutil.parser.parse(before) 152 | return self 153 | 154 | def comments(self, amount_of_comments): 155 | if amount_of_comments is not None: 156 | # A comment of the note counts as everything after the original comment 157 | self.filter['comments'] = {'$size': int(amount_of_comments) + 1} 158 | return self 159 | 160 | def commented(self, commented): 161 | if commented not in [None, 'include', 'hide', 'only']: 162 | raise ValueError('Commented must be one of [include, hide, only]') 163 | 164 | if commented is not None: 165 | # Filtering out commented notes means that only the original comment exists 166 | if commented == 'hide': 167 | self.filter['comments'] = {'$size': 1} 168 | # Showing only commented notes requires the amount of comments to be greater than 1 169 | # This is not directly allowed (since $size does not accept ranges of values, e.g. 
via $gt), 170 | # so instead show only notes with an amount of comments different from 1 171 | # (notes with 0 comments do not exist) 172 | if commented == 'only': 173 | self.filter['comments'] = {'$not': {'$size': 1}} 174 | return self 175 | 176 | # Remove values that are not defined or empty from a given dictionary 177 | def clean(self, dictionary): 178 | return { 179 | k: v 180 | for k, v in dictionary.items() 181 | if v is not None and (type(v) is list and len(v) > 0) 182 | } 183 | 184 | 185 | class BoundingBox(object): 186 | def __init__(self, bbox): 187 | bbox = [float(x) for x in bbox.split(',')] 188 | if len(bbox) != 4: 189 | raise ValueError( 190 | 'The bounding box does not contain all required coordinates' 191 | ) 192 | 193 | self.x1 = bbox[0] 194 | self.y1 = bbox[1] 195 | self.x2 = bbox[2] 196 | self.y2 = bbox[3] 197 | self.check() 198 | 199 | def check(self): 200 | if self.x1 > self.x2: 201 | raise ValueError( 202 | 'The minimum longitude must be smaller than the maximum longitude' 203 | ) 204 | if self.y1 > self.y2: 205 | raise ValueError( 206 | 'The minimum latitude must be smaller than the maximum latitude' 207 | ) 208 | if self.x1 < -180 or self.y1 < -90 or self.x2 > +180 or self.y2 > +90: 209 | raise ValueError( 210 | 'The bounding box exceeds the size of the world, please specify a smaller bounding box' 211 | ) 212 | 213 | 214 | class Polygon(object): 215 | def __init__(self, polygon): 216 | polygon = orjson.loads(polygon) 217 | if 'type' not in polygon or 'coordinates' not in polygon: 218 | raise ValueError( 219 | 'Polygon does not contain information about type or any coordinates' 220 | ) 221 | 222 | self.type = polygon['type'] 223 | self.coordinates = polygon['coordinates'] 224 | self.check() 225 | 226 | def check(self): 227 | if self.type not in ['Polygon', 'MultiPolygon']: 228 | raise ValueError( 229 | 'The GeoJSON shape must be either a Polygon or a MultiPolygon' 230 | ) 231 | if type(self.coordinates) is not list: 232 | raise ValueError('Coordinates have to be supplied as an array') 233 | 234 | 235 | class Users(object): 236 | def __init__(self): 237 | with open( 238 | os.path.join(os.path.dirname(__file__), 'grammars', 'users.lark') 239 | ) as file: 240 | self.grammar = lark.Lark(file.read()) 241 | 242 | def parse(self, input): 243 | tree = self.grammar.parse(input) 244 | include = [] 245 | exclude = [] 246 | 247 | for node in tree.children: 248 | if isinstance(node.children[0], lark.Token): 249 | include.append(node.children[0].value) 250 | elif ( 251 | isinstance(node.children[0], lark.Tree) 252 | and node.children[0].data == 'not' 253 | ): 254 | exclude.append(node.children[0].children[0].value) 255 | 256 | if len(include) + len(exclude) > 10: 257 | raise ValueError( 258 | 'The amount of users to search for exceeds the limit' 259 | ) 260 | 261 | return include, exclude 262 | --------------------------------------------------------------------------------
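For reference, a minimal sketch of how `Sort` and `Filter` compose into a MongoDB query, mirroring `parse()` in `blueprints/search.py`; the commented result is an assumption derived from the builder logic above, not output produced by the repository:

```python
from api.query import Filter, Sort

sort = Sort().by('created_at').order('descending').build()
# sort == ('comments.0.date', -1)

filter = (
    Filter(sort)
    .status('open')     # only open notes
    .anonymous('hide')  # requires comments.0.user to exist
    .author('ENT8R')    # parsed by the users.lark grammar
    .after('2020-03-13T10:20:24')
    .build()
)
# Roughly equivalent to:
# {
#     'status': 'open',
#     'comments.0.user': {'$exists': True, '$in': ['ENT8R']},
#     'comments.0.date': {'$gt': datetime.datetime(2020, 3, 13, 10, 20, 24)},
# }
```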