├── .bumpversion.cfg ├── .cache └── .gitignore ├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── docker-build.yml │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Procfile ├── README.md ├── app.json ├── docker-compose.yml ├── docs ├── development.md └── linguee-api.png ├── env.example ├── fly.toml ├── linguee_api ├── __init__.py ├── api.py ├── config.py ├── const.py ├── downloaders │ ├── __init__.py │ ├── error_downloader.py │ ├── file_cache.py │ ├── httpx_downloader.py │ ├── interfaces.py │ ├── memory_cache.py │ ├── mock_downloader.py │ └── sqlite_cache.py ├── linguee_client.py ├── models.py ├── parser_utils.py ├── parsers.py └── utils.py ├── mypy.ini ├── poetry.lock ├── pyproject.toml ├── runtime.txt ├── tests ├── conftest.py ├── parsers │ ├── __init__.py │ ├── test_autocompletions.py │ └── test_search_result.py ├── test_api_client.py ├── test_downloaders.py ├── test_file_cache.py ├── test_linguee_client.py ├── test_memory_cache.py └── test_sqlite_cache.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2.6.3 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:CHANGELOG.md] 11 | search = UNRELEASED 12 | replace = {new_version} ({now:%Y-%m-%d}) 13 | -------------------------------------------------------------------------------- /.cache/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | * 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .mypy_cache 3 | .pytest_cache 4 | .env 5 | .coverage 6 | .tox 7 | .cache 8 | .git 9 | coverage.xml 10 | htmlcov/ 11 | dist 12 | *.sqlite3 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, W503 4 | max-complexity = 10 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-publish: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: read 14 | packages: write 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v2 19 | 20 | - name: Login to GitHub Container Registry 21 | uses: docker/login-action@v2 22 | with: 23 | registry: ghcr.io 24 | username: ${{ github.actor }} 25 | password: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - name: Build and publish Docker image 28 | uses: docker/build-push-action@v4 29 | with: 30 | context: . 
31 | push: true 32 | tags: | 33 | ghcr.io/${{ github.repository }}:${{ github.sha }} 34 | ghcr.io/${{ github.repository }}:latest 35 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Install poetry 11 | run: pipx install poetry 12 | - uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.12" 15 | cache: "poetry" 16 | - name: Install dependencies 17 | run: | 18 | poetry install 19 | - name: Test with pytest 20 | run: | 21 | set -ex 22 | poetry run coverage run -m pytest 23 | - name: Convert coverage to XML 24 | run: | 25 | set -ex 26 | poetry run coverage xml 27 | - name: Upload coverage to codecov 28 | env: 29 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 30 | run: | 31 | bash <(curl -s https://codecov.io/bash) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .mypy_cache 3 | .pytest_cache 4 | .env 5 | .coverage 6 | .tox 7 | coverage.xml 8 | htmlcov/ 9 | dist 10 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | multi_line_output=3 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.9 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.4.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: check-merge-conflict 10 | - id: check-case-conflict 11 | - id: debug-statements 12 | 13 | - repo: https://github.com/psf/black 14 | rev: 23.3.0 15 | hooks: 16 | - id: black 17 | 18 | - repo: https://github.com/pre-commit/mirrors-mypy 19 | rev: v1.2.0 20 | hooks: 21 | - id: mypy 22 | 23 | - repo: https://github.com/PyCQA/isort 24 | rev: 5.12.0 25 | hooks: 26 | - id: isort 27 | 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 6.0.0 30 | hooks: 31 | - id: flake8 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## 2.6.3 (2024-08-14) 6 | 7 | - Updated pydantic to the latest 1.x version to address incompatibility with python 3.12.4. Ref: https://github.com/pydantic/pydantic/issues/9637 8 | 9 | ## 2.6.2 (2024-04-25) 10 | 11 | - Added support for Python 3.11 and 3.12. 12 | - Updated Dockerfile to use Python 3.12. 13 | - Added docker-build.yml action. 14 | 15 | ## 2.6.1 (2024-04-25) 16 | 17 | - Updated all dependencies. 18 | - Fixed a bug with usage frequency parsing (#48). 19 | 20 | ## 2.6.0 (2023-04-23) 21 | 22 | - Added SQLite cache and made it the default one. 23 | - Made MemoryCache use LRU. 24 | - Added tests for all cache classes. 25 | - Updated Dockerfile to use /cache for file and SQLite caches. 26 | - Added a sample docker-compose file. 27 | - Updated FastAPI and httpx dependencies. 
28 | 29 | ## 2.5.1 (2022-11-19) 30 | 31 | - Added FAQ to the README, where provided a clearer explanation of the 503 error. 32 | 33 | ## 2.5.0 (2022-11-19) 34 | 35 | - Added "follow_corrections" API flag (#23) 36 | - Added configuration to host the project on fly.io 37 | - Updated the address of the sample installation to https://linguee-api.fly.dev 38 | - Added lemma forms (#26) 39 | 40 | ## 2.4.0 (2022-08-01) 41 | 42 | - Set Heroku runtime to python-3.10.5 (#21) 43 | - Added "usage_frequency" attribute to translations (#22) 44 | 45 | ## 2.3.0 (2022-06-17) 46 | 47 | - Added packaging support 48 | - Added support for various versions of Python (3.8+) 49 | - Updated httpx to the latest version. Ref: CVE-2021-41945 50 | 51 | ## 2.2.1 (2022-04-20) 52 | 53 | - Updated development dependencies and pre-commit hooks 54 | - Provided usage examples for Python and Bash 55 | 56 | ## 2.2.0 (2021-09-28) 57 | 58 | - Fixed a bug with multiple grammar infos (#12). 59 | - Fixed a file cache issue on the Windows platform (#16). 60 | - Updated all dependencies to their latest versions. 61 | 62 | ## 2.1.0 (2021-05-16) 63 | 64 | - Added translation examples to the /translations API endpoint (#10). 65 | 66 | ## 2.0.0 (2021-04-29) 67 | 68 | - The first release of the Python version of the project. 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # First stage: Install poetry and dependencies 2 | FROM python:3.12-slim AS builder 3 | 4 | # Install system dependencies 5 | RUN apt-get update \ 6 | && apt-get install --no-install-recommends -y \ 7 | curl \ 8 | build-essential \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Set environment variables for poetry 13 | ENV POETRY_HOME="/opt/poetry" 14 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true 15 | ENV POETRY_NO_INTERACTION=1 16 | ENV PATH="$POETRY_HOME/bin:$PATH" 17 | 18 | # Install poetry 19 | RUN curl -sSL https://install.python-poetry.org | python3 - 20 | 21 | # Copy only requirements to cache them in docker layer 22 | WORKDIR /app 23 | COPY pyproject.toml poetry.lock /app/ 24 | 25 | # Install runtime deps - uses $POETRY_VIRTUALENVS_IN_PROJECT internally 26 | # and install spacy's en_core_web_sm 27 | RUN poetry install --only main --no-root 28 | 29 | 30 | # Second stage: Copy from builder and run 31 | FROM python:3.12-slim AS runner 32 | 33 | # Copy virtualenv from builder 34 | COPY --from=builder /app/.venv /app/.venv 35 | 36 | WORKDIR /app 37 | 38 | # Ensure we use the virtualenv 39 | ENV PATH="/app/.venv/bin:$PATH" 40 | 41 | # Copy the content of the app 42 | COPY . 
/app/ 43 | 44 | # Declare port FastAPI will use 45 | EXPOSE 8000 46 | 47 | # Declare the VOLUME and use it as a cache directory 48 | VOLUME /cache 49 | ENV CACHE_DIRECTORY=/cache 50 | 51 | # Command to run on container start 52 | CMD ["uvicorn", "linguee_api.api:app", "--host", "0.0.0.0", "--port", "8000"] 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Roman Imankulov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: uvicorn linguee_api.api:app --host=0.0.0.0 --port=$PORT 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Linguee API 2 | 3 | [Linguee](https://linguee.com) provides an excellent dictionary and translation memory service. Unfortunately, there is no way you can get automated access to it. Linguee API fixes the problem. It acts as a proxy and converts their HTML responses to an easy-to-use JSON API. 4 | 5 | ## API endpoints 6 | 7 | The proxy provides three API endpoints: for translations, for examples, and for external sources. 8 | 9 | ![Linguee API](./docs/linguee-api.png) 10 | 11 | The API documentation and the playground are available for the sample installation: 12 | 13 | - [Documentation and API playground](https://linguee-api.fly.dev/docs) 14 | - [The same documentation, but formatted with ReDoc](https://linguee-api.fly.dev/redoc) 15 | 16 | ## Sample installation 17 | 18 | A sample installation is available at https://linguee-api.fly.dev. 19 | 20 | - Get translations of the word "bacalhau" from Portuguese to English: [https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en). 21 | - Get a list of curated examples: [https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en). 22 | - Get examples from external sources: [https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en). 
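The code also exposes a fourth endpoint, `/api/v2/autocompletions`, defined in `linguee_api/api.py` but not listed above. A minimal sketch of calling it with Python and `requests`, assuming the sample installation serves it; the field names (`text`, `pos`, `translations`) follow the `Autocompletions` model in `linguee_api/models.py`, and the query prefix is an arbitrary example:

```python
import requests

api_root = "https://linguee-api.fly.dev/api/v2"

# Ask for autocompletion suggestions for a query prefix.
resp = requests.get(
    f"{api_root}/autocompletions",
    params={"query": "obrig", "src": "pt", "dst": "en"},
)
resp.raise_for_status()

# Each item carries "text", an optional "pos", and a list of "translations".
for item in resp.json():
    translations = ", ".join(t["text"] for t in item["translations"])
    print(f"{item['text']} -> {translations}")
```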
23 | 24 | ## Local installation 25 | 26 | Install the Linguee API. 27 | 28 | ```shell 29 | $ pip install linguee-api 30 | ``` 31 | 32 | Run the API server with `uvicorn` (installed as a dependency). 33 | 34 | ```shell 35 | $ uvicorn linguee_api.api:app 36 | ... 37 | INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) 38 | ... 39 | ``` 40 | 41 | Open http://127.0.0.1:8000. You will be redirected to the API documentation page, where you can test the API. 42 | 43 | ## Supported languages 44 | 45 | The API supports all the languages supported by Linguee. As in Linguee, not all language pairs are valid, though. Supported languages: 46 | `bg` (Bulgarian), `cs` (Czech), `da` (Danish), `de` (German), `el` (Greek), `en` (English), `es` (Spanish), `et` (Estonian), `fi` (Finnish), `fr` (French), `hu` (Hungarian), `it` (Italian), `ja` (Japanese), `lt` (Lithuanian), `lv` (Latvian), `mt` (Maltese), `nl` (Dutch), `pl` (Polish), `pt` (Portuguese), `ro` (Romanian), `ru` (Russian), `sk` (Slovak), `sl` (Slovene), `sv` (Swedish), `zh` (Chinese). 47 | 48 | ## Response structure 49 | 50 | **Lemmas** 51 | 52 | Every query (a random string) can match several so-called lemma objects. 53 | 54 | According to Wikipedia, a [lemma](https://en.wikipedia.org/wiki/Lemma_(morphology)) is the canonical form, dictionary form, or citation form of a set of words. 55 | 56 | In English, for example, break, breaks, broke, broken, and breaking are forms of the same lexeme, with "break" as the lemma by which they are indexed. 57 | 58 | In the API, lemmas have only one required attribute, "text," but may have optional elements, such as part of speech ("pos") and audio links with pronunciations. 59 | 60 | 61 | **Translations** 62 | 63 | Every lemma has one or more translations. The translation is a lemma in a different language and has a similar structure, with the required text field and optional part of speech and audio links. 64 | 65 | 66 | **Examples** 67 | 68 | In addition to lemmas, the API returns several usage examples curated by dictionary authors. Examples are short phrases annotated with one or more equivalents in different languages. When appropriate, examples may contain the part-of-speech form and audio links. 69 | 70 | **External Sources** 71 | 72 | On top of curated examples, Linguee provides links to external sources. The API returns objects containing the phrase snippet in the original language and an equivalent snippet in the translation. 73 | 74 | ## Usage examples with Python and requests 75 | 76 | Once installed, the Linguee API can be used like any other API service. I recommend using the [requests](https://docs.python-requests.org/) library. 77 | 78 | ### Translate a word or a phrase from one language to another with Python 79 | 80 | A request to the sample API installation to translate the word "bacalhau" from Portuguese to English. 81 | 82 | ```python 83 | import requests 84 | 85 | api_root = "https://linguee-api.fly.dev/api/v2" 86 | resp = requests.get(f"{api_root}/translations", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 87 | for lemma in resp.json(): 88 | for translation in lemma['translations']: 89 | print(f"{lemma['text']} -> {translation['text']}") 90 | ``` 91 | 92 | This will print: 93 | 94 | ``` 95 | bacalhau -> cod 96 | bacalhau -> codfish 97 | ``` 98 | 99 | ### Provide translation examples with Python 100 | 101 | A request to the sample API installation to get all usage examples of "bacalhau" along with their translations. 
102 | 103 | ```python 104 | import requests 105 | 106 | api_root = "https://linguee-api.fly.dev/api/v2" 107 | 108 | resp = requests.get(f"{api_root}/examples", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 109 | 110 | for example in resp.json(): 111 | for translation in example["translations"]: 112 | print(f"{example['text']} -> {translation['text']}") 113 | ``` 114 | 115 | This will print: 116 | 117 | ``` 118 | bacalhau desfiado -> shredded cod 119 | lombo de bacalhau -> codfish fillet 120 | ... 121 | bacalhau do Atlântico -> Atlantic cod 122 | ``` 123 | 124 | ### Get access to real-world usage examples with Python 125 | 126 | A request to the sample API installation to get all real-world usage examples of "bacalhau" along with their translations. 127 | 128 | ```python 129 | import requests 130 | 131 | api_root = "https://linguee-api.fly.dev/api/v2" 132 | 133 | resp = requests.get(f"{api_root}/external_sources", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 134 | for source in resp.json(): 135 | print(f"{source['src']} -> {source['dst']}") 136 | ``` 137 | 138 | This will print a long list of real-world examples like this: 139 | 140 | ``` 141 | É calculado o esforço de [...] pesca de todos os navios que capturam bacalhau. -> The fishing effort of all [...] the vessels catching cod will be calculated. 142 | ``` 143 | 144 | 145 | ## Bash, curl and jq usage example 146 | 147 | Once installed, the Linguee API can be used like any other API service. 148 | 149 | For Bash scripts, you can use curl and [jq](https://stedolan.github.io/jq/), a command-line JSON parser. 150 | 151 | ### Translate a word or a phrase from one language to another with Bash 152 | 153 | A request to the sample API installation to translate the word "bacalhau" from Portuguese to English. 154 | 155 | ```bash 156 | curl -s 'https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en' | jq -c '{text: .[].text, translation: .[].translations[].text}' 157 | ``` 158 | 159 | This will print: 160 | 161 | ```json lines 162 | {"text":"bacalhau","translation":"cod"} 163 | {"text":"bacalhau","translation":"codfish"} 164 | ``` 165 | 166 | ### Provide translation examples with Bash 167 | 168 | A request to the sample API installation to get all usage examples of "bacalhau" along with their translations. 169 | 170 | ```shell 171 | curl -s 'https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en' | jq -c '{text: .[].text, translation: .[].translations[].text}' 172 | ``` 173 | 174 | This will print something like this: 175 | 176 | ```json lines 177 | {"text":"bacalhau desfiado","translation":"shredded cod"} 178 | {"text":"bacalhau desfiado","translation":"codfish fillet"} 179 | ... 180 | {"text":"bacalhau do Atlântico","translation":"Atlantic cod"} 181 | ``` 182 | 183 | ### Get access to real-world usage examples with Bash 184 | 185 | A request to the sample API installation to get all real-world usage examples of "bacalhau" along with their translations. 186 | 187 | ```shell 188 | curl -s 'https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en' | jq -c '{src: .[].src, dst: .[].dst}' 189 | ``` 190 | 191 | This will print a long list of real-world examples like this: 192 | 193 | ```json lines 194 | {"src":"É calculado o esforço de [...] pesca de todos os navios que capturam bacalhau.","dst":"The fishing effort of all [...] the vessels catching cod will be calculated."} 195 | ... 
196 | ``` 197 | 198 | ## FAQ 199 | 200 | ### The API server returns "The Linguee server returned 503" 201 | 202 | This error means that the Linguee website temporarily blocks the API client for sending too many requests. If you use the sample API server on https://linguee-api.fly.dev, you can try to send the request later or consider installing your API server, where you won't share the same IP address with other users. 203 | 204 | ## Terms and Conditions 205 | 206 | If you use the API, make sure you comply with 207 | [Linguee Terms and Conditions](http://www.linguee.com/page/termsAndConditions.php), 208 | and in particular with that clause: 209 | 210 | > Both private and business usage of linguee.com services is free of charge. 211 | > It is however strictly prohibited to forward on our services to third 212 | > parties against payment 213 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Linguee API", 3 | "description": "Linguee proxy to convert HTML responses from linguee.com to JSON format", 4 | "website": "https://github.com/imankulov/linguee-api", 5 | "keywords": ["api", "translation"], 6 | "buildpacks": [ 7 | { 8 | "url": "https://github.com/moneymeets/python-poetry-buildpack.git" 9 | }, 10 | { 11 | "url": "heroku/python" 12 | } 13 | ], 14 | "env": { 15 | "DISABLE_POETRY_CREATE_RUNTIME_FILE": { 16 | "description": "Disable the creation of the runtime file by Poetry buildpack", 17 | "value": "1" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # This is a sample docker-compose.yml file to run the Linguee API server. It exposes the API on port 8000 and 2 | # stores the cache between runs in a volume "linguee-cache". 3 | # 4 | # If you need to reset the cache, you can run: 5 | # docker-compose down -v 6 | version: "3.3" 7 | 8 | services: 9 | linguee: 10 | build: . 11 | ports: 12 | - "127.0.0.1:8000:8000" 13 | volumes: 14 | - linguee-cache:/cache 15 | 16 | volumes: 17 | linguee-cache: {} 18 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development installation 2 | 3 | Quick notes to myself how to install the project and run it locally. 4 | 5 | ## How to install 6 | 7 | ```bash 8 | poetry install 9 | cp env.example .env 10 | ``` 11 | 12 | ## How to run tests 13 | 14 | You can run tests offline or online. Offline tests fail when you try to download a new translation that is not in the cache. Set the configuration option in the `.env` file. 
15 | 16 | ```dotenv 17 | PYTEST_OFFLINE=false 18 | ``` 19 | 20 | Then run the tests: 21 | 22 | ```bash 23 | poetry run pytest 24 | ``` 25 | 26 | ## How to run the API server 27 | 28 | ```bash 29 | poetry run uvicorn linguee_api.api:app 30 | ``` 31 | 32 | ## How to make a new release 33 | 34 | ```bash 35 | bump2version minor 36 | git push 37 | git push --tags 38 | poetry build 39 | poetry publish 40 | ``` 41 | -------------------------------------------------------------------------------- /docs/linguee-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/docs/linguee-api.png -------------------------------------------------------------------------------- /env.example: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Sentry settings 3 | # --------------------------------------------------------- 4 | # When settings are not defined, Sentry is disabled. 5 | # SENTRY_DSN= 6 | # SENTRY_ENVIRONMENT=development 7 | 8 | 9 | # --------------------------------------------------------- 10 | # File cache settings 11 | # --------------------------------------------------------- 12 | # When settings are not defined, .cache directory in the project 13 | # root is used. 14 | # CACHE_DIRECTORY=/tmp/.cache 15 | 16 | 17 | # ---------------------------------------------------- 18 | # Pytest settings 19 | # ---------------------------------------------------- 20 | # Run tests offline 21 | PYTEST_OFFLINE=false 22 | -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml file generated for linguee-api 2 | app = "linguee-api" 3 | kill_signal = "SIGINT" 4 | kill_timeout = 5 5 | processes = [] 6 | 7 | [env] 8 | SENTRY_ENVIRONMENT = "production" 9 | 10 | [experimental] 11 | allowed_public_ports = [] 12 | auto_rollback = true 13 | 14 | [[services]] 15 | http_checks = [] 16 | internal_port = 8000 17 | processes = ["app"] 18 | protocol = "tcp" 19 | script_checks = [] 20 | 21 | [services.concurrency] 22 | hard_limit = 25 23 | soft_limit = 20 24 | type = "connections" 25 | 26 | [[services.ports]] 27 | force_https = true 28 | handlers = ["http"] 29 | port = 80 30 | 31 | [[services.ports]] 32 | handlers = ["tls", "http"] 33 | port = 443 34 | 35 | [[services.tcp_checks]] 36 | grace_period = "1s" 37 | interval = "15s" 38 | restart_limit = 0 39 | timeout = "2s" 40 | 41 | [mounts] 42 | source="linguee_cache" 43 | destination="/cache" 44 | -------------------------------------------------------------------------------- /linguee_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/linguee_api/__init__.py -------------------------------------------------------------------------------- /linguee_api/api.py: -------------------------------------------------------------------------------- 1 | import sentry_sdk 2 | from fastapi import FastAPI, Query, Response, status 3 | from sentry_sdk.integrations.asgi import SentryAsgiMiddleware 4 | from starlette.responses import RedirectResponse 5 | 6 | from linguee_api.config import settings 7 | from linguee_api.const import ( 8 | FOLLOW_CORRECTIONS_DESCRIPTION, 9 | LANGUAGE_CODE, 10 | 
PROJECT_DESCRIPTION, 11 | ) 12 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 13 | from linguee_api.downloaders.memory_cache import MemoryCache 14 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 15 | from linguee_api.linguee_client import LingueeClient 16 | from linguee_api.models import ( 17 | Autocompletions, 18 | FollowCorrections, 19 | ParseError, 20 | SearchResult, 21 | ) 22 | from linguee_api.parsers import XExtractParser 23 | 24 | sentry_sdk.init(dsn=settings.sentry_dsn, environment=settings.sentry_environment) 25 | app = FastAPI( 26 | title="Linguee API", 27 | description=PROJECT_DESCRIPTION, 28 | version="2.0.0", 29 | ) 30 | app.add_middleware(SentryAsgiMiddleware) 31 | 32 | page_downloader = MemoryCache( 33 | upstream=SQLiteCache( 34 | cache_database=settings.cache_database, 35 | upstream=HTTPXDownloader(), 36 | ) 37 | ) 38 | client = LingueeClient(page_downloader=page_downloader, page_parser=XExtractParser()) 39 | 40 | 41 | @app.get("/", include_in_schema=False) 42 | def index(): 43 | return RedirectResponse("/docs") 44 | 45 | 46 | @app.get( 47 | "/api/v2/translations", 48 | status_code=status.HTTP_200_OK, 49 | responses={ 50 | status.HTTP_200_OK: {"model": list[SearchResult.Lemma]}, 51 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 52 | }, 53 | ) 54 | async def translations( 55 | query: str, 56 | src: LANGUAGE_CODE, 57 | dst: LANGUAGE_CODE, 58 | response: Response, 59 | guess_direction: bool = False, 60 | follow_corrections: FollowCorrections = Query( 61 | default=FollowCorrections.ALWAYS, 62 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 63 | ), 64 | ): 65 | """ 66 | Translate the query between src and dst language. 67 | 68 | The response contains the list of lemma objects matching the query in the source 69 | language. Each of these lemmas is annotated with one or multiple translations 70 | and optional examples. 
71 | """ 72 | result = await client.process_search_result( 73 | query=query, 74 | src=src, 75 | dst=dst, 76 | guess_direction=guess_direction, 77 | follow_corrections=follow_corrections, 78 | ) 79 | if isinstance(result, ParseError): 80 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 81 | return result 82 | return result.lemmas 83 | 84 | 85 | @app.get( 86 | "/api/v2/examples", 87 | status_code=status.HTTP_200_OK, 88 | responses={ 89 | status.HTTP_200_OK: {"model": list[SearchResult.Example]}, 90 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 91 | }, 92 | ) 93 | async def examples( 94 | query: str, 95 | src: LANGUAGE_CODE, 96 | dst: LANGUAGE_CODE, 97 | response: Response, 98 | guess_direction: bool = False, 99 | follow_corrections: FollowCorrections = Query( 100 | default=FollowCorrections.ALWAYS, 101 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 102 | ), 103 | ): 104 | """Provide translation examples.""" 105 | result = await client.process_search_result( 106 | query=query, 107 | src=src, 108 | dst=dst, 109 | guess_direction=guess_direction, 110 | follow_corrections=follow_corrections, 111 | ) 112 | if isinstance(result, ParseError): 113 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 114 | return result 115 | return result.examples 116 | 117 | 118 | @app.get( 119 | "/api/v2/external_sources", 120 | status_code=status.HTTP_200_OK, 121 | responses={ 122 | status.HTTP_200_OK: {"model": list[SearchResult.ExternalSource]}, 123 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 124 | }, 125 | ) 126 | async def external_sources( 127 | query: str, 128 | src: LANGUAGE_CODE, 129 | dst: LANGUAGE_CODE, 130 | response: Response, 131 | guess_direction: bool = False, 132 | follow_corrections: FollowCorrections = Query( 133 | default=FollowCorrections.ALWAYS, 134 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 135 | ), 136 | ): 137 | """Provide translation examples from external (unverified) sources.""" 138 | result = await client.process_search_result( 139 | query=query, 140 | src=src, 141 | dst=dst, 142 | guess_direction=guess_direction, 143 | follow_corrections=follow_corrections, 144 | ) 145 | if isinstance(result, ParseError): 146 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 147 | return result 148 | return result.external_sources 149 | 150 | 151 | @app.get( 152 | "/api/v2/autocompletions", 153 | status_code=status.HTTP_200_OK, 154 | responses={ 155 | status.HTTP_200_OK: {"model": list[Autocompletions.AutocompletionItem]}, 156 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 157 | }, 158 | ) 159 | async def autocompletions( 160 | query: str, 161 | src: LANGUAGE_CODE, 162 | dst: LANGUAGE_CODE, 163 | response: Response, 164 | ): 165 | """Provide autocompletion suggestions for the query.""" 166 | result = await client.process_autocompletions( 167 | query=query, 168 | src_lang_code=src, 169 | dst_lang_code=dst, 170 | ) 171 | if isinstance(result, ParseError): 172 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 173 | return result 174 | return result.autocompletions 175 | -------------------------------------------------------------------------------- /linguee_api/config.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | from pydantic import BaseSettings 5 | 6 | from linguee_api.const import PROJECT_ROOT 7 | 8 | 9 | class Settings(BaseSettings): 10 | """Application settings.""" 11 | 12 | # 
Sentry settings 13 | sentry_dsn: Optional[str] = None 14 | sentry_environment: str = "development" 15 | 16 | # File and SQLite cache settings 17 | cache_directory: pathlib.Path = PROJECT_ROOT / ".cache" 18 | 19 | @property 20 | def cache_database(self) -> pathlib.Path: 21 | """Cache database.""" 22 | return self.cache_directory / "cache.sqlite3" 23 | 24 | class Config: 25 | env_file = (PROJECT_ROOT / ".env").as_posix() 26 | 27 | 28 | settings = Settings() 29 | -------------------------------------------------------------------------------- /linguee_api/const.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Literal 3 | 4 | PROJECT_ROOT = pathlib.Path(__file__).parents[1] 5 | USER_AGENT = "Linguee API proxy (https://github.com/imankulov/linguee-api)" 6 | LANGUAGE_CODE = Literal[ 7 | "bg", 8 | "cs", 9 | "da", 10 | "de", 11 | "el", 12 | "en", 13 | "es", 14 | "et", 15 | "fi", 16 | "fr", 17 | "hu", 18 | "it", 19 | "ja", 20 | "lt", 21 | "lv", 22 | "mt", 23 | "nl", 24 | "pl", 25 | "pt", 26 | "ro", 27 | "ru", 28 | "sk", 29 | "sl", 30 | "sv", 31 | "zh", 32 | ] 33 | LANGUAGES = { 34 | "bg": "bulgarian", 35 | "cs": "czech", 36 | "da": "danish", 37 | "de": "german", 38 | "el": "greek", 39 | "en": "english", 40 | "es": "spanish", 41 | "et": "estonian", 42 | "fi": "finnish", 43 | "fr": "french", 44 | "hu": "hungarian", 45 | "it": "italian", 46 | "ja": "japanese", 47 | "lt": "lithuanian", 48 | "lv": "latvian", 49 | "mt": "maltese", 50 | "nl": "dutch", 51 | "pl": "polish", 52 | "pt": "portuguese", 53 | "ro": "romanian", 54 | "ru": "russian", 55 | "sk": "slovak", 56 | "sl": "slovene", 57 | "sv": "swedish", 58 | "zh": "chinese", 59 | } 60 | MAX_REDIRECTS = 5 61 | PROJECT_DESCRIPTION = """ 62 |

 63 | <a href="https://linguee.com">Linguee</a> provides an excellent 64 | dictionary and translation memory service. Unfortunately, there is no way you 65 | can get automated access to it. Linguee API fixes the problem. It acts as a 66 | proxy and converts their HTML responses to an easy-to-use JSON API. 

68 |

 69 | This installation is an example. If you want to have a reliable service, install 70 | it yourself. The source code and installation instructions are available at 71 | <a href="https://github.com/imankulov/linguee-api"> 72 | github.com/imankulov/linguee-api</a>. 73 | 

74 |

 75 | For any questions, ideas, or bug reports, file 76 | <a href="https://github.com/imankulov/linguee-api/issues"> 77 | an issue on GitHub</a>. 78 | 

79 | """ 80 | 81 | FOLLOW_CORRECTIONS_DESCRIPTION = """A flag that defines how to treat responses with a 82 | 'did you mean' link. There are three possible values: 83 | 84 | - `always` (default): always follow the suggestion if found on a page, even if the page 85 | itself has translations. 86 | - `never`: never follow the suggested correction. 87 | - `on_empty_translations`: only follow the link if there are no translations on 88 | the page. 89 | """ 90 | -------------------------------------------------------------------------------- /linguee_api/downloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/linguee_api/downloaders/__init__.py -------------------------------------------------------------------------------- /linguee_api/downloaders/error_downloader.py: -------------------------------------------------------------------------------- 1 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 2 | 3 | 4 | class ErrorDownloader(IDownloader): 5 | """ 6 | A downloader that always raises an DownloaderError. 7 | 8 | Helpful to use as the upstream downloader for cache in tests to make sure 9 | that we don't send requests to the server. 10 | """ 11 | 12 | async def download(self, url: str) -> str: 13 | raise DownloaderError(f"I cannot download {url}") 14 | -------------------------------------------------------------------------------- /linguee_api/downloaders/file_cache.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import urllib.parse 3 | from typing import Optional 4 | 5 | from linguee_api.downloaders.interfaces import ICache, IDownloader 6 | 7 | 8 | class FileCache(ICache): 9 | """File Cache.""" 10 | 11 | def __init__(self, cache_directory: pathlib.Path, upstream: IDownloader): 12 | self.cache_directory = cache_directory 13 | self.upstream = upstream 14 | self.cache_directory.mkdir(parents=True, exist_ok=True) 15 | 16 | async def get_from_cache(self, url: str) -> Optional[str]: 17 | page_file = self._get_page_file(url) 18 | if page_file.is_file(): 19 | return page_file.read_text(encoding="utf-8") 20 | return None 21 | 22 | async def put_to_cache(self, url: str, page: str) -> None: 23 | page_file = self._get_page_file(url) 24 | page_file.write_text(page, encoding="utf-8") 25 | 26 | def _get_page_file(self, url: str) -> pathlib.Path: 27 | return self.cache_directory / urllib.parse.quote(url, safe="") 28 | -------------------------------------------------------------------------------- /linguee_api/downloaders/httpx_downloader.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | 3 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 4 | 5 | ERROR_503 = ( 6 | "The Linguee server returned 503. The API proxy was temporarily blocked by " 7 | "Linguee. For more details, see https://github.com/imankulov/linguee-api#" 8 | "the-api-server-returns-the-linguee-server-returned-503" 9 | ) 10 | 11 | 12 | class HTTPXDownloader(IDownloader): 13 | """ 14 | Real downloader. 15 | 16 | Sends request to linguee.com to read the page. 
17 | """ 18 | 19 | async def download(self, url: str) -> str: 20 | async with httpx.AsyncClient() as client: 21 | try: 22 | response = await client.get(url) 23 | except httpx.ConnectError as e: 24 | raise DownloaderError(str(e)) from e 25 | 26 | if response.status_code == 503: 27 | raise DownloaderError(ERROR_503) 28 | 29 | if response.status_code != 200: 30 | raise DownloaderError( 31 | f"The Linguee server returned {response.status_code}" 32 | ) 33 | return response.text 34 | -------------------------------------------------------------------------------- /linguee_api/downloaders/interfaces.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Optional 3 | 4 | 5 | class DownloaderError(Exception): 6 | pass 7 | 8 | 9 | class IDownloader(abc.ABC): 10 | @abc.abstractmethod 11 | async def download(self, url: str) -> str: 12 | """Download a page or raise an exception""" 13 | ... 14 | 15 | 16 | class ICache(IDownloader, abc.ABC): 17 | 18 | upstream: IDownloader 19 | 20 | @abc.abstractmethod 21 | async def get_from_cache(self, url: str) -> Optional[str]: 22 | """Return a page from the cache.""" 23 | ... 24 | 25 | @abc.abstractmethod 26 | async def put_to_cache(self, url: str, page: str) -> None: 27 | """Put a page to the cache.""" 28 | ... 29 | 30 | async def download(self, url: str) -> str: 31 | page = await self.get_from_cache(url) 32 | if page is None: 33 | page = await self.upstream.download(url) 34 | await self.put_to_cache(url, page) 35 | return page 36 | -------------------------------------------------------------------------------- /linguee_api/downloaders/memory_cache.py: -------------------------------------------------------------------------------- 1 | from async_lru import alru_cache 2 | 3 | from linguee_api.downloaders.interfaces import IDownloader 4 | 5 | 6 | class MemoryCache(IDownloader): 7 | """Memory cache. 8 | 9 | Exposes the downloader interface, but requires the upstream to work and 10 | keeps records in memory. 11 | """ 12 | 13 | def __init__(self, upstream: IDownloader, maxsize: int = 1000): 14 | self.upstream = upstream 15 | self.download = alru_cache(maxsize=maxsize)(self.download) # type: ignore 16 | 17 | async def download(self, url: str) -> str: 18 | return await self.upstream.download(url) 19 | -------------------------------------------------------------------------------- /linguee_api/downloaders/mock_downloader.py: -------------------------------------------------------------------------------- 1 | from linguee_api.downloaders.interfaces import IDownloader 2 | 3 | MESSAGE = "Hello world!" 4 | 5 | 6 | class MockDownloader(IDownloader): 7 | """ 8 | A downloader that always return "Hello world!". 9 | 10 | Helpful to test the cache layer. 
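    An illustrative usage sketch for exercising a cache layer with this class
    (FileCache comes from this package; tmp_path is any pathlib.Path you control,
    e.g. a pytest tmp_path fixture):

        cache = FileCache(cache_directory=tmp_path, upstream=MockDownloader())
        page = await cache.download("https://example.com")  # -> "Hello world!"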
11 | """ 12 | 13 | def __init__(self, message: str = MESSAGE): 14 | self.message = message 15 | 16 | async def download(self, url: str) -> str: 17 | return self.message 18 | -------------------------------------------------------------------------------- /linguee_api/downloaders/sqlite_cache.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | import aiosqlite 5 | 6 | from linguee_api.downloaders.interfaces import ICache, IDownloader 7 | 8 | 9 | class SQLiteCache(ICache): 10 | """SQLite Cache.""" 11 | 12 | def __init__(self, cache_database: pathlib.Path, upstream: IDownloader): 13 | self.cache_database = cache_database 14 | self.upstream = upstream 15 | 16 | async def get_from_cache(self, url: str) -> Optional[str]: 17 | await self._ensure_database_initialized() 18 | async with aiosqlite.connect(self.cache_database) as db: 19 | async with db.execute( 20 | "SELECT page FROM cache WHERE url = ?", [url] 21 | ) as cursor: 22 | row = await cursor.fetchone() 23 | if row is None: 24 | return None 25 | return row[0] 26 | 27 | async def put_to_cache(self, url: str, page: str) -> None: 28 | async with aiosqlite.connect(self.cache_database) as db: 29 | await db.execute("INSERT INTO cache (url, page) VALUES (?, ?)", [url, page]) 30 | await db.commit() 31 | 32 | async def _ensure_database_initialized(self): 33 | if self.cache_database.is_file(): 34 | return 35 | self.cache_database.parent.mkdir(parents=True, exist_ok=True) 36 | async with aiosqlite.connect(self.cache_database) as db: 37 | await db.execute( 38 | """CREATE TABLE IF NOT EXISTS cache ( 39 | url TEXT PRIMARY KEY, 40 | page TEXT, 41 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 42 | )""" 43 | ) 44 | await db.commit() 45 | -------------------------------------------------------------------------------- /linguee_api/linguee_client.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from urllib.parse import urlencode 3 | 4 | from loguru import logger 5 | 6 | from linguee_api.const import LANGUAGE_CODE, LANGUAGES, MAX_REDIRECTS 7 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 8 | from linguee_api.models import ( 9 | Autocompletions, 10 | AutocompletionsOrError, 11 | Correction, 12 | FollowCorrections, 13 | NotFound, 14 | ParseError, 15 | SearchResult, 16 | ) 17 | from linguee_api.parsers import IParser 18 | 19 | 20 | class LingueeClient: 21 | """Linguee client. 
The core class of the application.""" 22 | 23 | def __init__( 24 | self, 25 | *, 26 | page_downloader: IDownloader, 27 | page_parser: IParser, 28 | max_redirects=MAX_REDIRECTS, 29 | ): 30 | self.page_downloader = page_downloader 31 | self.page_parser = page_parser 32 | self.max_redirects = max_redirects 33 | 34 | async def process_search_result( 35 | self, 36 | *, 37 | query: str, 38 | src: LANGUAGE_CODE, 39 | dst: LANGUAGE_CODE, 40 | guess_direction: bool, 41 | follow_corrections: FollowCorrections, 42 | ) -> Union[SearchResult, ParseError]: 43 | logger.info( 44 | f"Processing API request: {query=}, {src=}, {dst=}, " 45 | f"{guess_direction=}, {follow_corrections=}" 46 | ) 47 | url = get_search_url( 48 | query=query, 49 | src=src, 50 | dst=dst, 51 | guess_direction=guess_direction, 52 | ) 53 | 54 | for i in range(self.max_redirects): 55 | try: 56 | page_html = await self.page_downloader.download(url) 57 | except DownloaderError as error: 58 | logger.error(f"Error downloading URL: {error=}, {url=}") 59 | return ParseError(message=str(error)) 60 | 61 | parse_result = self.page_parser.parse_search_result( 62 | page_html, follow_corrections=follow_corrections 63 | ) 64 | if isinstance(parse_result, ParseError): 65 | logger.info(f"Parser returned parse error: {parse_result=}") 66 | return parse_result 67 | elif isinstance(parse_result, Correction): 68 | logger.info(f"Parser returned correction: {parse_result=}") 69 | url = get_search_url( 70 | query=parse_result.correction, 71 | src=src, 72 | dst=dst, 73 | guess_direction=guess_direction, 74 | ) 75 | elif isinstance(parse_result, SearchResult): 76 | logger.info( 77 | f"Parser returned search result: " 78 | f"{parse_result.query=}, " 79 | f"{len(parse_result.lemmas)=}, " 80 | f"{len(parse_result.examples)=}, " 81 | f"{len(parse_result.external_sources)=}" 82 | ) 83 | return parse_result 84 | elif isinstance(parse_result, NotFound): 85 | logger.info("Parser returned not found") 86 | return ParseError(message="Translation not found") 87 | else: 88 | logger.error(f"Unexpected API result: {parse_result=}") 89 | raise RuntimeError(f"Unexpected API result: {parse_result}") 90 | 91 | still_redirecting = f"Still redirecting after {self.max_redirects} redirects" 92 | logger.error(still_redirecting) 93 | return ParseError(message=still_redirecting) 94 | 95 | async def process_autocompletions( 96 | self, 97 | *, 98 | query: str, 99 | src_lang_code: LANGUAGE_CODE, 100 | dst_lang_code: LANGUAGE_CODE, 101 | ) -> AutocompletionsOrError: 102 | url = get_autocompletions_url( 103 | query=query, 104 | src=src_lang_code, 105 | dst=dst_lang_code, 106 | ) 107 | try: 108 | page_html = await self.page_downloader.download(url) 109 | except DownloaderError as error: 110 | return ParseError(message=str(error)) 111 | 112 | parse_result = self.page_parser.parse_autocompletions(page_html) 113 | if isinstance(parse_result, ParseError): 114 | return parse_result 115 | elif isinstance(parse_result, Autocompletions): 116 | return parse_result 117 | 118 | raise RuntimeError(f"Unexpected API result: {parse_result}") 119 | 120 | 121 | def get_search_url( 122 | *, 123 | query: str, 124 | src: LANGUAGE_CODE, 125 | dst: LANGUAGE_CODE, 126 | guess_direction: bool, 127 | ): 128 | """ 129 | Return a Linguee URL. 
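    For example (illustrative, derived from LANGUAGES and the query parameters
    built below), a Portuguese-to-English search for "bacalhau" with
    guess_direction=False yields a URL like:
    https://www.linguee.com/portuguese-english/search?query=bacalhau&ajax=1&source=PT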
130 | """ 131 | src_lang_name = LANGUAGES[src] 132 | dst_lang_name = LANGUAGES[dst] 133 | url = f"https://www.linguee.com/{src_lang_name}-{dst_lang_name}/search" 134 | query_params = { 135 | "query": query, 136 | "ajax": "1", 137 | } 138 | if not guess_direction: 139 | query_params["source"] = src.upper() 140 | return f"{url}?{urlencode(query_params)}" 141 | 142 | 143 | def get_autocompletions_url( 144 | *, 145 | query: str, 146 | src: LANGUAGE_CODE, 147 | dst: LANGUAGE_CODE, 148 | ): 149 | """Return a URL for auto-completions.""" 150 | src_lang_name = LANGUAGES[src] 151 | dst_lang_name = LANGUAGES[dst] 152 | url = f"https://www.linguee.com/{src_lang_name}-{dst_lang_name}/search" 153 | query_params = { 154 | "qe": query, 155 | } 156 | return f"{url}?{urlencode(query_params)}" 157 | -------------------------------------------------------------------------------- /linguee_api/models.py: -------------------------------------------------------------------------------- 1 | """Data classes that define the schema of the API response.""" 2 | from enum import Enum 3 | from typing import List, Optional, Union 4 | 5 | from pydantic import BaseModel, Field, validator 6 | 7 | from linguee_api.parser_utils import remove_round_brackets_and_split_by_commas 8 | 9 | 10 | class FollowCorrections(Enum): 11 | ALWAYS = "always" 12 | NEVER = "never" 13 | ON_EMPTY_TRANSLATIONS = "on_empty_translations" 14 | 15 | 16 | class AudioLink(BaseModel): 17 | """The link to the audio file along with the language variant.""" 18 | 19 | url: str = Field( 20 | example=( 21 | "https://www.linguee.com/mp3/PT_BR/f5/" 22 | "f5491d72610965dd0a287c1ab1025c0f-300.mp3" 23 | ) 24 | ) 25 | lang: str = Field(example="Brazilian Portuguese") 26 | 27 | 28 | class UsageFrequency(Enum): 29 | """Translation usage frequency. Valid values: `often` or `almost_always`.""" 30 | 31 | OFTEN = "often" 32 | ALMOST_ALWAYS = "almost_always" 33 | 34 | 35 | class SearchResult(BaseModel): 36 | """The root structure of parsed API response.""" 37 | 38 | class Lemma(BaseModel): 39 | """Information about one found word (lemma).""" 40 | 41 | class Translation(BaseModel): 42 | """One of the possible translation of the term.""" 43 | 44 | class TranslationExample(BaseModel): 45 | """A translation example.""" 46 | 47 | src: str = Field( 48 | example=( 49 | "Estou obrigado pelo contrato a " 50 | "trabalhar seis horas por dia." 51 | ) 52 | ) 53 | dst: str = Field( 54 | example="I am bound by the contract to work six hours a day." 
55 | ) 56 | 57 | featured: bool = Field(example=False) 58 | text: str = Field(example="required") 59 | pos: Optional[str] = Field(example="adjective / past participle, masculine") 60 | audio_links: Optional[List[AudioLink]] 61 | examples: Optional[List[TranslationExample]] 62 | usage_frequency: Optional[UsageFrequency] = Field( 63 | example=UsageFrequency.OFTEN 64 | ) 65 | 66 | featured: bool = Field(example=False) 67 | text: str = Field(example="obrigado") 68 | pos: Optional[str] = Field(example="interjection") 69 | forms: List[str] = Field( 70 | example=["obrigada f sl", "obrigados m pl", "obrigadas f pl"] 71 | ) 72 | grammar_info: Optional[str] = Field(example="Akk") 73 | audio_links: Optional[List[AudioLink]] 74 | translations: List[Translation] 75 | 76 | @validator("forms", pre=True, always=True) 77 | def _validate_forms(cls, v): 78 | return remove_round_brackets_and_split_by_commas(v) 79 | 80 | class Example(BaseModel): 81 | """One example.""" 82 | 83 | class Translation(BaseModel): 84 | """Translation example.""" 85 | 86 | text: str = Field(example="big thanks") 87 | pos: Optional[str] = Field(example="n [colloq.]") 88 | 89 | text: str = Field(example="muito obrigado") 90 | pos: Optional[str] = Field(example="m") 91 | audio_links: Optional[List[AudioLink]] 92 | translations: List[Translation] 93 | 94 | class ExternalSource(BaseModel): 95 | """An example of usage of the word in the context.""" 96 | 97 | src: str = Field( 98 | example=( 99 | "Parabéns e um grande obrigado a todos que ajudaram [...] " 100 | "ao sucesso desta noite!" 101 | ) 102 | ) 103 | dst: str = Field( 104 | example=( 105 | "Well done and many thanks to everyone who helped [...] " 106 | "make this evening a success!" 107 | ) 108 | ) 109 | src_url: str = Field( 110 | example="http://www.findmadeleine.com/pt/updates@page=2.html" 111 | ) 112 | dst_url: str = Field(example="http://www.findmadeleine.com/updates@page=2.html") 113 | 114 | src_lang: str = Field(example="pt") 115 | dst_lang: str = Field(example="en") 116 | query: str = Field(example="obrigado") 117 | correct_query: str = Field(example="obrigado") 118 | lemmas: List[Lemma] 119 | examples: List[Example] 120 | external_sources: List[ExternalSource] 121 | 122 | 123 | class Autocompletions(BaseModel): 124 | """The root structure of the API response for auto-completions.""" 125 | 126 | class AutocompletionItem(BaseModel): 127 | """Information about one word.""" 128 | 129 | class TranslationItem(BaseModel): 130 | text: str = Field(example="cat") 131 | pos: Optional[str] = Field(example="n") 132 | 133 | text: str = Field(example="Katze") 134 | pos: Optional[str] = Field(example="f") 135 | translations: List[TranslationItem] 136 | 137 | autocompletions: List[AutocompletionItem] 138 | 139 | 140 | class Correction(BaseModel): 141 | """ 142 | A redirect to the correct form. 143 | 144 | This response is returned by a parser, when a spelling issue is found, and 145 | a redirect to the correct form is needed. 146 | """ 147 | 148 | correction: str 149 | 150 | 151 | class NotFound(BaseModel): 152 | """ 153 | LemmaTranslation not found. 154 | 155 | The query is not recognized as a meaningful word. Nothing to translate. 156 | """ 157 | 158 | pass 159 | 160 | 161 | class ParseError(BaseModel): 162 | """Unexpected parsing error. 
Don't know what to do.""" 163 | 164 | message: str 165 | 166 | 167 | SearchResultOrError = Union[SearchResult, ParseError, Correction, NotFound] 168 | AutocompletionsOrError = Union[Autocompletions, ParseError] 169 | -------------------------------------------------------------------------------- /linguee_api/parser_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, List, Optional 3 | 4 | from xextract import Group 5 | from xextract.parsers import BaseNamedParser 6 | 7 | 8 | def concat_values(name: str, *children: BaseNamedParser): 9 | """ 10 | Concatenate values from children. 11 | 12 | Extract values from all the children, flatten and concatenate them as one string. 13 | """ 14 | return Group( 15 | name=name, 16 | children=children, 17 | quant="?", 18 | callback=_concat_values_callback, 19 | ) 20 | 21 | 22 | def _concat_values_callback(objects: Dict[str, Any]) -> str: 23 | ret = [] 24 | for value in objects.values(): 25 | if isinstance(value, list): 26 | ret.append(" ".join(str(item) for item in value)) 27 | else: 28 | ret.append(str(value)) 29 | return normalize(" ".join(ret)) 30 | 31 | 32 | def normalize(text: str) -> str: 33 | """ 34 | Replace all whitespaces in the text with a single space. 35 | 36 | For example " foo bar " is converted to "foo bar". 37 | """ 38 | return re.sub(r"\s+", " ", text).strip() 39 | 40 | 41 | def remove_round_brackets_and_split_by_commas(text: Optional[str]) -> List[str]: 42 | """Remove round brackets and split by commas.""" 43 | if not text: 44 | return [] 45 | 46 | stripped_text = text.strip().strip("()") 47 | if not stripped_text: 48 | return [] 49 | 50 | return [item.strip() for item in stripped_text.split(",") if item.strip()] 51 | 52 | 53 | def take_first_item(variants) -> Optional[str]: 54 | """Take the first item variant and normalize.""" 55 | if not variants["item"]: 56 | return None 57 | return variants["item"][0] 58 | 59 | 60 | def take_first_non_empty_item(variants) -> Optional[str]: 61 | """Take the first non-empty item variant and normalize.""" 62 | for item in variants["item"]: 63 | if item: 64 | return item 65 | return None 66 | -------------------------------------------------------------------------------- /linguee_api/parsers.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Dict, List, Optional 3 | 4 | from xextract import Group, String 5 | 6 | from linguee_api.models import ( 7 | Autocompletions, 8 | AutocompletionsOrError, 9 | Correction, 10 | FollowCorrections, 11 | NotFound, 12 | SearchResult, 13 | SearchResultOrError, 14 | UsageFrequency, 15 | ) 16 | from linguee_api.parser_utils import ( 17 | concat_values, 18 | normalize, 19 | take_first_item, 20 | take_first_non_empty_item, 21 | ) 22 | 23 | 24 | class IParser(abc.ABC): 25 | @abc.abstractmethod 26 | def parse_search_result( 27 | self, page_html: str, follow_corrections: FollowCorrections 28 | ) -> SearchResultOrError: 29 | ... 30 | 31 | @abc.abstractmethod 32 | def parse_autocompletions(self, page_html: str) -> AutocompletionsOrError: 33 | ... 34 | 35 | 36 | class XExtractParser(IParser): 37 | def parse_search_result( 38 | self, page_html: str, follow_corrections: FollowCorrections 39 | ) -> SearchResultOrError: 40 | # find correction, if asked. We'll use it on not found or empty response. 
41 | correction = None 42 | 43 | if follow_corrections in ( 44 | FollowCorrections.ALWAYS, 45 | FollowCorrections.ON_EMPTY_TRANSLATIONS, 46 | ): 47 | correction = self.find_correction(page_html) 48 | 49 | # check if the page is correction 50 | if correction and follow_corrections == FollowCorrections.ALWAYS: 51 | return Correction(correction=correction) 52 | 53 | # check if the page is a not found 54 | if self.is_not_found(page_html): 55 | if correction: 56 | return Correction(correction=correction) 57 | return NotFound() 58 | 59 | # assume it's a valid result 60 | result = self.parse_search_result_to_page(page_html) 61 | 62 | # Process ON_EMTPY case 63 | if correction and not result.lemmas: 64 | return Correction(correction=correction) 65 | 66 | return result 67 | 68 | def is_not_found(self, page_html: str) -> bool: 69 | """Return True if the page is a NOT FOUND page.""" 70 | return String(css="h1.noresults").parse(page_html) != [] 71 | 72 | def find_correction(self, page_html: str) -> Optional[str]: 73 | """Find the correction for a NOT FOUND page.""" 74 | corrections = String(css="span.corrected").parse(page_html) 75 | if corrections: 76 | return corrections[0] 77 | return None 78 | 79 | def parse_search_result_to_page(self, page_html: str) -> SearchResult: 80 | parsed_result = self.parse_search_result_to_dict(page_html) 81 | return SearchResult(**parsed_result) 82 | 83 | def parse_search_result_to_dict(self, page_html: str) -> dict: 84 | return search_result_schema.parse(page_html) 85 | 86 | def parse_autocompletions(self, page_html: str) -> AutocompletionsOrError: 87 | parsed_result = self.parse_autocompletions_to_dict(page_html) 88 | return Autocompletions(**parsed_result) 89 | 90 | def parse_autocompletions_to_dict(self, page_html: str) -> dict: 91 | return autocompletions_schema.parse(page_html) 92 | 93 | 94 | def is_featured(classname): 95 | return "featured" in classname 96 | 97 | 98 | def normalize_example(text): 99 | """ 100 | Normalize the text in the example. 101 | 102 | Same as normalize(), but remove the last two words, which are the links to the 103 | source website. 104 | """ 105 | text = normalize(text) 106 | text = " ".join(text.split()[:-2]) 107 | return text 108 | 109 | 110 | def parse_audio_links(text: Optional[str]) -> List[Dict[str, str]]: 111 | if not text: 112 | return [] 113 | 114 | chunks = [chunk.strip('");') for chunk in text.split(",")] 115 | if not chunks: 116 | return [] 117 | 118 | ret = [] 119 | for i in range(1, len(chunks), 2): 120 | url_part = chunks[i] 121 | lang = chunks[i + 1] 122 | url = f"https://www.linguee.com/mp3/{url_part}.mp3" 123 | ret.append({"url": url, "lang": lang}) 124 | return ret 125 | 126 | 127 | def parse_usage_frequency(text: Optional[str]) -> Optional[UsageFrequency]: 128 | if not text: 129 | return None 130 | chunks = set(text.strip().split()) 131 | if "usedveryoften" in chunks: 132 | return UsageFrequency.OFTEN 133 | if "usedalmostalways" in chunks: 134 | return UsageFrequency.ALMOST_ALWAYS 135 | return None 136 | 137 | 138 | def normalize_lemma_text(children): 139 | return " ".join(children["item"]) 140 | 141 | 142 | lemma_schema = [ 143 | String( 144 | name="featured", 145 | xpath="self::*", 146 | attr="class", 147 | quant=1, 148 | callback=is_featured, 149 | ), 150 | # We parse text as a group, because the lemma may have one or more elements, all 151 | # of them represented with "a.dictLink". 
In most cases it's just a single element, 152 | # but if there are more, we need to collect them all and merge them together in the 153 | # group callback normalize_lemma_text() 154 | Group( 155 | name="text", 156 | quant=1, 157 | css="span.tag_lemma", 158 | callback=normalize_lemma_text, 159 | children=[ 160 | String( 161 | name="item", 162 | css="a.dictLink", 163 | quant="+", 164 | callback=normalize, 165 | ), 166 | ], 167 | ), 168 | concat_values( 169 | "pos", 170 | String( 171 | name="pos", 172 | css="span.tag_lemma > span.tag_wordtype, span.tag_lemma > span.tag_type", 173 | quant="*", 174 | ), 175 | ), 176 | # Return a single string that is converted to a list in the model's validator. 177 | concat_values( 178 | "forms", 179 | String( 180 | name="forms", 181 | css="span.tag_forms", 182 | quant="*", 183 | attr="_all_text", 184 | ), 185 | ), 186 | # We parse grammar_info as a group because it may have zero or more elements, 187 | # and we care about the first record only. 188 | Group( 189 | name="grammar_info", 190 | quant=1, 191 | callback=take_first_item, 192 | children=[ 193 | String( 194 | name="item", 195 | quant="*", 196 | callback=normalize, 197 | css=( 198 | "span.tag_lemma > span.tag_lemma_context > " 199 | "span.placeholder > span.grammar_info" 200 | ), 201 | ) 202 | ], 203 | ), 204 | String( 205 | name="audio_links", 206 | quant="?", 207 | css="span.tag_lemma > a.audio", 208 | attr="onclick", 209 | callback=parse_audio_links, 210 | ), 211 | Group( 212 | name="translations", 213 | css="div.translation_lines div.translation", 214 | quant="+", 215 | children=[ 216 | String( 217 | name="featured", 218 | xpath="self::*", 219 | attr="class", 220 | quant=1, 221 | callback=is_featured, 222 | ), 223 | String( 224 | name="text", 225 | css="a.dictLink", 226 | quant=1, 227 | callback=normalize, 228 | ), 229 | concat_values( 230 | "pos", 231 | String( 232 | name="pos", 233 | css="span.tag_type", 234 | quant="*", 235 | attr="title", 236 | ), 237 | ), 238 | String( 239 | name="audio_links", 240 | quant="?", 241 | css="a.audio", 242 | attr="onclick", 243 | callback=parse_audio_links, 244 | ), 245 | Group( 246 | name="usage_frequency", 247 | quant=1, 248 | callback=take_first_non_empty_item, 249 | children=[ 250 | String( 251 | name="item", 252 | quant="*", 253 | css="span.tag_c", 254 | attr="class", 255 | callback=parse_usage_frequency, 256 | ), 257 | ], 258 | ), 259 | Group( 260 | name="examples", 261 | css=".example_lines > .example", 262 | quant="*", 263 | children=[ 264 | String(name="src", css=".tag_s", quant=1, callback=normalize), 265 | String(name="dst", css=".tag_t", quant=1, callback=normalize), 266 | ], 267 | ), 268 | ], 269 | ), 270 | ] 271 | 272 | source_url_schema = [ 273 | String( 274 | name="src_url", 275 | css="div.source_url > a", 276 | attr="href", 277 | quant="?", 278 | ), 279 | String( 280 | name="src_url_text", 281 | css="div.source_url", 282 | quant="?", 283 | ), 284 | ] 285 | 286 | 287 | def normalize_source_url(content): 288 | if content["src_url"]: 289 | return content["src_url"] 290 | if content["src_url_text"]: 291 | return f"http://{content['src_url_text']}" 292 | return None 293 | 294 | 295 | search_result_schema = Group( 296 | quant=1, 297 | children=[ 298 | String(name="src_lang", css="div#data", attr="data-lang1", quant=1), 299 | String(name="dst_lang", css="div#data", attr="data-lang2", quant=1), 300 | String(name="query", css="div#data", attr="data-query", quant=1), 301 | String( 302 | name="correct_query", 303 | css="div#data", 304 |
attr="data-correctspellingofquery", 305 | quant=1, 306 | ), 307 | Group( 308 | quant="*", 309 | css="div.exact > div.lemma", 310 | name="lemmas", 311 | children=lemma_schema, 312 | ), 313 | Group( 314 | quant="*", 315 | css="div.example_lines div.lemma", 316 | name="examples", 317 | children=lemma_schema, 318 | ), 319 | Group( 320 | quant="*", 321 | css="table.result_table > tbody > tr", 322 | name="external_sources", 323 | children=[ 324 | String( 325 | name="src", 326 | css="td.left > div.wrap", 327 | quant=1, 328 | attr="_all_text", 329 | callback=normalize_example, 330 | ), 331 | String( 332 | name="dst", 333 | css="td.right2 > div.wrap", 334 | quant=1, 335 | attr="_all_text", 336 | callback=normalize_example, 337 | ), 338 | Group( 339 | name="src_url", 340 | quant=1, 341 | css="td.left", 342 | children=source_url_schema, 343 | callback=normalize_source_url, 344 | ), 345 | Group( 346 | name="dst_url", 347 | quant=1, 348 | css="td.right2", 349 | children=source_url_schema, 350 | callback=normalize_source_url, 351 | ), 352 | ], 353 | ), 354 | ], 355 | ) 356 | 357 | 358 | autocompletions_schema = Group( 359 | quant=1, 360 | children=[ 361 | Group( 362 | quant="*", 363 | css="div.autocompletion_item", 364 | name="autocompletions", 365 | children=[ 366 | String( 367 | name="text", 368 | css="div.main_row > div.main_item", 369 | quant=1, 370 | callback=normalize, 371 | ), 372 | concat_values( 373 | "pos", 374 | String( 375 | name="pos", 376 | css="div.main_row > div.main_wordtype", 377 | quant="*", 378 | ), 379 | ), 380 | Group( 381 | quant="+", 382 | name="translations", 383 | css="div.translation_row > div > div.translation_item", 384 | children=[ 385 | String( 386 | name="text", xpath="self::*", quant=1, callback=normalize 387 | ), 388 | concat_values( 389 | "pos", 390 | String( 391 | name="pos", 392 | css="div.translation_item > div.wordtype", 393 | quant="*", 394 | ), 395 | ), 396 | ], 397 | ), 398 | ], 399 | ) 400 | ], 401 | ) 402 | -------------------------------------------------------------------------------- /linguee_api/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | 4 | def import_string(import_name: str): 5 | """ 6 | Import an object based on the import string. 7 | 8 | Separate module name from the object name with ":". For example, 9 | "linguee_api.downloaders:HTTPXDownloader" 10 | """ 11 | if ":" not in import_name: 12 | raise RuntimeError( 13 | f'{import_name} must separate module from object with ":". ' 14 | f'For example, "linguee_api.downloaders:HTTPXDownloader"' 15 | ) 16 | module_name, object_name = import_name.rsplit(":", 1) 17 | mod = importlib.import_module(module_name) 18 | return getattr(mod, object_name) 19 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | follow_imports = silent 4 | scripts_are_modules = true 5 | namespace_packages = true 6 | no_implicit_optional = true 7 | 8 | # We had to ignore missing imports because of third-party libraries installed 9 | # inside the virtualenv, and apparently there's no easy way for mypy to respect 10 | # packages inside the virtualenv.
That's the option pre-commit-config runs with 11 | # by default, but we add it here as well for the sake of uniformity of the 12 | # output 13 | ignore_missing_imports = true 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "linguee-api" 3 | version = "2.6.3" 4 | description = "Linguee API" 5 | readme = "README.md" 6 | homepage = "https://github.com/imankulov/linguee-api" 7 | repository = "https://github.com/imankulov/linguee-api" 8 | authors = ["Roman Imankulov "] 9 | license = "MIT" 10 | classifiers = [ 11 | "Intended Audience :: Developers", 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "Programming Language :: Python :: 3.12", 18 | ] 19 | include = [ 20 | "CHANGELOG.md", 21 | "docs/linguee-api.png", 22 | ] 23 | 24 | packages = [ 25 | { include = "linguee_api" }, 26 | { include = "tests", format = "sdist" }, 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.8" 31 | fastapi = "^0.109.2" 32 | pydantic = "^1" 33 | xextract = "^0.1.8" 34 | httpx = "^0.24.1" 35 | uvicorn = "^0.22.0" 36 | sentry-sdk = "^1.24.0" 37 | python-dotenv = "^1.0.0" 38 | loguru = "^0.7.0" 39 | aiosqlite = "^0.19.0" 40 | async-lru = "^2.0.2" 41 | lxml = "^4.9.3" 42 | 43 | [tool.poetry.group.dev.dependencies] 44 | pytest = "^6.1.2" 45 | pytest-xdist = "^2.2.1" 46 | black = "^24.4.1" 47 | flake8 = "^3.8.4" 48 | coverage = "^5.4" 49 | import-linter = "^1.2.1" 50 | pytest-asyncio = "^0.14.0" 51 | ipython = "^8.10.0" 52 | click = "^8.1.7" 53 | asgiref = "^3.3.4" 54 | tox-poetry-installer = {extras = ["poetry"], version = "^0.10.2"} 55 | bump2version = "^1.0.1" 56 | 57 | [tool.coverage.run] 58 | source = ["tests", "linguee_api"] 59 | 60 | [build-system] 61 | requires = ["poetry-core>=1.0.0"] 62 | build-backend = "poetry.core.masonry.api" 63 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.5 2 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import BaseSettings, Field 3 | 4 | from linguee_api.config import settings 5 | from linguee_api.const import PROJECT_ROOT 6 | from linguee_api.downloaders.error_downloader import ErrorDownloader 7 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 8 | from linguee_api.downloaders.interfaces import IDownloader 9 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 10 | from linguee_api.linguee_client import LingueeClient 11 | from linguee_api.parsers import XExtractParser 12 | 13 | 14 | class PytestSettings(BaseSettings): 15 | """Specific settings for pytest.""" 16 | 17 | offline: bool = Field(default=False, description="Run tests offline") 18 | 19 | @property 20 | def downloader(self) -> IDownloader: 21 | return ErrorDownloader() if self.offline else HTTPXDownloader() 22 | 23 | class Config: 24 | env_prefix = "pytest_" 25 | env_file = (PROJECT_ROOT / ".env").as_posix() 26 | 27 | 28 | pytest_settings = PytestSettings() 29 | 30 | 31 | @pytest.fixture 32 | def examples_downloader() -> 
IDownloader: 33 | return SQLiteCache( 34 | cache_database=settings.cache_database, upstream=pytest_settings.downloader 35 | ) 36 | 37 | 38 | @pytest.fixture 39 | def linguee_client(examples_downloader) -> LingueeClient: 40 | return LingueeClient( 41 | page_downloader=examples_downloader, page_parser=XExtractParser() 42 | ) 43 | -------------------------------------------------------------------------------- /tests/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/tests/parsers/__init__.py -------------------------------------------------------------------------------- /tests/parsers/test_autocompletions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.downloaders.interfaces import IDownloader 4 | from linguee_api.linguee_client import get_autocompletions_url 5 | from linguee_api.models import Autocompletions 6 | from linguee_api.parsers import XExtractParser 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_parse_autocompletions_should_return_autocompletions( 11 | examples_downloader: IDownloader, 12 | ): 13 | url = get_autocompletions_url(query="katz", src="de", dst="en") 14 | page = await examples_downloader.download(url) 15 | parser = XExtractParser() 16 | parse_result = parser.parse_autocompletions(page) 17 | 18 | a = Autocompletions.AutocompletionItem 19 | t = Autocompletions.AutocompletionItem.TranslationItem 20 | first_item = a( 21 | text="Katze", 22 | pos="f", 23 | translations=[ 24 | t(text="cat", pos="n"), 25 | t(text="feline", pos="n"), 26 | t(text="crab", pos="n"), 27 | ], 28 | ) 29 | assert parse_result.autocompletions[0] == first_item 30 | -------------------------------------------------------------------------------- /tests/parsers/test_search_result.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pytest 4 | 5 | from linguee_api.const import LANGUAGE_CODE 6 | from linguee_api.downloaders.interfaces import IDownloader 7 | from linguee_api.linguee_client import get_search_url 8 | from linguee_api.models import UsageFrequency 9 | from linguee_api.parsers import XExtractParser 10 | 11 | 12 | @pytest.mark.parametrize( 13 | ["query", "src", "dst", "is_not_found"], 14 | [ 15 | ("constibado", "pt", "en", True), 16 | ("Möglichkei", "de", "en", False), # At least, there are examples 17 | ("esgotar", "pt", "en", False), 18 | ("not bad", "en", "pt", False), 19 | ("xxxxzzzz", "pt", "en", True), 20 | ], 21 | ) 22 | @pytest.mark.asyncio 23 | async def test_parser_should_detect_not_found( 24 | examples_downloader: IDownloader, 25 | query: str, 26 | src: LANGUAGE_CODE, 27 | dst: LANGUAGE_CODE, 28 | is_not_found: bool, 29 | ): 30 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 31 | page = await examples_downloader.download(url) 32 | assert XExtractParser().is_not_found(page) == is_not_found 33 | 34 | 35 | @pytest.mark.asyncio 36 | async def test_parser_should_find_translation_examples( 37 | examples_downloader: IDownloader, 38 | ): 39 | url = get_search_url(query="obrigado", src="pt", dst="en", guess_direction=False) 40 | page_html = await examples_downloader.download(url) 41 | page = XExtractParser().parse_search_result_to_page(page_html) 42 | examples_of_1st_translation = page.lemmas[0].translations[0].examples 43 | assert examples_of_1st_translation 
is not None 44 | assert len(examples_of_1st_translation) == 1 45 | assert examples_of_1st_translation[0].src == ( 46 | "Obrigado por sua participação em nossa pesquisa." 47 | ) 48 | assert examples_of_1st_translation[0].dst == ( 49 | "Thank you for your participation in our survey." 50 | ) 51 | 52 | 53 | @pytest.mark.parametrize( 54 | ["query", "src", "dst", "correction"], 55 | [ 56 | ("constibado", "pt", "en", "constipado"), 57 | ( 58 | "Möglichkei", 59 | "de", 60 | "en", 61 | "möglichkeit", 62 | ), # Despite having examples, Linguee provides a correction. 63 | ("esgotar", "pt", "en", None), 64 | ("xxxxzzzz", "pt", "en", None), 65 | ], 66 | ) 67 | @pytest.mark.asyncio 68 | async def test_parser_should_find_correction( 69 | examples_downloader: IDownloader, 70 | query: str, 71 | src: LANGUAGE_CODE, 72 | dst: LANGUAGE_CODE, 73 | correction: Optional[str], 74 | ): 75 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 76 | page = await examples_downloader.download(url) 77 | assert XExtractParser().find_correction(page) == correction 78 | 79 | 80 | @pytest.mark.parametrize( 81 | ["query", "src", "dst"], 82 | [ 83 | ("esgotar", "pt", "en"), 84 | ( 85 | "Möglichkei", 86 | "de", 87 | "en", 88 | ), # The page only has external sources 89 | ("obrigado", "pt", "en"), 90 | ("not bad", "en", "pt"), 91 | ("einfach", "de", "en"), 92 | ("Tisch", "de", "en"), 93 | ("wünschen", "de", "en"), 94 | ("envisage", "en", "zh"), 95 | ("envisage", "en", "sv"), 96 | ("über", "de", "en"), 97 | ], 98 | ) 99 | @pytest.mark.asyncio 100 | async def test_parse_to_dict_should_return_parseable_result( 101 | examples_downloader: IDownloader, 102 | query: str, 103 | src: LANGUAGE_CODE, 104 | dst: LANGUAGE_CODE, 105 | ): 106 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 107 | page = await examples_downloader.download(url) 108 | XExtractParser().parse_search_result_to_page(page) 109 | 110 | 111 | @pytest.mark.asyncio 112 | async def test_parser_should_find_grammar_info_in_german_verbs( 113 | examples_downloader: IDownloader, 114 | ): 115 | url = get_search_url(query="bringen", src="de", dst="en", guess_direction=False) 116 | page_html = await examples_downloader.download(url) 117 | page = XExtractParser().parse_search_result_to_page(page_html) 118 | assert page.lemmas[0].grammar_info == "Akk" 119 | 120 | 121 | @pytest.mark.asyncio 122 | async def test_parser_should_process_examples_without_links( 123 | examples_downloader: IDownloader, 124 | ): 125 | url = get_search_url(query="einfach", src="de", dst="en", guess_direction=False) 126 | page_html = await examples_downloader.download(url) 127 | page = XExtractParser().parse_search_result_to_page(page_html) 128 | sources = page.external_sources 129 | assert all([s.src_url.startswith("http") for s in sources]) 130 | assert all([s.dst_url.startswith("http") for s in sources]) 131 | 132 | 133 | @pytest.mark.asyncio 134 | async def test_parser_should_find_almost_always_usage_frequency( 135 | examples_downloader: IDownloader, 136 | ): 137 | url = get_search_url(query="bacalhau", src="pt", dst="en", guess_direction=False) 138 | page_html = await examples_downloader.download(url) 139 | page = XExtractParser().parse_search_result_to_page(page_html) 140 | assert page.lemmas[0].translations[1].usage_frequency is None 141 | assert ( 142 | page.lemmas[0].translations[0].usage_frequency == UsageFrequency.ALMOST_ALWAYS 143 | ) 144 | 145 | 146 | @pytest.mark.asyncio 147 | async def test_parser_should_find_often_usage_frequency( 148 | 
examples_downloader: IDownloader, 149 | ): 150 | url = get_search_url(query="placa", src="pt", dst="en", guess_direction=False) 151 | page_html = await examples_downloader.download(url) 152 | page = XExtractParser().parse_search_result_to_page(page_html) 153 | assert page.lemmas[0].translations[1].usage_frequency is None 154 | assert page.lemmas[0].translations[0].usage_frequency == UsageFrequency.OFTEN 155 | 156 | 157 | @pytest.mark.asyncio 158 | async def test_parser_should_find_lemma_forms( 159 | examples_downloader: IDownloader, 160 | ): 161 | url = get_search_url(query="obrigado", src="pt", dst="en", guess_direction=False) 162 | page_html = await examples_downloader.download(url) 163 | page = XExtractParser().parse_search_result_to_page(page_html) 164 | assert page.lemmas[0].forms == [] 165 | assert page.lemmas[1].forms == ["obrigada f sl", "obrigados m pl", "obrigadas f pl"] 166 | 167 | 168 | @pytest.mark.asyncio 169 | async def test_parser_should_find_lemma_forms_for_verbs( 170 | examples_downloader: IDownloader, 171 | ): 172 | url = get_search_url(query="shrink", src="en", dst="pt", guess_direction=False) 173 | page_html = await examples_downloader.download(url) 174 | page = XExtractParser().parse_search_result_to_page(page_html) 175 | assert page.lemmas[0].forms == ["shrank or shrunk", "shrunk"] 176 | -------------------------------------------------------------------------------- /tests/test_api_client.py: -------------------------------------------------------------------------------- 1 | from linguee_api.linguee_client import get_search_url 2 | 3 | 4 | def test_get_linguee_url_should_return_valid_url(): 5 | url = get_search_url(query="bacalhau", src="pt", dst="en", guess_direction=False) 6 | assert url == ( 7 | "https://www.linguee.com/portuguese-english/search?" 8 | "query=bacalhau&ajax=1&source=PT" 9 | ) 10 | -------------------------------------------------------------------------------- /tests/test_downloaders.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | 6 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 7 | from linguee_api.downloaders.interfaces import DownloaderError 8 | 9 | 10 | @pytest.mark.asyncio 11 | async def test_httpx_downloader_should_download_a_page(): 12 | url = ( 13 | "https://www.linguee.com/portuguese-english/search?" 
14 | "query=bacalhau&ajax=1&source=PT" 15 | ) 16 | content = await HTTPXDownloader().download(url) 17 | assert "bacalhau" in content 18 | 19 | 20 | @pytest.mark.asyncio 21 | async def test_httpx_downloader_should_raise_exception_on_invalid_domain_name(): 22 | random_sequence = "".join(random.choices(string.ascii_lowercase, k=30)) 23 | invalid_url = f"https://{random_sequence}.com" 24 | with pytest.raises(DownloaderError): 25 | await HTTPXDownloader().download(invalid_url) 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_httpx_downloader_should_raise_exception_on_non200_code(): 30 | invalid_url = "https://httpbin.org/status/403" 31 | with pytest.raises(DownloaderError): 32 | await HTTPXDownloader().download(invalid_url) 33 | -------------------------------------------------------------------------------- /tests/test_file_cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from linguee_api.downloaders.file_cache import FileCache 6 | from linguee_api.downloaders.mock_downloader import MockDownloader 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_file_cache_should_cache_a_value(tmp_path): 11 | # Cache value 12 | cache = FileCache( 13 | cache_directory=Path(tmp_path), upstream=MockDownloader(message="foo") 14 | ) 15 | await cache.download("https://example.com") 16 | 17 | # Change upstream and try to get the value again 18 | cache.upstream = MockDownloader(message="bar") 19 | result2 = await cache.download("https://example.com") 20 | 21 | # The value should be the same 22 | assert result2 == "foo" 23 | -------------------------------------------------------------------------------- /tests/test_linguee_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.const import LANGUAGE_CODE, LANGUAGES 4 | from linguee_api.linguee_client import LingueeClient 5 | from linguee_api.models import FollowCorrections, ParseError, SearchResult 6 | 7 | 8 | @pytest.mark.asyncio 9 | @pytest.mark.parametrize( 10 | "follow_corrections", 11 | [ 12 | FollowCorrections.ALWAYS, 13 | FollowCorrections.ON_EMPTY_TRANSLATIONS, 14 | ], 15 | ) 16 | async def test_linguee_client_should_redirect_on_not_found( 17 | linguee_client: LingueeClient, 18 | follow_corrections, 19 | ): 20 | search_result = await linguee_client.process_search_result( 21 | query="constibado", 22 | src="pt", 23 | dst="en", 24 | guess_direction=False, 25 | follow_corrections=follow_corrections, 26 | ) 27 | assert search_result.query == "constipado" 28 | 29 | 30 | @pytest.mark.asyncio 31 | async def test_linguee_client_should_not_redirect_if_not_asked( 32 | linguee_client: LingueeClient, 33 | ): 34 | search_result = await linguee_client.process_search_result( 35 | query="constibado", 36 | src="pt", 37 | dst="en", 38 | guess_direction=False, 39 | follow_corrections=FollowCorrections.NEVER, 40 | ) 41 | assert isinstance(search_result, ParseError) 42 | assert search_result.message == "Translation not found" 43 | 44 | 45 | @pytest.mark.asyncio 46 | @pytest.mark.parametrize("lang", list(LANGUAGES.keys())) 47 | async def test_linguee_client_should_process_test_requests( 48 | linguee_client: LingueeClient, 49 | lang: LANGUAGE_CODE, 50 | ): 51 | search_result = await linguee_client.process_search_result( 52 | query="test", 53 | src="en", 54 | dst=lang, 55 | guess_direction=False, 56 | follow_corrections=FollowCorrections.ALWAYS, 57 | ) 58 | assert isinstance(search_result, 
SearchResult) 59 | -------------------------------------------------------------------------------- /tests/test_memory_cache.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.downloaders.memory_cache import MemoryCache 4 | from linguee_api.downloaders.mock_downloader import MockDownloader 5 | 6 | 7 | @pytest.mark.asyncio 8 | async def test_memory_cache_should_cache_a_value(): 9 | # Cache value 10 | cache = MemoryCache(upstream=MockDownloader(message="foo")) 11 | await cache.download("https://example.com") 12 | 13 | # Change upstream and try to get the value again 14 | cache.upstream = MockDownloader(message="bar") 15 | result2 = await cache.download("https://example.com") 16 | 17 | # The value should be the same 18 | assert result2 == "foo" 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_memory_cache_should_evict_cache_on_overflow(): 23 | # Cache value 24 | cache = MemoryCache(upstream=MockDownloader(message="foo"), maxsize=1) 25 | await cache.download("https://example.com") 26 | await cache.download("https://example2.com") 27 | 28 | # Change upstream and try to get the value again 29 | cache.upstream = MockDownloader(message="bar") 30 | result2 = await cache.download("https://example.com") 31 | 32 | # The value should be the new one 33 | assert result2 == "bar" 34 | -------------------------------------------------------------------------------- /tests/test_sqlite_cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from linguee_api.downloaders.mock_downloader import MockDownloader 6 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_sqlite_cache_should_cache_a_value(tmp_path): 11 | # Cache value 12 | cache_database = Path(tmp_path) / "cache.db" 13 | cache = SQLiteCache( 14 | cache_database=cache_database, upstream=MockDownloader(message="foo") 15 | ) 16 | await cache.download("https://example.com") 17 | 18 | # Change upstream and try to get the value again 19 | cache.upstream = MockDownloader(message="bar") 20 | result2 = await cache.download("https://example.com") 21 | 22 | # The value should be the same 23 | assert result2 == "foo" 24 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38, py39, py310, py311, py312 3 | 4 | isolated_build = true 5 | 6 | [testenv] 7 | locked_deps = 8 | pytest 9 | pytest-xdist 10 | pytest-asyncio 11 | commands = 12 | pytest -s tests 13 | --------------------------------------------------------------------------------
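For reference, the downloader, parser, and client classes dumped above can be wired together outside the test suite in the same way tests/conftest.py wires them. The snippet below is a minimal sketch, not a file from the repository: it assumes the package and its dependencies are installed (for example with poetry install), and the cache path and example query are illustrative placeholders.

import asyncio
from pathlib import Path

from linguee_api.downloaders.httpx_downloader import HTTPXDownloader
from linguee_api.downloaders.sqlite_cache import SQLiteCache
from linguee_api.linguee_client import LingueeClient
from linguee_api.models import FollowCorrections
from linguee_api.parsers import XExtractParser


async def main() -> None:
    # Cache downloaded Linguee pages in SQLite; the path here is an illustrative choice.
    downloader = SQLiteCache(
        cache_database=Path(".cache/cache.sqlite3"),
        upstream=HTTPXDownloader(),
    )
    client = LingueeClient(page_downloader=downloader, page_parser=XExtractParser())
    result = await client.process_search_result(
        query="obrigado",
        src="pt",
        dst="en",
        guess_direction=False,
        follow_corrections=FollowCorrections.ALWAYS,
    )
    # Depending on the page, the result is one of the models defined in
    # linguee_api/models.py: SearchResult, Correction, NotFound, or ParseError.
    print(result)


if __name__ == "__main__":
    asyncio.run(main())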