├── .bumpversion.cfg ├── .cache └── .gitignore ├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── docker-build.yml │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Procfile ├── README.md ├── app.json ├── docker-compose.yml ├── docs ├── development.md └── linguee-api.png ├── env.example ├── fly.toml ├── linguee_api ├── __init__.py ├── api.py ├── config.py ├── const.py ├── downloaders │ ├── __init__.py │ ├── error_downloader.py │ ├── file_cache.py │ ├── httpx_downloader.py │ ├── interfaces.py │ ├── memory_cache.py │ ├── mock_downloader.py │ └── sqlite_cache.py ├── linguee_client.py ├── models.py ├── parser_utils.py ├── parsers.py └── utils.py ├── mypy.ini ├── poetry.lock ├── pyproject.toml ├── runtime.txt ├── tests ├── conftest.py ├── parsers │ ├── __init__.py │ ├── test_autocompletions.py │ └── test_search_result.py ├── test_api_client.py ├── test_downloaders.py ├── test_file_cache.py ├── test_linguee_client.py ├── test_memory_cache.py └── test_sqlite_cache.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2.6.3 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:CHANGELOG.md] 11 | search = UNRELEASED 12 | replace = {new_version} ({now:%Y-%m-%d}) 13 | -------------------------------------------------------------------------------- /.cache/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | * 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .mypy_cache 3 | .pytest_cache 4 | .env 5 | .coverage 6 | .tox 7 | .cache 8 | .git 9 | coverage.xml 10 | htmlcov/ 11 | dist 12 | *.sqlite3 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, W503 4 | max-complexity = 10 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-publish: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: read 14 | packages: write 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v2 19 | 20 | - name: Login to GitHub Container Registry 21 | uses: docker/login-action@v2 22 | with: 23 | registry: ghcr.io 24 | username: ${{ github.actor }} 25 | password: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - name: Build and publish Docker image 28 | uses: docker/build-push-action@v4 29 | with: 30 | context: . 
31 | push: true 32 | tags: | 33 | ghcr.io/${{ github.repository }}:${{ github.sha }} 34 | ghcr.io/${{ github.repository }}:latest 35 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Install poetry 11 | run: pipx install poetry 12 | - uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.12" 15 | cache: "poetry" 16 | - name: Install dependencies 17 | run: | 18 | poetry install 19 | - name: Test with pytest 20 | run: | 21 | set -ex 22 | poetry run coverage run -m pytest 23 | - name: Convert coverage to XML 24 | run: | 25 | set -ex 26 | poetry run coverage xml 27 | - name: Upload coverage to codecov 28 | env: 29 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 30 | run: | 31 | bash <(curl -s https://codecov.io/bash) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .mypy_cache 3 | .pytest_cache 4 | .env 5 | .coverage 6 | .tox 7 | coverage.xml 8 | htmlcov/ 9 | dist 10 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | multi_line_output=3 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.9 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.4.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: check-merge-conflict 10 | - id: check-case-conflict 11 | - id: debug-statements 12 | 13 | - repo: https://github.com/psf/black 14 | rev: 23.3.0 15 | hooks: 16 | - id: black 17 | 18 | - repo: https://github.com/pre-commit/mirrors-mypy 19 | rev: v1.2.0 20 | hooks: 21 | - id: mypy 22 | 23 | - repo: https://github.com/PyCQA/isort 24 | rev: 5.12.0 25 | hooks: 26 | - id: isort 27 | 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 6.0.0 30 | hooks: 31 | - id: flake8 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## 2.6.3 (2024-08-14) 6 | 7 | - Updated pydantic to the latest 1.x version to address incompatibility with python 3.12.4. Ref: https://github.com/pydantic/pydantic/issues/9637 8 | 9 | ## 2.6.2 (2024-04-25) 10 | 11 | - Added support for Python 3.11 and 3.12. 12 | - Updated Dockerfile to use Python 3.12. 13 | - Added docker-build.yml action. 14 | 15 | ## 2.6.1 (2024-04-25) 16 | 17 | - Updated all dependencies. 18 | - Fixed a bug with usage frequency parsing (#48). 19 | 20 | ## 2.6.0 (2023-04-23) 21 | 22 | - Added SQLite cache and made it the default one. 23 | - Made MemoryCache use LRU. 24 | - Added tests for all cache classes. 25 | - Updated Dockerfile to use /cache for file and SQLite caches. 26 | - Added a sample docker-compose file. 27 | - Updated FastAPI and httpx dependencies. 
28 | 29 | ## 2.5.1 (2022-11-19) 30 | 31 | - Added FAQ to the README, where provided a clearer explanation of the 503 error. 32 | 33 | ## 2.5.0 (2022-11-19) 34 | 35 | - Added "follow_corrections" API flag (#23) 36 | - Added configuration to host the project on fly.io 37 | - Updated the address of the sample installation to https://linguee-api.fly.dev 38 | - Added lemma forms (#26) 39 | 40 | ## 2.4.0 (2022-08-01) 41 | 42 | - Set Heroku runtime to python-3.10.5 (#21) 43 | - Added "usage_frequency" attribute to translations (#22) 44 | 45 | ## 2.3.0 (2022-06-17) 46 | 47 | - Added packaging support 48 | - Added support for various versions of Python (3.8+) 49 | - Updated httpx to the latest version. Ref: CVE-2021-41945 50 | 51 | ## 2.2.1 (2022-04-20) 52 | 53 | - Updated development dependencies and pre-commit hooks 54 | - Provided usage examples for Python and Bash 55 | 56 | ## 2.2.0 (2021-09-28) 57 | 58 | - Fixed a bug with multiple grammar infos (#12). 59 | - Fixed a file cache issue on the Windows platform (#16). 60 | - Updated all dependencies to their latest versions. 61 | 62 | ## 2.1.0 (2021-05-16) 63 | 64 | - Added translation examples to the /translations API endpoint (#10). 65 | 66 | ## 2.0.0 (2021-04-29) 67 | 68 | - The first release of the Python version of the project. 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # First stage: Install poetry and dependencies 2 | FROM python:3.12-slim AS builder 3 | 4 | # Install system dependencies 5 | RUN apt-get update \ 6 | && apt-get install --no-install-recommends -y \ 7 | curl \ 8 | build-essential \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Set environment variables for poetry 13 | ENV POETRY_HOME="/opt/poetry" 14 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true 15 | ENV POETRY_NO_INTERACTION=1 16 | ENV PATH="$POETRY_HOME/bin:$PATH" 17 | 18 | # Install poetry 19 | RUN curl -sSL https://install.python-poetry.org | python3 - 20 | 21 | # Copy only requirements to cache them in docker layer 22 | WORKDIR /app 23 | COPY pyproject.toml poetry.lock /app/ 24 | 25 | # Install runtime deps - uses $POETRY_VIRTUALENVS_IN_PROJECT internally 26 | # and install spacy's en_core_web_sm 27 | RUN poetry install --only main --no-root 28 | 29 | 30 | # Second stage: Copy from builder and run 31 | FROM python:3.12-slim AS runner 32 | 33 | # Copy virtualenv from builder 34 | COPY --from=builder /app/.venv /app/.venv 35 | 36 | WORKDIR /app 37 | 38 | # Ensure we use the virtualenv 39 | ENV PATH="/app/.venv/bin:$PATH" 40 | 41 | # Copy the content of the app 42 | COPY . 
/app/ 43 | 44 | # Declare port FastAPI will use 45 | EXPOSE 8000 46 | 47 | # Declare the VOLUME and use it as a cache directory 48 | VOLUME /cache 49 | ENV CACHE_DIRECTORY=/cache 50 | 51 | # Command to run on container start 52 | CMD ["uvicorn", "linguee_api.api:app", "--host", "0.0.0.0", "--port", "8000"] 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Roman Imankulov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: uvicorn linguee_api.api:app --host=0.0.0.0 --port=$PORT 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Linguee API 2 | 3 | [Linguee](https://linguee.com) provides an excellent dictionary and translation memory service. Unfortunately, there is no way you can get automated access to it. Linguee API fixes the problem. It acts as a proxy and converts their HTML responses to an easy-to-use JSON API. 4 | 5 | ## API endpoints 6 | 7 | The proxy provides three API endpoints: for translations, for examples, and for external sources. 8 | 9 | ![Linguee API](./docs/linguee-api.png) 10 | 11 | The API documentation and the playground are available for the sample installation: 12 | 13 | - [Documentation and API playground](https://linguee-api.fly.dev/docs) 14 | - [The same documentation, but formatted with ReDoc](https://linguee-api.fly.dev/redoc) 15 | 16 | ## Sample installation 17 | 18 | A sample installation is available at https://linguee-api.fly.dev. 19 | 20 | - Get translations of the word "bacalhau" from Portuguese to English: [https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en). 21 | - Get a list of curated examples: [https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en). 22 | - Get examples from external sources: [https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en](https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en). 
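The code also exposes a fourth endpoint, `/api/v2/autocompletions`, defined in `linguee_api/api.py` but not listed above. A minimal sketch of calling it with Python and `requests`, assuming the sample installation serves it; the field names (`text`, `pos`, `translations`) follow the `Autocompletions` model in `linguee_api/models.py`, and the query prefix is an arbitrary example:

```python
import requests

api_root = "https://linguee-api.fly.dev/api/v2"

# Ask for autocompletion suggestions for a query prefix.
resp = requests.get(
    f"{api_root}/autocompletions",
    params={"query": "obrig", "src": "pt", "dst": "en"},
)
resp.raise_for_status()

# Each item carries "text", an optional "pos", and a list of "translations".
for item in resp.json():
    translations = ", ".join(t["text"] for t in item["translations"])
    print(f"{item['text']} -> {translations}")
```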
23 | 24 | ## Local installation 25 | 26 | Install the Linguee API. 27 | 28 | ```shell 29 | $ pip install linguee-api 30 | ``` 31 | 32 | Run the API server with `uvicorn` (installed as a dependency). 33 | 34 | ```shell 35 | $ uvicorn linguee_api.api:app 36 | ... 37 | INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) 38 | ... 39 | ``` 40 | 41 | Open http://127.0.0.1:8000. You will be redirected to the API documentation page, where you can test the API. 42 | 43 | ## Supported languages 44 | 45 | The API supports all the languages supported by Linguee. As in Linguee, not all language pairs are valid, though. Supported languages: 46 | `bg` (Bulgarian), `cs` (Czech), `da` (Danish), `de` (German), `el` (Greek), `en` (English), `es` (Spanish), `et` (Estonian), `fi` (Finnish), `fr` (French), `hu` (Hungarian), `it` (Italian), `ja` (Japanese), `lt` (Lithuanian), `lv` (Latvian), `mt` (Maltese), `nl` (Dutch), `pl` (Polish), `pt` (Portuguese), `ro` (Romanian), `ru` (Russian), `sk` (Slovak), `sl` (Slovene), `sv` (Swedish), `zh` (Chinese). 47 | 48 | ## Response structure 49 | 50 | **Lemmas** 51 | 52 | Every query (a random string) can match several so-called lemma objects. 53 | 54 | According to Wikipedia, a [lemma](https://en.wikipedia.org/wiki/Lemma_(morphology)) is the canonical form, dictionary form, or citation form of a set of words. 55 | 56 | In English, for example, break, breaks, broke, broken, and breaking are forms of the same lexeme, with "break" as the lemma by which they are indexed. 57 | 58 | In the API, lemmas have only one required attribute, "text," but may have optional elements, such as part of speech ("pos") and audio links with pronunciations. 59 | 60 | 61 | **Translations** 62 | 63 | Every lemma has one or more translations. The translation is a lemma in a different language and has a similar structure, with the required text field and optional part of speech and audio links. 64 | 65 | 66 | **Examples** 67 | 68 | In addition to lemmas, the API returns several usage examples curated by dictionary authors. Examples are short phrases annotated with one or more equivalents in different languages. When appropriate, examples may contain the part-of-speech form and audio links. 69 | 70 | **External Sources** 71 | 72 | On top of curated examples, Linguee provides links to external sources. The API returns objects containing the phrase snippet in the original language and an equivalent snippet in the translation. 73 | 74 | ## Usage examples with Python and requests 75 | 76 | Once installed, the Linguee API can be used like any other API service. I recommend using the [requests](https://docs.python-requests.org/) library. 77 | 78 | ### Translate a word or a phrase from one language to another with Python 79 | 80 | A request to the sample API installation to translate the word "bacalhau" from Portuguese to English. 81 | 82 | ```python 83 | import requests 84 | 85 | api_root = "https://linguee-api.fly.dev/api/v2" 86 | resp = requests.get(f"{api_root}/translations", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 87 | for lemma in resp.json(): 88 | for translation in lemma['translations']: 89 | print(f"{lemma['text']} -> {translation['text']}") 90 | ``` 91 | 92 | This will print: 93 | 94 | ``` 95 | bacalhau -> cod 96 | bacalhau -> codfish 97 | ``` 98 | 99 | ### Provide translation examples with Python 100 | 101 | A request to the sample API installation to get all usage examples of "bacalhau" along with their translations. 
102 | 103 | ```python 104 | import requests 105 | 106 | api_root = "https://linguee-api.fly.dev/api/v2" 107 | 108 | resp = requests.get(f"{api_root}/examples", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 109 | 110 | for example in resp.json(): 111 | for translation in example["translations"]: 112 | print(f"{example['text']} -> {translation['text']}") 113 | ``` 114 | 115 | This will print: 116 | 117 | ``` 118 | bacalhau desfiado -> shredded cod 119 | lombo de bacalhau -> codfish fillet 120 | ... 121 | bacalhau do Atlântico -> Atlantic cod 122 | ``` 123 | 124 | ### Get access to real-world usage examples with Python 125 | 126 | A request to the sample API installation to get all real-world usage examples of "bacalhau" along with their translations. 127 | 128 | ```python 129 | import requests 130 | 131 | api_root = "https://linguee-api.fly.dev/api/v2" 132 | 133 | resp = requests.get(f"{api_root}/external_sources", params={"query": "bacalhau", "src": "pt", "dst": "en"}) 134 | for source in resp.json(): 135 | print(f"{source['src']} -> {source['dst']}") 136 | ``` 137 | 138 | This will print a long list of real-world examples like this: 139 | 140 | ``` 141 | É calculado o esforço de [...] pesca de todos os navios que capturam bacalhau. -> The fishing effort of all [...] the vessels catching cod will be calculated. 142 | ``` 143 | 144 | 145 | ## Bash, curl and jq usage example 146 | 147 | Once installed, the Linguee API can be used like any other API service. 148 | 149 | For Bash scripts, you can use curl and [jq](https://stedolan.github.io/jq/), a command-line JSON parser. 150 | 151 | ### Translate a word or a phrase from one language to another with Bash 152 | 153 | A request to the sample API installation to translate the word "bacalhau" from Portuguese to English. 154 | 155 | ```bash 156 | curl -s 'https://linguee-api.fly.dev/api/v2/translations?query=bacalhau&src=pt&dst=en' | jq -c '{text: .[].text, translation: .[].translations[].text}' 157 | ``` 158 | 159 | This will print: 160 | 161 | ```json lines 162 | {"text":"bacalhau","translation":"cod"} 163 | {"text":"bacalhau","translation":"codfish"} 164 | ``` 165 | 166 | ### Provide translation examples with Bash 167 | 168 | A request to the sample API installation to get all usage examples of "bacalhau" along with their translations. 169 | 170 | ```shell 171 | curl -s 'https://linguee-api.fly.dev/api/v2/examples?query=bacalhau&src=pt&dst=en' | jq -c '{text: .[].text, translation: .[].translations[].text}' 172 | ``` 173 | 174 | This will print something like this: 175 | 176 | ```json lines 177 | {"text":"bacalhau desfiado","translation":"shredded cod"} 178 | {"text":"bacalhau desfiado","translation":"codfish fillet"} 179 | ... 180 | {"text":"bacalhau do Atlântico","translation":"Atlantic cod"} 181 | ``` 182 | 183 | ### Get access to real-world usage examples with Bash 184 | 185 | A request to the sample API installation to get all real-world usage examples of "bacalhau" along with their translations. 186 | 187 | ```shell 188 | curl -s 'https://linguee-api.fly.dev/api/v2/external_sources?query=bacalhau&src=pt&dst=en' | jq -c '{src: .[].src, dst: .[].dst}' 189 | ``` 190 | 191 | This will print a long list of real-world examples like this: 192 | 193 | ```json lines 194 | {"src":"É calculado o esforço de [...] pesca de todos os navios que capturam bacalhau.","dst":"The fishing effort of all [...] the vessels catching cod will be calculated."} 195 | ... 
196 | ``` 197 | 198 | ## FAQ 199 | 200 | ### The API server returns "The Linguee server returned 503" 201 | 202 | This error means that the Linguee website temporarily blocks the API client for sending too many requests. If you use the sample API server on https://linguee-api.fly.dev, you can try to send the request later or consider installing your API server, where you won't share the same IP address with other users. 203 | 204 | ## Terms and Conditions 205 | 206 | If you use the API, make sure you comply with 207 | [Linguee Terms and Conditions](http://www.linguee.com/page/termsAndConditions.php), 208 | and in particular with that clause: 209 | 210 | > Both private and business usage of linguee.com services is free of charge. 211 | > It is however strictly prohibited to forward on our services to third 212 | > parties against payment 213 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Linguee API", 3 | "description": "Linguee proxy to convert HTML responses from linguee.com to JSON format", 4 | "website": "https://github.com/imankulov/linguee-api", 5 | "keywords": ["api", "translation"], 6 | "buildpacks": [ 7 | { 8 | "url": "https://github.com/moneymeets/python-poetry-buildpack.git" 9 | }, 10 | { 11 | "url": "heroku/python" 12 | } 13 | ], 14 | "env": { 15 | "DISABLE_POETRY_CREATE_RUNTIME_FILE": { 16 | "description": "Disable the creation of the runtime file by Poetry buildpack", 17 | "value": "1" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # This is a sample docker-compose.yml file to run the Linguee API server. It exposes the API on port 8000 and 2 | # stores the cache between runs in a volume "linguee-cache". 3 | # 4 | # If you need to reset the cache, you can run: 5 | # docker-compose down -v 6 | version: "3.3" 7 | 8 | services: 9 | linguee: 10 | build: . 11 | ports: 12 | - "127.0.0.1:8000:8000" 13 | volumes: 14 | - linguee-cache:/cache 15 | 16 | volumes: 17 | linguee-cache: {} 18 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development installation 2 | 3 | Quick notes to myself how to install the project and run it locally. 4 | 5 | ## How to install 6 | 7 | ```bash 8 | poetry install 9 | cp env.example .env 10 | ``` 11 | 12 | ## How to run tests 13 | 14 | You can run tests offline or online. Offline tests fail when you try to download a new translation that is not in the cache. Set the configuration option in the `.env` file. 
15 | 16 | ```dotenv 17 | PYTEST_OFFLINE=false 18 | ``` 19 | 20 | Then run the tests: 21 | 22 | ```bash 23 | poetry run pytest 24 | ``` 25 | 26 | ## How to run the API server 27 | 28 | ```bash 29 | poetry run uvicorn linguee_api.api:app 30 | ``` 31 | 32 | ## How to make a new release 33 | 34 | ```bash 35 | bump2version minor 36 | git push 37 | git push --tags 38 | poetry build 39 | poetry publish 40 | ``` 41 | -------------------------------------------------------------------------------- /docs/linguee-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/docs/linguee-api.png -------------------------------------------------------------------------------- /env.example: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Sentry settings 3 | # --------------------------------------------------------- 4 | # When settings are not defined, Sentry is disabled. 5 | # SENTRY_DSN= 6 | # SENTRY_ENVIRONMENT=development 7 | 8 | 9 | # --------------------------------------------------------- 10 | # File cache settings 11 | # --------------------------------------------------------- 12 | # When settings are not defined, .cache directory in the project 13 | # root is used. 14 | # CACHE_DIRECTORY=/tmp/.cache 15 | 16 | 17 | # ---------------------------------------------------- 18 | # Pytest settings 19 | # ---------------------------------------------------- 20 | # Run tests offline 21 | PYTEST_OFFLINE=false 22 | -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml file generated for linguee-api 2 | app = "linguee-api" 3 | kill_signal = "SIGINT" 4 | kill_timeout = 5 5 | processes = [] 6 | 7 | [env] 8 | SENTRY_ENVIRONMENT = "production" 9 | 10 | [experimental] 11 | allowed_public_ports = [] 12 | auto_rollback = true 13 | 14 | [[services]] 15 | http_checks = [] 16 | internal_port = 8000 17 | processes = ["app"] 18 | protocol = "tcp" 19 | script_checks = [] 20 | 21 | [services.concurrency] 22 | hard_limit = 25 23 | soft_limit = 20 24 | type = "connections" 25 | 26 | [[services.ports]] 27 | force_https = true 28 | handlers = ["http"] 29 | port = 80 30 | 31 | [[services.ports]] 32 | handlers = ["tls", "http"] 33 | port = 443 34 | 35 | [[services.tcp_checks]] 36 | grace_period = "1s" 37 | interval = "15s" 38 | restart_limit = 0 39 | timeout = "2s" 40 | 41 | [mounts] 42 | source="linguee_cache" 43 | destination="/cache" 44 | -------------------------------------------------------------------------------- /linguee_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/linguee_api/__init__.py -------------------------------------------------------------------------------- /linguee_api/api.py: -------------------------------------------------------------------------------- 1 | import sentry_sdk 2 | from fastapi import FastAPI, Query, Response, status 3 | from sentry_sdk.integrations.asgi import SentryAsgiMiddleware 4 | from starlette.responses import RedirectResponse 5 | 6 | from linguee_api.config import settings 7 | from linguee_api.const import ( 8 | FOLLOW_CORRECTIONS_DESCRIPTION, 9 | LANGUAGE_CODE, 10 | 
PROJECT_DESCRIPTION, 11 | ) 12 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 13 | from linguee_api.downloaders.memory_cache import MemoryCache 14 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 15 | from linguee_api.linguee_client import LingueeClient 16 | from linguee_api.models import ( 17 | Autocompletions, 18 | FollowCorrections, 19 | ParseError, 20 | SearchResult, 21 | ) 22 | from linguee_api.parsers import XExtractParser 23 | 24 | sentry_sdk.init(dsn=settings.sentry_dsn, environment=settings.sentry_environment) 25 | app = FastAPI( 26 | title="Linguee API", 27 | description=PROJECT_DESCRIPTION, 28 | version="2.0.0", 29 | ) 30 | app.add_middleware(SentryAsgiMiddleware) 31 | 32 | page_downloader = MemoryCache( 33 | upstream=SQLiteCache( 34 | cache_database=settings.cache_database, 35 | upstream=HTTPXDownloader(), 36 | ) 37 | ) 38 | client = LingueeClient(page_downloader=page_downloader, page_parser=XExtractParser()) 39 | 40 | 41 | @app.get("/", include_in_schema=False) 42 | def index(): 43 | return RedirectResponse("/docs") 44 | 45 | 46 | @app.get( 47 | "/api/v2/translations", 48 | status_code=status.HTTP_200_OK, 49 | responses={ 50 | status.HTTP_200_OK: {"model": list[SearchResult.Lemma]}, 51 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 52 | }, 53 | ) 54 | async def translations( 55 | query: str, 56 | src: LANGUAGE_CODE, 57 | dst: LANGUAGE_CODE, 58 | response: Response, 59 | guess_direction: bool = False, 60 | follow_corrections: FollowCorrections = Query( 61 | default=FollowCorrections.ALWAYS, 62 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 63 | ), 64 | ): 65 | """ 66 | Translate the query between src and dst language. 67 | 68 | The response contains the list of lemma objects matching the query in the source 69 | language. Each of these lemmas is annotated with one or multiple translations 70 | and optional examples. 
71 | """ 72 | result = await client.process_search_result( 73 | query=query, 74 | src=src, 75 | dst=dst, 76 | guess_direction=guess_direction, 77 | follow_corrections=follow_corrections, 78 | ) 79 | if isinstance(result, ParseError): 80 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 81 | return result 82 | return result.lemmas 83 | 84 | 85 | @app.get( 86 | "/api/v2/examples", 87 | status_code=status.HTTP_200_OK, 88 | responses={ 89 | status.HTTP_200_OK: {"model": list[SearchResult.Example]}, 90 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 91 | }, 92 | ) 93 | async def examples( 94 | query: str, 95 | src: LANGUAGE_CODE, 96 | dst: LANGUAGE_CODE, 97 | response: Response, 98 | guess_direction: bool = False, 99 | follow_corrections: FollowCorrections = Query( 100 | default=FollowCorrections.ALWAYS, 101 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 102 | ), 103 | ): 104 | """Provide translation examples.""" 105 | result = await client.process_search_result( 106 | query=query, 107 | src=src, 108 | dst=dst, 109 | guess_direction=guess_direction, 110 | follow_corrections=follow_corrections, 111 | ) 112 | if isinstance(result, ParseError): 113 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 114 | return result 115 | return result.examples 116 | 117 | 118 | @app.get( 119 | "/api/v2/external_sources", 120 | status_code=status.HTTP_200_OK, 121 | responses={ 122 | status.HTTP_200_OK: {"model": list[SearchResult.ExternalSource]}, 123 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 124 | }, 125 | ) 126 | async def external_sources( 127 | query: str, 128 | src: LANGUAGE_CODE, 129 | dst: LANGUAGE_CODE, 130 | response: Response, 131 | guess_direction: bool = False, 132 | follow_corrections: FollowCorrections = Query( 133 | default=FollowCorrections.ALWAYS, 134 | description=FOLLOW_CORRECTIONS_DESCRIPTION, 135 | ), 136 | ): 137 | """Provide translation examples from external (unverified) sources.""" 138 | result = await client.process_search_result( 139 | query=query, 140 | src=src, 141 | dst=dst, 142 | guess_direction=guess_direction, 143 | follow_corrections=follow_corrections, 144 | ) 145 | if isinstance(result, ParseError): 146 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 147 | return result 148 | return result.external_sources 149 | 150 | 151 | @app.get( 152 | "/api/v2/autocompletions", 153 | status_code=status.HTTP_200_OK, 154 | responses={ 155 | status.HTTP_200_OK: {"model": list[Autocompletions.AutocompletionItem]}, 156 | status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": ParseError}, 157 | }, 158 | ) 159 | async def autocompletions( 160 | query: str, 161 | src: LANGUAGE_CODE, 162 | dst: LANGUAGE_CODE, 163 | response: Response, 164 | ): 165 | """Provide autocompletion suggestions for the query.""" 166 | result = await client.process_autocompletions( 167 | query=query, 168 | src_lang_code=src, 169 | dst_lang_code=dst, 170 | ) 171 | if isinstance(result, ParseError): 172 | response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR 173 | return result 174 | return result.autocompletions 175 | -------------------------------------------------------------------------------- /linguee_api/config.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | from pydantic import BaseSettings 5 | 6 | from linguee_api.const import PROJECT_ROOT 7 | 8 | 9 | class Settings(BaseSettings): 10 | """Application settings.""" 11 | 12 | # 
Sentry settings 13 | sentry_dsn: Optional[str] = None 14 | sentry_environment: str = "development" 15 | 16 | # File and SQLite cache settings 17 | cache_directory: pathlib.Path = PROJECT_ROOT / ".cache" 18 | 19 | @property 20 | def cache_database(self) -> pathlib.Path: 21 | """Cache database.""" 22 | return self.cache_directory / "cache.sqlite3" 23 | 24 | class Config: 25 | env_file = (PROJECT_ROOT / ".env").as_posix() 26 | 27 | 28 | settings = Settings() 29 | -------------------------------------------------------------------------------- /linguee_api/const.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Literal 3 | 4 | PROJECT_ROOT = pathlib.Path(__file__).parents[1] 5 | USER_AGENT = "Linguee API proxy (https://github.com/imankulov/linguee-api)" 6 | LANGUAGE_CODE = Literal[ 7 | "bg", 8 | "cs", 9 | "da", 10 | "de", 11 | "el", 12 | "en", 13 | "es", 14 | "et", 15 | "fi", 16 | "fr", 17 | "hu", 18 | "it", 19 | "ja", 20 | "lt", 21 | "lv", 22 | "mt", 23 | "nl", 24 | "pl", 25 | "pt", 26 | "ro", 27 | "ru", 28 | "sk", 29 | "sl", 30 | "sv", 31 | "zh", 32 | ] 33 | LANGUAGES = { 34 | "bg": "bulgarian", 35 | "cs": "czech", 36 | "da": "danish", 37 | "de": "german", 38 | "el": "greek", 39 | "en": "english", 40 | "es": "spanish", 41 | "et": "estonian", 42 | "fi": "finnish", 43 | "fr": "french", 44 | "hu": "hungarian", 45 | "it": "italian", 46 | "ja": "japanese", 47 | "lt": "lithuanian", 48 | "lv": "latvian", 49 | "mt": "maltese", 50 | "nl": "dutch", 51 | "pl": "polish", 52 | "pt": "portuguese", 53 | "ro": "romanian", 54 | "ru": "russian", 55 | "sk": "slovak", 56 | "sl": "slovene", 57 | "sv": "swedish", 58 | "zh": "chinese", 59 | } 60 | MAX_REDIRECTS = 5 61 | PROJECT_DESCRIPTION = """ 62 |

 63 | <a href="https://linguee.com">Linguee</a> provides an excellent 64 | dictionary and translation memory service. Unfortunately, there is no way you 65 | can get automated access to it. Linguee API fixes the problem. It acts as a 66 | proxy and converts their HTML responses to an easy-to-use JSON API. 

68 |

 69 | This installation is an example. If you want to have a reliable service, install 70 | it yourself. The source code and installation instructions are available at 71 | <a href="https://github.com/imankulov/linguee-api"> 72 | github.com/imankulov/linguee-api</a>. 73 | 

74 |

 75 | For any questions, ideas, or bug reports, file 76 | <a href="https://github.com/imankulov/linguee-api/issues"> 77 | an issue on GitHub</a>. 78 | 

79 | """ 80 | 81 | FOLLOW_CORRECTIONS_DESCRIPTION = """A flag that defines how to treat responses with a 82 | 'did you mean' link. There are three possible values: 83 | 84 | - `always` (default): always follow the suggestion if found on a page, even if the page 85 | itself has translations. 86 | - `never`: never follow the suggested correction. 87 | - `on_empty_translations`: only follow the link if there are no translations on 88 | the page. 89 | """ 90 | -------------------------------------------------------------------------------- /linguee_api/downloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/linguee_api/downloaders/__init__.py -------------------------------------------------------------------------------- /linguee_api/downloaders/error_downloader.py: -------------------------------------------------------------------------------- 1 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 2 | 3 | 4 | class ErrorDownloader(IDownloader): 5 | """ 6 | A downloader that always raises an DownloaderError. 7 | 8 | Helpful to use as the upstream downloader for cache in tests to make sure 9 | that we don't send requests to the server. 10 | """ 11 | 12 | async def download(self, url: str) -> str: 13 | raise DownloaderError(f"I cannot download {url}") 14 | -------------------------------------------------------------------------------- /linguee_api/downloaders/file_cache.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import urllib.parse 3 | from typing import Optional 4 | 5 | from linguee_api.downloaders.interfaces import ICache, IDownloader 6 | 7 | 8 | class FileCache(ICache): 9 | """File Cache.""" 10 | 11 | def __init__(self, cache_directory: pathlib.Path, upstream: IDownloader): 12 | self.cache_directory = cache_directory 13 | self.upstream = upstream 14 | self.cache_directory.mkdir(parents=True, exist_ok=True) 15 | 16 | async def get_from_cache(self, url: str) -> Optional[str]: 17 | page_file = self._get_page_file(url) 18 | if page_file.is_file(): 19 | return page_file.read_text(encoding="utf-8") 20 | return None 21 | 22 | async def put_to_cache(self, url: str, page: str) -> None: 23 | page_file = self._get_page_file(url) 24 | page_file.write_text(page, encoding="utf-8") 25 | 26 | def _get_page_file(self, url: str) -> pathlib.Path: 27 | return self.cache_directory / urllib.parse.quote(url, safe="") 28 | -------------------------------------------------------------------------------- /linguee_api/downloaders/httpx_downloader.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | 3 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 4 | 5 | ERROR_503 = ( 6 | "The Linguee server returned 503. The API proxy was temporarily blocked by " 7 | "Linguee. For more details, see https://github.com/imankulov/linguee-api#" 8 | "the-api-server-returns-the-linguee-server-returned-503" 9 | ) 10 | 11 | 12 | class HTTPXDownloader(IDownloader): 13 | """ 14 | Real downloader. 15 | 16 | Sends request to linguee.com to read the page. 
17 | """ 18 | 19 | async def download(self, url: str) -> str: 20 | async with httpx.AsyncClient() as client: 21 | try: 22 | response = await client.get(url) 23 | except httpx.ConnectError as e: 24 | raise DownloaderError(str(e)) from e 25 | 26 | if response.status_code == 503: 27 | raise DownloaderError(ERROR_503) 28 | 29 | if response.status_code != 200: 30 | raise DownloaderError( 31 | f"The Linguee server returned {response.status_code}" 32 | ) 33 | return response.text 34 | -------------------------------------------------------------------------------- /linguee_api/downloaders/interfaces.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Optional 3 | 4 | 5 | class DownloaderError(Exception): 6 | pass 7 | 8 | 9 | class IDownloader(abc.ABC): 10 | @abc.abstractmethod 11 | async def download(self, url: str) -> str: 12 | """Download a page or raise an exception""" 13 | ... 14 | 15 | 16 | class ICache(IDownloader, abc.ABC): 17 | 18 | upstream: IDownloader 19 | 20 | @abc.abstractmethod 21 | async def get_from_cache(self, url: str) -> Optional[str]: 22 | """Return a page from the cache.""" 23 | ... 24 | 25 | @abc.abstractmethod 26 | async def put_to_cache(self, url: str, page: str) -> None: 27 | """Put a page to the cache.""" 28 | ... 29 | 30 | async def download(self, url: str) -> str: 31 | page = await self.get_from_cache(url) 32 | if page is None: 33 | page = await self.upstream.download(url) 34 | await self.put_to_cache(url, page) 35 | return page 36 | -------------------------------------------------------------------------------- /linguee_api/downloaders/memory_cache.py: -------------------------------------------------------------------------------- 1 | from async_lru import alru_cache 2 | 3 | from linguee_api.downloaders.interfaces import IDownloader 4 | 5 | 6 | class MemoryCache(IDownloader): 7 | """Memory cache. 8 | 9 | Exposes the downloader interface, but requires the upstream to work and 10 | keeps records in memory. 11 | """ 12 | 13 | def __init__(self, upstream: IDownloader, maxsize: int = 1000): 14 | self.upstream = upstream 15 | self.download = alru_cache(maxsize=maxsize)(self.download) # type: ignore 16 | 17 | async def download(self, url: str) -> str: 18 | return await self.upstream.download(url) 19 | -------------------------------------------------------------------------------- /linguee_api/downloaders/mock_downloader.py: -------------------------------------------------------------------------------- 1 | from linguee_api.downloaders.interfaces import IDownloader 2 | 3 | MESSAGE = "Hello world!" 4 | 5 | 6 | class MockDownloader(IDownloader): 7 | """ 8 | A downloader that always return "Hello world!". 9 | 10 | Helpful to test the cache layer. 
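    An illustrative usage sketch for exercising a cache layer with this class
    (FileCache comes from this package; tmp_path is any pathlib.Path you control,
    e.g. a pytest tmp_path fixture):

        cache = FileCache(cache_directory=tmp_path, upstream=MockDownloader())
        page = await cache.download("https://example.com")  # -> "Hello world!"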
11 | """ 12 | 13 | def __init__(self, message: str = MESSAGE): 14 | self.message = message 15 | 16 | async def download(self, url: str) -> str: 17 | return self.message 18 | -------------------------------------------------------------------------------- /linguee_api/downloaders/sqlite_cache.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | import aiosqlite 5 | 6 | from linguee_api.downloaders.interfaces import ICache, IDownloader 7 | 8 | 9 | class SQLiteCache(ICache): 10 | """SQLite Cache.""" 11 | 12 | def __init__(self, cache_database: pathlib.Path, upstream: IDownloader): 13 | self.cache_database = cache_database 14 | self.upstream = upstream 15 | 16 | async def get_from_cache(self, url: str) -> Optional[str]: 17 | await self._ensure_database_initialized() 18 | async with aiosqlite.connect(self.cache_database) as db: 19 | async with db.execute( 20 | "SELECT page FROM cache WHERE url = ?", [url] 21 | ) as cursor: 22 | row = await cursor.fetchone() 23 | if row is None: 24 | return None 25 | return row[0] 26 | 27 | async def put_to_cache(self, url: str, page: str) -> None: 28 | async with aiosqlite.connect(self.cache_database) as db: 29 | await db.execute("INSERT INTO cache (url, page) VALUES (?, ?)", [url, page]) 30 | await db.commit() 31 | 32 | async def _ensure_database_initialized(self): 33 | if self.cache_database.is_file(): 34 | return 35 | self.cache_database.parent.mkdir(parents=True, exist_ok=True) 36 | async with aiosqlite.connect(self.cache_database) as db: 37 | await db.execute( 38 | """CREATE TABLE IF NOT EXISTS cache ( 39 | url TEXT PRIMARY KEY, 40 | page TEXT, 41 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 42 | )""" 43 | ) 44 | await db.commit() 45 | -------------------------------------------------------------------------------- /linguee_api/linguee_client.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from urllib.parse import urlencode 3 | 4 | from loguru import logger 5 | 6 | from linguee_api.const import LANGUAGE_CODE, LANGUAGES, MAX_REDIRECTS 7 | from linguee_api.downloaders.interfaces import DownloaderError, IDownloader 8 | from linguee_api.models import ( 9 | Autocompletions, 10 | AutocompletionsOrError, 11 | Correction, 12 | FollowCorrections, 13 | NotFound, 14 | ParseError, 15 | SearchResult, 16 | ) 17 | from linguee_api.parsers import IParser 18 | 19 | 20 | class LingueeClient: 21 | """Linguee client. 
The core class of the application.""" 22 | 23 | def __init__( 24 | self, 25 | *, 26 | page_downloader: IDownloader, 27 | page_parser: IParser, 28 | max_redirects=MAX_REDIRECTS, 29 | ): 30 | self.page_downloader = page_downloader 31 | self.page_parser = page_parser 32 | self.max_redirects = max_redirects 33 | 34 | async def process_search_result( 35 | self, 36 | *, 37 | query: str, 38 | src: LANGUAGE_CODE, 39 | dst: LANGUAGE_CODE, 40 | guess_direction: bool, 41 | follow_corrections: FollowCorrections, 42 | ) -> Union[SearchResult, ParseError]: 43 | logger.info( 44 | f"Processing API request: {query=}, {src=}, {dst=}, " 45 | f"{guess_direction=}, {follow_corrections=}" 46 | ) 47 | url = get_search_url( 48 | query=query, 49 | src=src, 50 | dst=dst, 51 | guess_direction=guess_direction, 52 | ) 53 | 54 | for i in range(self.max_redirects): 55 | try: 56 | page_html = await self.page_downloader.download(url) 57 | except DownloaderError as error: 58 | logger.error(f"Error downloading URL: {error=}, {url=}") 59 | return ParseError(message=str(error)) 60 | 61 | parse_result = self.page_parser.parse_search_result( 62 | page_html, follow_corrections=follow_corrections 63 | ) 64 | if isinstance(parse_result, ParseError): 65 | logger.info(f"Parser returned parse error: {parse_result=}") 66 | return parse_result 67 | elif isinstance(parse_result, Correction): 68 | logger.info(f"Parser returned correction: {parse_result=}") 69 | url = get_search_url( 70 | query=parse_result.correction, 71 | src=src, 72 | dst=dst, 73 | guess_direction=guess_direction, 74 | ) 75 | elif isinstance(parse_result, SearchResult): 76 | logger.info( 77 | f"Parser returned search result: " 78 | f"{parse_result.query=}, " 79 | f"{len(parse_result.lemmas)=}, " 80 | f"{len(parse_result.examples)=}, " 81 | f"{len(parse_result.external_sources)=}" 82 | ) 83 | return parse_result 84 | elif isinstance(parse_result, NotFound): 85 | logger.info("Parser returned not found") 86 | return ParseError(message="Translation not found") 87 | else: 88 | logger.error(f"Unexpected API result: {parse_result=}") 89 | raise RuntimeError(f"Unexpected API result: {parse_result}") 90 | 91 | still_redirecting = f"Still redirecting after {self.max_redirects} redirects" 92 | logger.error(still_redirecting) 93 | return ParseError(message=still_redirecting) 94 | 95 | async def process_autocompletions( 96 | self, 97 | *, 98 | query: str, 99 | src_lang_code: LANGUAGE_CODE, 100 | dst_lang_code: LANGUAGE_CODE, 101 | ) -> AutocompletionsOrError: 102 | url = get_autocompletions_url( 103 | query=query, 104 | src=src_lang_code, 105 | dst=dst_lang_code, 106 | ) 107 | try: 108 | page_html = await self.page_downloader.download(url) 109 | except DownloaderError as error: 110 | return ParseError(message=str(error)) 111 | 112 | parse_result = self.page_parser.parse_autocompletions(page_html) 113 | if isinstance(parse_result, ParseError): 114 | return parse_result 115 | elif isinstance(parse_result, Autocompletions): 116 | return parse_result 117 | 118 | raise RuntimeError(f"Unexpected API result: {parse_result}") 119 | 120 | 121 | def get_search_url( 122 | *, 123 | query: str, 124 | src: LANGUAGE_CODE, 125 | dst: LANGUAGE_CODE, 126 | guess_direction: bool, 127 | ): 128 | """ 129 | Return a Linguee URL. 
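    For example (illustrative, derived from LANGUAGES and the query parameters
    built below), a Portuguese-to-English search for "bacalhau" with
    guess_direction=False yields a URL like:
    https://www.linguee.com/portuguese-english/search?query=bacalhau&ajax=1&source=PT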
130 | """ 131 | src_lang_name = LANGUAGES[src] 132 | dst_lang_name = LANGUAGES[dst] 133 | url = f"https://www.linguee.com/{src_lang_name}-{dst_lang_name}/search" 134 | query_params = { 135 | "query": query, 136 | "ajax": "1", 137 | } 138 | if not guess_direction: 139 | query_params["source"] = src.upper() 140 | return f"{url}?{urlencode(query_params)}" 141 | 142 | 143 | def get_autocompletions_url( 144 | *, 145 | query: str, 146 | src: LANGUAGE_CODE, 147 | dst: LANGUAGE_CODE, 148 | ): 149 | """Return a URL for auto-completions.""" 150 | src_lang_name = LANGUAGES[src] 151 | dst_lang_name = LANGUAGES[dst] 152 | url = f"https://www.linguee.com/{src_lang_name}-{dst_lang_name}/search" 153 | query_params = { 154 | "qe": query, 155 | } 156 | return f"{url}?{urlencode(query_params)}" 157 | -------------------------------------------------------------------------------- /linguee_api/models.py: -------------------------------------------------------------------------------- 1 | """Data classes that define the schema of the API response.""" 2 | from enum import Enum 3 | from typing import List, Optional, Union 4 | 5 | from pydantic import BaseModel, Field, validator 6 | 7 | from linguee_api.parser_utils import remove_round_brackets_and_split_by_commas 8 | 9 | 10 | class FollowCorrections(Enum): 11 | ALWAYS = "always" 12 | NEVER = "never" 13 | ON_EMPTY_TRANSLATIONS = "on_empty_translations" 14 | 15 | 16 | class AudioLink(BaseModel): 17 | """The link to the audio file along with the language variant.""" 18 | 19 | url: str = Field( 20 | example=( 21 | "https://www.linguee.com/mp3/PT_BR/f5/" 22 | "f5491d72610965dd0a287c1ab1025c0f-300.mp3" 23 | ) 24 | ) 25 | lang: str = Field(example="Brazilian Portuguese") 26 | 27 | 28 | class UsageFrequency(Enum): 29 | """Translation usage frequency. Valid values: `often` or `almost_always`.""" 30 | 31 | OFTEN = "often" 32 | ALMOST_ALWAYS = "almost_always" 33 | 34 | 35 | class SearchResult(BaseModel): 36 | """The root structure of parsed API response.""" 37 | 38 | class Lemma(BaseModel): 39 | """Information about one found word (lemma).""" 40 | 41 | class Translation(BaseModel): 42 | """One of the possible translation of the term.""" 43 | 44 | class TranslationExample(BaseModel): 45 | """A translation example.""" 46 | 47 | src: str = Field( 48 | example=( 49 | "Estou obrigado pelo contrato a " 50 | "trabalhar seis horas por dia." 51 | ) 52 | ) 53 | dst: str = Field( 54 | example="I am bound by the contract to work six hours a day." 
55 | ) 56 | 57 | featured: bool = Field(example=False) 58 | text: str = Field(example="required") 59 | pos: Optional[str] = Field(example="adjective / past participle, masculine") 60 | audio_links: Optional[List[AudioLink]] 61 | examples: Optional[List[TranslationExample]] 62 | usage_frequency: Optional[UsageFrequency] = Field( 63 | example=UsageFrequency.OFTEN 64 | ) 65 | 66 | featured: bool = Field(example=False) 67 | text: str = Field(example="obrigado") 68 | pos: Optional[str] = Field(example="interjection") 69 | forms: List[str] = Field( 70 | example=["obrigada f sl", "obrigados m pl", "obrigadas f pl"] 71 | ) 72 | grammar_info: Optional[str] = Field(example="Akk") 73 | audio_links: Optional[List[AudioLink]] 74 | translations: List[Translation] 75 | 76 | @validator("forms", pre=True, always=True) 77 | def _validate_forms(cls, v): 78 | return remove_round_brackets_and_split_by_commas(v) 79 | 80 | class Example(BaseModel): 81 | """One example.""" 82 | 83 | class Translation(BaseModel): 84 | """Translation example.""" 85 | 86 | text: str = Field(example="big thanks") 87 | pos: Optional[str] = Field(example="n [colloq.]") 88 | 89 | text: str = Field(example="muito obrigado") 90 | pos: Optional[str] = Field(example="m") 91 | audio_links: Optional[List[AudioLink]] 92 | translations: List[Translation] 93 | 94 | class ExternalSource(BaseModel): 95 | """An example of usage of the word in the context.""" 96 | 97 | src: str = Field( 98 | example=( 99 | "Parabéns e um grande obrigado a todos que ajudaram [...] " 100 | "ao sucesso desta noite!" 101 | ) 102 | ) 103 | dst: str = Field( 104 | example=( 105 | "Well done and many thanks to everyone who helped [...] " 106 | "make this evening a success!" 107 | ) 108 | ) 109 | src_url: str = Field( 110 | example="http://www.findmadeleine.com/pt/updates@page=2.html" 111 | ) 112 | dst_url: str = Field(example="http://www.findmadeleine.com/updates@page=2.html") 113 | 114 | src_lang: str = Field(example="pt") 115 | dst_lang: str = Field(example="en") 116 | query: str = Field(example="obrigado") 117 | correct_query: str = Field(example="obrigado") 118 | lemmas: List[Lemma] 119 | examples: List[Example] 120 | external_sources: List[ExternalSource] 121 | 122 | 123 | class Autocompletions(BaseModel): 124 | """The root structure of the API response for auto-completions.""" 125 | 126 | class AutocompletionItem(BaseModel): 127 | """Information about one word.""" 128 | 129 | class TranslationItem(BaseModel): 130 | text: str = Field(example="cat") 131 | pos: Optional[str] = Field(example="n") 132 | 133 | text: str = Field(example="Katze") 134 | pos: Optional[str] = Field(example="f") 135 | translations: List[TranslationItem] 136 | 137 | autocompletions: List[AutocompletionItem] 138 | 139 | 140 | class Correction(BaseModel): 141 | """ 142 | A redirect to the correct form. 143 | 144 | This response is returned by a parser, when a spelling issue is found, and 145 | a redirect to the correct form is needed. 146 | """ 147 | 148 | correction: str 149 | 150 | 151 | class NotFound(BaseModel): 152 | """ 153 | LemmaTranslation not found. 154 | 155 | The query is not recognized as a meaningful word. Nothing to translate. 156 | """ 157 | 158 | pass 159 | 160 | 161 | class ParseError(BaseModel): 162 | """Unexpected parsing error. 
Don't know what to do.""" 163 | 164 | message: str 165 | 166 | 167 | SearchResultOrError = Union[SearchResult, ParseError, Correction, NotFound] 168 | AutocompletionsOrError = Union[Autocompletions, ParseError] 169 | -------------------------------------------------------------------------------- /linguee_api/parser_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, List, Optional 3 | 4 | from xextract import Group 5 | from xextract.parsers import BaseNamedParser 6 | 7 | 8 | def concat_values(name: str, *children: BaseNamedParser): 9 | """ 10 | Concatenate values from children. 11 | 12 | Extract values from all the children, flatten and concatenate them as one string. 13 | """ 14 | return Group( 15 | name=name, 16 | children=children, 17 | quant="?", 18 | callback=_concat_values_callback, 19 | ) 20 | 21 | 22 | def _concat_values_callback(objects: Dict[str, Any]) -> str: 23 | ret = [] 24 | for value in objects.values(): 25 | if isinstance(value, list): 26 | ret.append(" ".join(str(item) for item in value)) 27 | else: 28 | ret.append(str(value)) 29 | return normalize(" ".join(ret)) 30 | 31 | 32 | def normalize(text: str) -> str: 33 | """ 34 | Replace all whitespaces in the text with a single space. 35 | 36 | For example " foo bar " is converted to "foo bar". 37 | """ 38 | return re.sub(r"\s+", " ", text).strip() 39 | 40 | 41 | def remove_round_brackets_and_split_by_commas(text: Optional[str]) -> List[str]: 42 | """Remove round brackets and split by commas.""" 43 | if not text: 44 | return [] 45 | 46 | stripped_text = text.strip().strip("()") 47 | if not stripped_text: 48 | return [] 49 | 50 | return [item.strip() for item in stripped_text.split(",") if item.strip()] 51 | 52 | 53 | def take_first_item(variants) -> Optional[str]: 54 | """Take the first item variant and normalize.""" 55 | if not variants["item"]: 56 | return None 57 | return variants["item"][0] 58 | 59 | 60 | def take_first_non_empty_item(variants) -> Optional[str]: 61 | """Take the first non-empty item variant and normalize.""" 62 | for item in variants["item"]: 63 | if item: 64 | return item 65 | return None 66 | -------------------------------------------------------------------------------- /linguee_api/parsers.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Dict, List, Optional 3 | 4 | from xextract import Group, String 5 | 6 | from linguee_api.models import ( 7 | Autocompletions, 8 | AutocompletionsOrError, 9 | Correction, 10 | FollowCorrections, 11 | NotFound, 12 | SearchResult, 13 | SearchResultOrError, 14 | UsageFrequency, 15 | ) 16 | from linguee_api.parser_utils import ( 17 | concat_values, 18 | normalize, 19 | take_first_item, 20 | take_first_non_empty_item, 21 | ) 22 | 23 | 24 | class IParser(abc.ABC): 25 | @abc.abstractmethod 26 | def parse_search_result( 27 | self, page_html: str, follow_corrections: FollowCorrections 28 | ) -> SearchResultOrError: 29 | ... 30 | 31 | @abc.abstractmethod 32 | def parse_autocompletions(self, page_html: str) -> AutocompletionsOrError: 33 | ... 34 | 35 | 36 | class XExtractParser(IParser): 37 | def parse_search_result( 38 | self, page_html: str, follow_corrections: FollowCorrections 39 | ) -> SearchResultOrError: 40 | # find correction, if asked. We'll use it on not found or empty response. 
41 | correction = None 42 | 43 | if follow_corrections in ( 44 | FollowCorrections.ALWAYS, 45 | FollowCorrections.ON_EMPTY_TRANSLATIONS, 46 | ): 47 | correction = self.find_correction(page_html) 48 | 49 | # check if the page is correction 50 | if correction and follow_corrections == FollowCorrections.ALWAYS: 51 | return Correction(correction=correction) 52 | 53 | # check if the page is a not found 54 | if self.is_not_found(page_html): 55 | if correction: 56 | return Correction(correction=correction) 57 | return NotFound() 58 | 59 | # assume it's a valid result 60 | result = self.parse_search_result_to_page(page_html) 61 | 62 | # Process ON_EMTPY case 63 | if correction and not result.lemmas: 64 | return Correction(correction=correction) 65 | 66 | return result 67 | 68 | def is_not_found(self, page_html: str) -> bool: 69 | """Return True if the page is a NOT FOUND page.""" 70 | return String(css="h1.noresults").parse(page_html) != [] 71 | 72 | def find_correction(self, page_html: str) -> Optional[str]: 73 | """Find the correction for a NOT FOUND page.""" 74 | corrections = String(css="span.corrected").parse(page_html) 75 | if corrections: 76 | return corrections[0] 77 | return None 78 | 79 | def parse_search_result_to_page(self, page_html: str) -> SearchResult: 80 | parsed_result = self.parse_search_result_to_dict(page_html) 81 | return SearchResult(**parsed_result) 82 | 83 | def parse_search_result_to_dict(self, page_html: str) -> dict: 84 | return search_result_schema.parse(page_html) 85 | 86 | def parse_autocompletions(self, page_html: str) -> AutocompletionsOrError: 87 | parsed_result = self.parse_autocompletions_to_dict(page_html) 88 | return Autocompletions(**parsed_result) 89 | 90 | def parse_autocompletions_to_dict(self, page_html: str) -> dict: 91 | return autocompletions_schema.parse(page_html) 92 | 93 | 94 | def is_featured(classname): 95 | return "featured" in classname 96 | 97 | 98 | def normalize_example(text): 99 | """ 100 | Normalize the text in the example. 101 | 102 | Same as normalize(), but remove the last two words, which are the links to the 103 | source website. 104 | """ 105 | text = normalize(text) 106 | text = " ".join(text.split()[:-2]) 107 | return text 108 | 109 | 110 | def parse_audio_links(text: Optional[str]) -> List[Dict[str, str]]: 111 | if not text: 112 | return [] 113 | 114 | chunks = [chunk.strip('");') for chunk in text.split(",")] 115 | if not chunks: 116 | return [] 117 | 118 | ret = [] 119 | for i in range(1, len(chunks), 2): 120 | url_part = chunks[i] 121 | lang = chunks[i + 1] 122 | url = f"https://www.linguee.com/mp3/{url_part}.mp3" 123 | ret.append({"url": url, "lang": lang}) 124 | return ret 125 | 126 | 127 | def parse_usage_frequency(text: Optional[str]) -> Optional[UsageFrequency]: 128 | if not text: 129 | return None 130 | chunks = set(text.strip().split()) 131 | if "usedveryoften" in chunks: 132 | return UsageFrequency.OFTEN 133 | if "usedalmostalways" in chunks: 134 | return UsageFrequency.ALMOST_ALWAYS 135 | return None 136 | 137 | 138 | def normalize_lemma_text(children): 139 | return " ".join(children["item"]) 140 | 141 | 142 | lemma_schema = [ 143 | String( 144 | name="featured", 145 | xpath="self::*", 146 | attr="class", 147 | quant=1, 148 | callback=is_featured, 149 | ), 150 | # We parse text as a group, because the lemma may have one or more elements, all 151 | # of them represented with "a.dictLink". 
In most cases it's just a single element, 152 | # but if there are more, we need to collect them all and merge them together in the 153 | # group callback normalize_lemma_text() 154 | Group( 155 | name="text", 156 | quant=1, 157 | css="span.tag_lemma", 158 | callback=normalize_lemma_text, 159 | children=[ 160 | String( 161 | name="item", 162 | css="a.dictLink", 163 | quant="+", 164 | callback=normalize, 165 | ), 166 | ], 167 | ), 168 | concat_values( 169 | "pos", 170 | String( 171 | name="pos", 172 | css="span.tag_lemma > span.tag_wordtype, span.tag_lemma > span.tag_type", 173 | quant="*", 174 | ), 175 | ), 176 | # Return a single string that is converted to a list in the model's validator. 177 | concat_values( 178 | "forms", 179 | String( 180 | name="forms", 181 | css="span.tag_forms", 182 | quant="*", 183 | attr="_all_text", 184 | ), 185 | ), 186 | # We parse grammar_info as a group because it may have zero or more elements, 187 | # and we care about the first record only. 188 | Group( 189 | name="grammar_info", 190 | quant=1, 191 | callback=take_first_item, 192 | children=[ 193 | String( 194 | name="item", 195 | quant="*", 196 | callback=normalize, 197 | css=( 198 | "span.tag_lemma > span.tag_lemma_context > " 199 | "span.placeholder > span.grammar_info" 200 | ), 201 | ) 202 | ], 203 | ), 204 | String( 205 | name="audio_links", 206 | quant="?", 207 | css="span.tag_lemma > a.audio", 208 | attr="onclick", 209 | callback=parse_audio_links, 210 | ), 211 | Group( 212 | name="translations", 213 | css="div.translation_lines div.translation", 214 | quant="+", 215 | children=[ 216 | String( 217 | name="featured", 218 | xpath="self::*", 219 | attr="class", 220 | quant=1, 221 | callback=is_featured, 222 | ), 223 | String( 224 | name="text", 225 | css="a.dictLink", 226 | quant=1, 227 | callback=normalize, 228 | ), 229 | concat_values( 230 | "pos", 231 | String( 232 | name="pos", 233 | css="span.tag_type", 234 | quant="*", 235 | attr="title", 236 | ), 237 | ), 238 | String( 239 | name="audio_links", 240 | quant="?", 241 | css="a.audio", 242 | attr="onclick", 243 | callback=parse_audio_links, 244 | ), 245 | Group( 246 | name="usage_frequency", 247 | quant=1, 248 | callback=take_first_non_empty_item, 249 | children=[ 250 | String( 251 | name="item", 252 | quant="*", 253 | css="span.tag_c", 254 | attr="class", 255 | callback=parse_usage_frequency, 256 | ), 257 | ], 258 | ), 259 | Group( 260 | name="examples", 261 | css=".example_lines > .example", 262 | quant="*", 263 | children=[ 264 | String(name="src", css=".tag_s", quant=1, callback=normalize), 265 | String(name="dst", css=".tag_t", quant=1, callback=normalize), 266 | ], 267 | ), 268 | ], 269 | ), 270 | ] 271 | 272 | source_url_schema = [ 273 | String( 274 | name="src_url", 275 | css="div.source_url > a", 276 | attr="href", 277 | quant="?", 278 | ), 279 | String( 280 | name="src_url_text", 281 | css="div.source_url", 282 | quant="?", 283 | ), 284 | ] 285 | 286 | 287 | def normalize_source_url(content): 288 | if content["src_url"]: 289 | return content["src_url"] 290 | if content["src_url_text"]: 291 | return f"http://{content['src_url_text']}" 292 | return None 293 | 294 | 295 | search_result_schema = Group( 296 | quant=1, 297 | children=[ 298 | String(name="src_lang", css="div#data", attr="data-lang1", quant=1), 299 | String(name="dst_lang", css="div#data", attr="data-lang2", quant=1), 300 | String(name="query", css="div#data", attr="data-query", quant=1), 301 | String( 302 | name="correct_query", 303 | css="div#data", 304 |
attr="data-correctspellingofquery", 305 | quant=1, 306 | ), 307 | Group( 308 | quant="*", 309 | css="div.exact > div.lemma", 310 | name="lemmas", 311 | children=lemma_schema, 312 | ), 313 | Group( 314 | quant="*", 315 | css="div.example_lines div.lemma", 316 | name="examples", 317 | children=lemma_schema, 318 | ), 319 | Group( 320 | quant="*", 321 | css="table.result_table > tbody > tr", 322 | name="external_sources", 323 | children=[ 324 | String( 325 | name="src", 326 | css="td.left > div.wrap", 327 | quant=1, 328 | attr="_all_text", 329 | callback=normalize_example, 330 | ), 331 | String( 332 | name="dst", 333 | css="td.right2 > div.wrap", 334 | quant=1, 335 | attr="_all_text", 336 | callback=normalize_example, 337 | ), 338 | Group( 339 | name="src_url", 340 | quant=1, 341 | css="td.left", 342 | children=source_url_schema, 343 | callback=normalize_source_url, 344 | ), 345 | Group( 346 | name="dst_url", 347 | quant=1, 348 | css="td.right2", 349 | children=source_url_schema, 350 | callback=normalize_source_url, 351 | ), 352 | ], 353 | ), 354 | ], 355 | ) 356 | 357 | 358 | autocompletions_schema = Group( 359 | quant=1, 360 | children=[ 361 | Group( 362 | quant="*", 363 | css="div.autocompletion_item", 364 | name="autocompletions", 365 | children=[ 366 | String( 367 | name="text", 368 | css="div.main_row > div.main_item", 369 | quant=1, 370 | callback=normalize, 371 | ), 372 | concat_values( 373 | "pos", 374 | String( 375 | name="pos", 376 | css="div.main_row > div.main_wordtype", 377 | quant="*", 378 | ), 379 | ), 380 | Group( 381 | quant="+", 382 | name="translations", 383 | css="div.translation_row > div > div.translation_item", 384 | children=[ 385 | String( 386 | name="text", xpath="self::*", quant=1, callback=normalize 387 | ), 388 | concat_values( 389 | "pos", 390 | String( 391 | name="pos", 392 | css="div.translation_item > div.wordtype", 393 | quant="*", 394 | ), 395 | ), 396 | ], 397 | ), 398 | ], 399 | ) 400 | ], 401 | ) 402 | -------------------------------------------------------------------------------- /linguee_api/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | 4 | def import_string(import_name: str): 5 | """ 6 | Import an object based on the import string. 7 | 8 | Separate module name from the object name with ":". For example, 9 | "linguee_api.downloaders:HTTPXDownloader" 10 | """ 11 | if ":" not in import_name: 12 | raise RuntimeError( 13 | f'{import_name} must separate module from object with ":". ' 14 | f'For example, "linguee_api.downloaders:HTTPXDownloader"' 15 | ) 16 | module_name, object_name = import_name.rsplit(":", 1) 17 | mod = importlib.import_module(module_name) 18 | return getattr(mod, object_name) 19 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | follow_imports = silent 4 | scripts_are_modules = true 5 | namespace_packages = true 6 | no_implicit_optional = true 7 | 8 | # We had to ignore missing imports because of third-party libraries installed 9 | # inside the virtualenv, and apparently there's no easy way for mypy to respect 10 | # packages inside the virtualenv.
That's the option pre-commit-config runs with 11 | # by default, but we add it here as well for the sake of uniformity of the 12 | # output 13 | ignore_missing_imports = true 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "linguee-api" 3 | version = "2.6.3" 4 | description = "Linguee API" 5 | readme = "README.md" 6 | homepage = "https://github.com/imankulov/linguee-api" 7 | repository = "https://github.com/imankulov/linguee-api" 8 | authors = ["Roman Imankulov "] 9 | license = "MIT" 10 | classifiers = [ 11 | "Intended Audience :: Developers", 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "Programming Language :: Python :: 3.12", 18 | ] 19 | include = [ 20 | "CHANGELOG.md", 21 | "docs/linguee-api.png", 22 | ] 23 | 24 | packages = [ 25 | { include = "linguee_api" }, 26 | { include = "tests", format = "sdist" }, 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.8" 31 | fastapi = "^0.109.2" 32 | pydantic = "^1" 33 | xextract = "^0.1.8" 34 | httpx = "^0.24.1" 35 | uvicorn = "^0.22.0" 36 | sentry-sdk = "^1.24.0" 37 | python-dotenv = "^1.0.0" 38 | loguru = "^0.7.0" 39 | aiosqlite = "^0.19.0" 40 | async-lru = "^2.0.2" 41 | lxml = "^4.9.3" 42 | 43 | [tool.poetry.group.dev.dependencies] 44 | pytest = "^6.1.2" 45 | pytest-xdist = "^2.2.1" 46 | black = "^24.4.1" 47 | flake8 = "^3.8.4" 48 | coverage = "^5.4" 49 | import-linter = "^1.2.1" 50 | pytest-asyncio = "^0.14.0" 51 | ipython = "^8.10.0" 52 | click = "^8.1.7" 53 | asgiref = "^3.3.4" 54 | tox-poetry-installer = {extras = ["poetry"], version = "^0.10.2"} 55 | bump2version = "^1.0.1" 56 | 57 | [tool.coverage.run] 58 | source = ["tests", "linguee_api"] 59 | 60 | [build-system] 61 | requires = ["poetry-core>=1.0.0"] 62 | build-backend = "poetry.core.masonry.api" 63 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.5 2 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import BaseSettings, Field 3 | 4 | from linguee_api.config import settings 5 | from linguee_api.const import PROJECT_ROOT 6 | from linguee_api.downloaders.error_downloader import ErrorDownloader 7 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 8 | from linguee_api.downloaders.interfaces import IDownloader 9 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 10 | from linguee_api.linguee_client import LingueeClient 11 | from linguee_api.parsers import XExtractParser 12 | 13 | 14 | class PytestSettings(BaseSettings): 15 | """Specific settings for pytest.""" 16 | 17 | offline: bool = Field(default=False, description="Run tests offline") 18 | 19 | @property 20 | def downloader(self) -> IDownloader: 21 | return ErrorDownloader() if self.offline else HTTPXDownloader() 22 | 23 | class Config: 24 | env_prefix = "pytest_" 25 | env_file = (PROJECT_ROOT / ".env").as_posix() 26 | 27 | 28 | pytest_settings = PytestSettings() 29 | 30 | 31 | @pytest.fixture 32 | def examples_downloader() -> 
IDownloader: 33 | return SQLiteCache( 34 | cache_database=settings.cache_database, upstream=pytest_settings.downloader 35 | ) 36 | 37 | 38 | @pytest.fixture 39 | def linguee_client(examples_downloader) -> LingueeClient: 40 | return LingueeClient( 41 | page_downloader=examples_downloader, page_parser=XExtractParser() 42 | ) 43 | -------------------------------------------------------------------------------- /tests/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imankulov/linguee-api/9844c8247b07a2771b1555f09c8bbc6ea83f08d7/tests/parsers/__init__.py -------------------------------------------------------------------------------- /tests/parsers/test_autocompletions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.downloaders.interfaces import IDownloader 4 | from linguee_api.linguee_client import get_autocompletions_url 5 | from linguee_api.models import Autocompletions 6 | from linguee_api.parsers import XExtractParser 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_parse_autocompletions_should_return_autocompletions( 11 | examples_downloader: IDownloader, 12 | ): 13 | url = get_autocompletions_url(query="katz", src="de", dst="en") 14 | page = await examples_downloader.download(url) 15 | parser = XExtractParser() 16 | parse_result = parser.parse_autocompletions(page) 17 | 18 | a = Autocompletions.AutocompletionItem 19 | t = Autocompletions.AutocompletionItem.TranslationItem 20 | first_item = a( 21 | text="Katze", 22 | pos="f", 23 | translations=[ 24 | t(text="cat", pos="n"), 25 | t(text="feline", pos="n"), 26 | t(text="crab", pos="n"), 27 | ], 28 | ) 29 | assert parse_result.autocompletions[0] == first_item 30 | -------------------------------------------------------------------------------- /tests/parsers/test_search_result.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pytest 4 | 5 | from linguee_api.const import LANGUAGE_CODE 6 | from linguee_api.downloaders.interfaces import IDownloader 7 | from linguee_api.linguee_client import get_search_url 8 | from linguee_api.models import UsageFrequency 9 | from linguee_api.parsers import XExtractParser 10 | 11 | 12 | @pytest.mark.parametrize( 13 | ["query", "src", "dst", "is_not_found"], 14 | [ 15 | ("constibado", "pt", "en", True), 16 | ("Möglichkei", "de", "en", False), # At least, there are examples 17 | ("esgotar", "pt", "en", False), 18 | ("not bad", "en", "pt", False), 19 | ("xxxxzzzz", "pt", "en", True), 20 | ], 21 | ) 22 | @pytest.mark.asyncio 23 | async def test_parser_should_detect_not_found( 24 | examples_downloader: IDownloader, 25 | query: str, 26 | src: LANGUAGE_CODE, 27 | dst: LANGUAGE_CODE, 28 | is_not_found: bool, 29 | ): 30 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 31 | page = await examples_downloader.download(url) 32 | assert XExtractParser().is_not_found(page) == is_not_found 33 | 34 | 35 | @pytest.mark.asyncio 36 | async def test_parser_should_find_translation_examples( 37 | examples_downloader: IDownloader, 38 | ): 39 | url = get_search_url(query="obrigado", src="pt", dst="en", guess_direction=False) 40 | page_html = await examples_downloader.download(url) 41 | page = XExtractParser().parse_search_result_to_page(page_html) 42 | examples_of_1st_translation = page.lemmas[0].translations[0].examples 43 | assert examples_of_1st_translation 
is not None 44 | assert len(examples_of_1st_translation) == 1 45 | assert examples_of_1st_translation[0].src == ( 46 | "Obrigado por sua participação em nossa pesquisa." 47 | ) 48 | assert examples_of_1st_translation[0].dst == ( 49 | "Thank you for your participation in our survey." 50 | ) 51 | 52 | 53 | @pytest.mark.parametrize( 54 | ["query", "src", "dst", "correction"], 55 | [ 56 | ("constibado", "pt", "en", "constipado"), 57 | ( 58 | "Möglichkei", 59 | "de", 60 | "en", 61 | "möglichkeit", 62 | ), # Despite having examples, Linguee provides a correction. 63 | ("esgotar", "pt", "en", None), 64 | ("xxxxzzzz", "pt", "en", None), 65 | ], 66 | ) 67 | @pytest.mark.asyncio 68 | async def test_parser_should_find_correction( 69 | examples_downloader: IDownloader, 70 | query: str, 71 | src: LANGUAGE_CODE, 72 | dst: LANGUAGE_CODE, 73 | correction: Optional[str], 74 | ): 75 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 76 | page = await examples_downloader.download(url) 77 | assert XExtractParser().find_correction(page) == correction 78 | 79 | 80 | @pytest.mark.parametrize( 81 | ["query", "src", "dst"], 82 | [ 83 | ("esgotar", "pt", "en"), 84 | ( 85 | "Möglichkei", 86 | "de", 87 | "en", 88 | ), # The page only has external sources 89 | ("obrigado", "pt", "en"), 90 | ("not bad", "en", "pt"), 91 | ("einfach", "de", "en"), 92 | ("Tisch", "de", "en"), 93 | ("wünschen", "de", "en"), 94 | ("envisage", "en", "zh"), 95 | ("envisage", "en", "sv"), 96 | ("über", "de", "en"), 97 | ], 98 | ) 99 | @pytest.mark.asyncio 100 | async def test_parse_to_dict_should_return_parseable_result( 101 | examples_downloader: IDownloader, 102 | query: str, 103 | src: LANGUAGE_CODE, 104 | dst: LANGUAGE_CODE, 105 | ): 106 | url = get_search_url(query=query, src=src, dst=dst, guess_direction=False) 107 | page = await examples_downloader.download(url) 108 | XExtractParser().parse_search_result_to_page(page) 109 | 110 | 111 | @pytest.mark.asyncio 112 | async def test_parser_should_find_grammar_info_in_german_verbs( 113 | examples_downloader: IDownloader, 114 | ): 115 | url = get_search_url(query="bringen", src="de", dst="en", guess_direction=False) 116 | page_html = await examples_downloader.download(url) 117 | page = XExtractParser().parse_search_result_to_page(page_html) 118 | assert page.lemmas[0].grammar_info == "Akk" 119 | 120 | 121 | @pytest.mark.asyncio 122 | async def test_parser_should_process_examples_without_links( 123 | examples_downloader: IDownloader, 124 | ): 125 | url = get_search_url(query="einfach", src="de", dst="en", guess_direction=False) 126 | page_html = await examples_downloader.download(url) 127 | page = XExtractParser().parse_search_result_to_page(page_html) 128 | sources = page.external_sources 129 | assert all([s.src_url.startswith("http") for s in sources]) 130 | assert all([s.dst_url.startswith("http") for s in sources]) 131 | 132 | 133 | @pytest.mark.asyncio 134 | async def test_parser_should_find_almost_always_usage_frequency( 135 | examples_downloader: IDownloader, 136 | ): 137 | url = get_search_url(query="bacalhau", src="pt", dst="en", guess_direction=False) 138 | page_html = await examples_downloader.download(url) 139 | page = XExtractParser().parse_search_result_to_page(page_html) 140 | assert page.lemmas[0].translations[1].usage_frequency is None 141 | assert ( 142 | page.lemmas[0].translations[0].usage_frequency == UsageFrequency.ALMOST_ALWAYS 143 | ) 144 | 145 | 146 | @pytest.mark.asyncio 147 | async def test_parser_should_find_often_usage_frequency( 148 | 
examples_downloader: IDownloader, 149 | ): 150 | url = get_search_url(query="placa", src="pt", dst="en", guess_direction=False) 151 | page_html = await examples_downloader.download(url) 152 | page = XExtractParser().parse_search_result_to_page(page_html) 153 | assert page.lemmas[0].translations[1].usage_frequency is None 154 | assert page.lemmas[0].translations[0].usage_frequency == UsageFrequency.OFTEN 155 | 156 | 157 | @pytest.mark.asyncio 158 | async def test_parser_should_find_lemma_forms( 159 | examples_downloader: IDownloader, 160 | ): 161 | url = get_search_url(query="obrigado", src="pt", dst="en", guess_direction=False) 162 | page_html = await examples_downloader.download(url) 163 | page = XExtractParser().parse_search_result_to_page(page_html) 164 | assert page.lemmas[0].forms == [] 165 | assert page.lemmas[1].forms == ["obrigada f sl", "obrigados m pl", "obrigadas f pl"] 166 | 167 | 168 | @pytest.mark.asyncio 169 | async def test_parser_should_find_lemma_forms_for_verbs( 170 | examples_downloader: IDownloader, 171 | ): 172 | url = get_search_url(query="shrink", src="en", dst="pt", guess_direction=False) 173 | page_html = await examples_downloader.download(url) 174 | page = XExtractParser().parse_search_result_to_page(page_html) 175 | assert page.lemmas[0].forms == ["shrank or shrunk", "shrunk"] 176 | -------------------------------------------------------------------------------- /tests/test_api_client.py: -------------------------------------------------------------------------------- 1 | from linguee_api.linguee_client import get_search_url 2 | 3 | 4 | def test_get_linguee_url_should_return_valid_url(): 5 | url = get_search_url(query="bacalhau", src="pt", dst="en", guess_direction=False) 6 | assert url == ( 7 | "https://www.linguee.com/portuguese-english/search?" 8 | "query=bacalhau&ajax=1&source=PT" 9 | ) 10 | -------------------------------------------------------------------------------- /tests/test_downloaders.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | 6 | from linguee_api.downloaders.httpx_downloader import HTTPXDownloader 7 | from linguee_api.downloaders.interfaces import DownloaderError 8 | 9 | 10 | @pytest.mark.asyncio 11 | async def test_httpx_downloader_should_download_a_page(): 12 | url = ( 13 | "https://www.linguee.com/portuguese-english/search?" 
14 | "query=bacalhau&ajax=1&source=PT" 15 | ) 16 | content = await HTTPXDownloader().download(url) 17 | assert "bacalhau" in content 18 | 19 | 20 | @pytest.mark.asyncio 21 | async def test_httpx_downloader_should_raise_exception_on_invalid_domain_name(): 22 | random_sequence = "".join(random.choices(string.ascii_lowercase, k=30)) 23 | invalid_url = f"https://{random_sequence}.com" 24 | with pytest.raises(DownloaderError): 25 | await HTTPXDownloader().download(invalid_url) 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_httpx_downloader_should_raise_exception_on_non200_code(): 30 | invalid_url = "https://httpbin.org/status/403" 31 | with pytest.raises(DownloaderError): 32 | await HTTPXDownloader().download(invalid_url) 33 | -------------------------------------------------------------------------------- /tests/test_file_cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from linguee_api.downloaders.file_cache import FileCache 6 | from linguee_api.downloaders.mock_downloader import MockDownloader 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_file_cache_should_cache_a_value(tmp_path): 11 | # Cache value 12 | cache = FileCache( 13 | cache_directory=Path(tmp_path), upstream=MockDownloader(message="foo") 14 | ) 15 | await cache.download("https://example.com") 16 | 17 | # Change upstream and try to get the value again 18 | cache.upstream = MockDownloader(message="bar") 19 | result2 = await cache.download("https://example.com") 20 | 21 | # The value should be the same 22 | assert result2 == "foo" 23 | -------------------------------------------------------------------------------- /tests/test_linguee_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.const import LANGUAGE_CODE, LANGUAGES 4 | from linguee_api.linguee_client import LingueeClient 5 | from linguee_api.models import FollowCorrections, ParseError, SearchResult 6 | 7 | 8 | @pytest.mark.asyncio 9 | @pytest.mark.parametrize( 10 | "follow_corrections", 11 | [ 12 | FollowCorrections.ALWAYS, 13 | FollowCorrections.ON_EMPTY_TRANSLATIONS, 14 | ], 15 | ) 16 | async def test_linguee_client_should_redirect_on_not_found( 17 | linguee_client: LingueeClient, 18 | follow_corrections, 19 | ): 20 | search_result = await linguee_client.process_search_result( 21 | query="constibado", 22 | src="pt", 23 | dst="en", 24 | guess_direction=False, 25 | follow_corrections=follow_corrections, 26 | ) 27 | assert search_result.query == "constipado" 28 | 29 | 30 | @pytest.mark.asyncio 31 | async def test_linguee_client_should_not_redirect_if_not_asked( 32 | linguee_client: LingueeClient, 33 | ): 34 | search_result = await linguee_client.process_search_result( 35 | query="constibado", 36 | src="pt", 37 | dst="en", 38 | guess_direction=False, 39 | follow_corrections=FollowCorrections.NEVER, 40 | ) 41 | assert isinstance(search_result, ParseError) 42 | assert search_result.message == "Translation not found" 43 | 44 | 45 | @pytest.mark.asyncio 46 | @pytest.mark.parametrize("lang", list(LANGUAGES.keys())) 47 | async def test_linguee_client_should_process_test_requests( 48 | linguee_client: LingueeClient, 49 | lang: LANGUAGE_CODE, 50 | ): 51 | search_result = await linguee_client.process_search_result( 52 | query="test", 53 | src="en", 54 | dst=lang, 55 | guess_direction=False, 56 | follow_corrections=FollowCorrections.ALWAYS, 57 | ) 58 | assert isinstance(search_result, 
SearchResult) 59 | -------------------------------------------------------------------------------- /tests/test_memory_cache.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from linguee_api.downloaders.memory_cache import MemoryCache 4 | from linguee_api.downloaders.mock_downloader import MockDownloader 5 | 6 | 7 | @pytest.mark.asyncio 8 | async def test_memory_cache_should_cache_a_value(): 9 | # Cache value 10 | cache = MemoryCache(upstream=MockDownloader(message="foo")) 11 | await cache.download("https://example.com") 12 | 13 | # Change upstream and try to get the value again 14 | cache.upstream = MockDownloader(message="bar") 15 | result2 = await cache.download("https://example.com") 16 | 17 | # The value should be the same 18 | assert result2 == "foo" 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_memory_cache_should_evict_cache_on_overflow(): 23 | # Cache value 24 | cache = MemoryCache(upstream=MockDownloader(message="foo"), maxsize=1) 25 | await cache.download("https://example.com") 26 | await cache.download("https://example2.com") 27 | 28 | # Change upstream and try to get the value again 29 | cache.upstream = MockDownloader(message="bar") 30 | result2 = await cache.download("https://example.com") 31 | 32 | # The value should be the new one 33 | assert result2 == "bar" 34 | -------------------------------------------------------------------------------- /tests/test_sqlite_cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from linguee_api.downloaders.mock_downloader import MockDownloader 6 | from linguee_api.downloaders.sqlite_cache import SQLiteCache 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_sqlite_cache_should_cache_a_value(tmp_path): 11 | # Cache value 12 | cache_database = Path(tmp_path) / "cache.db" 13 | cache = SQLiteCache( 14 | cache_database=cache_database, upstream=MockDownloader(message="foo") 15 | ) 16 | await cache.download("https://example.com") 17 | 18 | # Change upstream and try to get the value again 19 | cache.upstream = MockDownloader(message="bar") 20 | result2 = await cache.download("https://example.com") 21 | 22 | # The value should be the same 23 | assert result2 == "foo" 24 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38, py39, py310, py311, py312 3 | 4 | isolated_build = true 5 | 6 | [testenv] 7 | locked_deps = 8 | pytest 9 | pytest-xdist 10 | pytest-asyncio 11 | commands = 12 | pytest -s tests 13 | --------------------------------------------------------------------------------
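For reference, the downloader, parser, and client classes dumped above can be wired together outside the test suite in the same way tests/conftest.py wires them. The snippet below is a minimal sketch, not a file from the repository: it assumes the package and its dependencies are installed (for example with poetry install), and the cache path and example query are illustrative placeholders.

import asyncio
from pathlib import Path

from linguee_api.downloaders.httpx_downloader import HTTPXDownloader
from linguee_api.downloaders.sqlite_cache import SQLiteCache
from linguee_api.linguee_client import LingueeClient
from linguee_api.models import FollowCorrections
from linguee_api.parsers import XExtractParser


async def main() -> None:
    # Cache downloaded Linguee pages in SQLite; the path here is an illustrative choice.
    downloader = SQLiteCache(
        cache_database=Path(".cache/cache.sqlite3"),
        upstream=HTTPXDownloader(),
    )
    client = LingueeClient(page_downloader=downloader, page_parser=XExtractParser())
    result = await client.process_search_result(
        query="obrigado",
        src="pt",
        dst="en",
        guess_direction=False,
        follow_corrections=FollowCorrections.ALWAYS,
    )
    # Depending on the page, the result is one of the models defined in
    # linguee_api/models.py: SearchResult, Correction, NotFound, or ParseError.
    print(result)


if __name__ == "__main__":
    asyncio.run(main())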