├── .env ├── .flake8 ├── .github └── workflows │ ├── dev.yml │ └── docker-image.yml ├── .gitignore ├── .vscode └── settings.json ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── data └── test.jsonl ├── eval.py ├── main.py ├── notes.md ├── prompts ├── address.txt ├── detailed_intent.txt └── intent.txt ├── pyproject.toml ├── requirements.txt ├── run_address.sh ├── run_intent.sh ├── src ├── __init__.py ├── config.py ├── converter.py ├── gpt │ ├── __init__.py │ └── network_manager.py ├── lm │ ├── __init__.py │ └── tokenizer.py ├── logger.py └── models.py ├── task-definition.json └── tests ├── __init__.py └── test_ai.py /.env: -------------------------------------------------------------------------------- 1 | address_prompt_file="prompts/address.txt" 2 | detailed_intent_prompt_file="prompts/detailed_intent.txt" 3 | max_tokens=384 4 | batch_size=20 5 | geo_location=false 6 | engine="afet-org" -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 88 4 | ignore = 5 | E203, # Black default for colon/whitespace handling... 6 | E501, # Line length issues... 
black takes care of code, but comments/markdown are not handled 7 | W503, # Line break before binary operatorflake8(W503) 8 | E722 9 | per-file-ignores = 10 | */__init__.py: F401 11 | -------------------------------------------------------------------------------- /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | name: Test DepremOpenAiApi 2 | 3 | on: 4 | push: 5 | branches-ignore: ["main"] 6 | 7 | env: 8 | OPENAI_API_BASE_POOL: ${{ secrets.OPENAI_API_BASE_POOL }} 9 | OPENAI_API_KEY_POOL: ${{ secrets.OPENAI_API_KEY_POOL }} 10 | NEEDS_RESOLVER_API_KEY: ${{ secrets.NEEDS_RESOLVER_API_KEY }} 11 | 12 | jobs: 13 | deploy: 14 | name: test_on_branch 15 | runs-on: ubuntu-latest 16 | environment: prod 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | - name: Build and test 23 | id: build-image 24 | env: 25 | IMAGE_TAG: ${{ github.sha }} 26 | run: | 27 | docker build -t deprem-openai-apis-test:latest . 28 | docker run -e NEEDS_RESOLVER_API_KEY=${NEEDS_RESOLVER_API_KEY} --rm -t deprem-openai-apis-test:latest sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose -s" 29 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Deploy DepremOpenAiApi 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | env: 8 | OPENAI_API_BASE_POOL: ${{ secrets.OPENAI_API_BASE_POOL }} 9 | OPENAI_API_KEY_POOL: ${{ secrets.OPENAI_API_KEY_POOL }} 10 | NEEDS_RESOLVER_API_KEY: ${{ secrets.NEEDS_RESOLVER_API_KEY }} 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: write 18 | id-token: write 19 | environment: prod 20 | 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials 26 | uses: 
aws-actions/configure-aws-credentials@v1 27 | with: 28 | aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} 29 | aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} 30 | aws-region: eu-central-1 31 | 32 | - name: Login to Amazon ECR 33 | id: login-ecr 34 | uses: aws-actions/amazon-ecr-login@v1 35 | 36 | - name: Build, tag, and push image to Amazon ECR 37 | id: build-image 38 | env: 39 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 40 | ECR_REPOSITORY: base-ecr 41 | IMAGE_TAG: ${{ github.sha }} 42 | run: | 43 | # Build a docker container and 44 | # push it to ECR so that it can 45 | # be deployed to ECS. 46 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 47 | docker run -e NEEDS_RESOLVER_API_KEY=${NEEDS_RESOLVER_API_KEY} --rm -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose -s" 48 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 49 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 50 | 51 | - name: Fill in the new image ID in the Amazon ECS task definition 52 | id: task-def 53 | uses: aws-actions/amazon-ecs-render-task-definition@v1 54 | with: 55 | task-definition: task-definition.json 56 | container-name: container-name 57 | image: ${{ steps.build-image.outputs.image }} 58 | 59 | - name: Deploy Amazon ECS task definition 60 | uses: aws-actions/amazon-ecs-deploy-task-definition@v1 61 | with: 62 | task-definition: ${{ steps.task-def.outputs.task-definition }} 63 | service: deprem-openai-api-service 64 | cluster: base-cluster 65 | wait-for-service-stability: false 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | __pycache__/ 3 | 4 | # C extensions 5 | *.so 6 | 7 | # Distribution / packaging 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | 
dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | share/python-wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | cover/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | db.sqlite3-journal 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | .pybuilder/ 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 84 | __pypackages__/ 85 | 86 | # Celery stuff 87 | celerybeat-schedule 88 | celerybeat.pid 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | .dmypy.json 115 | dmypy.json 116 | 117 | # Pyre type checker 118 | .pyre/ 119 | 120 | # pytype static type analyzer 121 | .pytype/ 122 | 123 | # Cython debug symbols 124 | cython_debug/ 125 | 126 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.enabled": true, 5 | "editor.formatOnSave": true, 6 | "python.linting.lintOnSave": true, 7 | "python.analysis.extraPaths": [ 8 | "${workspaceFolder}", 9 | ], 10 | "python.formatting.provider": "black", 11 | "python.formatting.blackArgs": [ 12 | "--line-length=88" 13 | ], 14 | "python.sortImports.args": [ 15 | "--profile", 16 | "black" 17 | ], 18 | "[python]": { 19 | "editor.codeActionsOnSave": { 20 | "source.organizeImports": true 21 | } 22 | }, 23 | "python.linting.pylintEnabled": false 24 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | RUN mkdir -p /usr/src/app 4 | WORKDIR /usr/src/app 5 | 6 | COPY requirements.txt /usr/src/app/requirements.txt 7 | 8 | RUN pip install --no-cache-dir --upgrade -r requirements.txt 9 | 10 | ENV PYTHONUNBUFFERED 0 11 | EXPOSE 80 12 | 13 | COPY . 
/usr/src/app 14 | 15 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] 16 | 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 açık-kaynak.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | local-test: 2 | docker build -t deprem-openai-apis-test:latest . 
3 | docker run --rm -t deprem-openai-apis-test:latest sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose" 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Address and Intent Extractor 2 | 3 | > Please use the active remote: https://github.com/acikkaynak/deprem_openai_apis 4 | 5 | > Prompts in this repo are placeholders for privacy reasons. Please contact us if you'd like to obtain them. 6 | 7 | The code can extract addresses from raw Turkiye earthquake tweets and classify them for intent via OpenAI GPT Codex API by using few-shot prompting. 8 | 9 | # How To Run 10 | 11 | Currently the input format is `.jsonl` where each line has a json string with "Tweet" field, see an example input file here [data/test.jsonl](./data/test.jsonl). 12 | 13 | Export two environment variables as comma separated keys: 14 | 15 | ```SHELL 16 | export OPENAI_API_KEY_POOL=key1,key2,key3... 17 | export GEO_KEY_POOL=key1,key2 18 | ``` 19 | 20 | Optionally, for the afet org API base URLs: 21 | ```SHELL 22 | export OPENAI_API_BASE_POOL= 23 | ``` 24 | 25 | To extract the geo location address information: 26 | - Specify your paths in [run_address.sh](./run_address.sh), then run the script 27 | ```SHELL 28 | ./run_address.sh 29 | ``` 30 | 31 | To extract the intent information: 32 | - Specify your paths in [run_intent.sh](./run_intent.sh), then run the script. 
33 | ```SHELL 34 | ./run_intent.sh 35 | ``` 36 | 37 | # To Run FastAPI Backend 38 | 39 | - To run locally `uvicorn main:app --reload` 40 | 41 | 42 | 43 | Running github actions 44 | -------------------------------------------------------------------------------- /data/test.jsonl: -------------------------------------------------------------------------------- 1 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 2 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 3 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", 
"Kullan\u0131c\u0131 ad\u0131": "circcassian"} 4 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 5 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 6 | 7 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sklearn 3 | import sklearn.metrics 4 | from absl import app, flags 5 | from sklearn.preprocessing import MultiLabelBinarizer 6 | 7 | 8 | FLAGS = flags.FLAGS 9 | 10 | flags.DEFINE_string( 11 | "input_file", default=None, help="Prompt file to use for the problem" 12 | ) 13 | 14 | 15 | def main(_): 16 | true_values = [] 17 | pred_values = [] 18 | 19 | FILE_NAME = FLAGS.input_file 20 | with open(FILE_NAME.replace("jsonl", "tsv"), "w") as handle: 21 | 22 | for line in open(FILE_NAME): 23 | datum = json.loads(line) 24 | y_true = datum["label"] 25 | y_pred = datum["detailed_intent_json"]["intent"] #.split(",") 26 | if "Alakasiz" 
in y_true: 27 | del y_true[y_true.index("Alakasiz")] 28 | if len(y_true) == 0: 29 | continue 30 | true_values.append(y_true) 31 | pred_values.append(y_pred) 32 | print( 33 | datum["image_url"].replace("\n", ""), "\t", y_true, "\t", y_pred, file=handle 34 | ) 35 | 36 | binarizer = MultiLabelBinarizer().fit(true_values) 37 | 38 | # pdb.set_trace() 39 | true_values = binarizer.transform(true_values) 40 | pred_values = binarizer.transform(pred_values) 41 | 42 | # pdb.set_trace() 43 | print( 44 | sklearn.metrics.classification_report( 45 | true_values, pred_values, target_names=binarizer.classes_ 46 | ), 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | app.run(main) 52 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import re 5 | from functools import lru_cache 6 | from typing import List, Optional 7 | from fastapi import FastAPI, Request, HTTPException 8 | import src.converter as converter 9 | from src.config import Settings 10 | from src.logger import setup_logging 11 | from src.models import IntentResponse, RequestIntent 12 | from src.lm.tokenizer import GPTTokenizer 13 | 14 | 15 | setup_logging() 16 | app = FastAPI() 17 | rotator = 0 18 | lock = asyncio.Lock() 19 | 20 | @lru_cache(maxsize=None) 21 | def get_settings(pid: int): 22 | settings = Settings() 23 | 24 | with open(settings.address_prompt_file) as handle: 25 | settings.address_template = handle.read() 26 | 27 | with open(settings.detailed_intent_prompt_file) as handle: 28 | settings.detailed_intent_template = handle.read() 29 | 30 | if settings.geo_location: 31 | settings.geo_key = converter.setup_geocoding() 32 | 33 | settings.openai_keys = converter.setup_openai(pid % settings.num_workers) 34 | 35 | logging.warning(f"Engine {settings.engine}") 36 | 37 | return settings 38 | 39 | 40 | 41 | 42 | 43 | async def convert( 44 | 
info: str, 45 | inputs: List[str], 46 | settings: Settings, 47 | api_key: Optional[str] = None, 48 | ): 49 | if info == "address": 50 | template = settings.address_template 51 | max_tokens = settings.address_max_tokens 52 | completion_params = dict(temperature=0.1, frequency_penalty=0.3) 53 | elif info == "detailed_intent": 54 | template = settings.detailed_intent_template 55 | max_tokens = settings.detailed_intent_max_tokens 56 | completion_params = dict(temperature=0.0, frequency_penalty=0.0) 57 | else: 58 | raise ValueError("Unknown information extraction requested") 59 | 60 | text_inputs = [] 61 | for tweet in inputs: 62 | text_inputs.append(converter.create_prompt(text=tweet, template=template, max_tokens=max_tokens)) 63 | 64 | outputs = await converter.query_with_retry( 65 | text_inputs, 66 | api_key=api_key, 67 | engine=settings.engine, 68 | top_p=1, 69 | max_tokens=max_tokens, 70 | stop="#END", 71 | **completion_params, 72 | ) 73 | 74 | returned = [] 75 | for output in outputs: 76 | returned_dict = {} 77 | returned_dict["string"] = output 78 | try: 79 | returned_dict["processed"] = converter.postprocess(info, output[0]) 80 | except Exception as e: 81 | returned_dict["processed"] = { 82 | "intent": [], 83 | "detailed_intent_tags": [], 84 | } 85 | logging.warning(f"Parsing error in {output},\n {e}") 86 | 87 | if info == "address" and settings.geo_location and returned_dict["processed"]: 88 | returned_dict["processed"]["geo"] = converter.get_geo_result( 89 | settings.geo_key, returned_dict["processed"] 90 | ) 91 | returned.append(returned_dict) 92 | 93 | return returned 94 | 95 | 96 | @app.post("/intent-extractor/", response_model=IntentResponse) 97 | async def intent(payload: RequestIntent, req: Request): 98 | # correct_token = os.getenv("NEEDS_RESOLVER_API_KEY", None) 99 | # if correct_token is None: 100 | # raise Exception("token not found in env files!") 101 | # coming_token = req.headers["Authorization"] 102 | # # Here your code for verifying the token or 
whatever you use 103 | # if coming_token != 'Bearer ' + correct_token: 104 | # raise HTTPException( 105 | # status_code=401, 106 | # detail="Unauthorized" 107 | # ) 108 | 109 | settings = get_settings(os.getpid()) 110 | 111 | inputs = payload.dict()["inputs"] 112 | 113 | global rotator 114 | async with lock: 115 | rotator = (rotator + 1) % len(settings.openai_keys) 116 | 117 | api_key = settings.openai_keys[rotator] 118 | 119 | outputs = await convert("detailed_intent", inputs, settings, api_key=api_key) 120 | return {"response": outputs} 121 | 122 | 123 | @app.get("/health") 124 | async def health(): 125 | return {"status": "living the dream"} -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # TODOs 2 | 3 | 1) Check the error handling 4 | - batch size > 20, give error --- # başka bir endpoint gelirse apikey ile artırılabilir. current limit <= 20 5 | - token size limit prompt token + context token <= 4097, current prompt ile 2000 olabilir ama 1000 ok. 6 | - current prompt v5 categories token size --> Tokens = 1,472 7 | - json.loads() # 8 | - eval() # eval kullanmayacakmısız 9 | 10 | # fix postprocess_for_intent_v2 11 | - regex.match -> eger match olmazsa ne oluyor, 12 | INTENT için halucination olursa sorun değil çünkü intent ler sadece bert den gelecek, ilgili text in needs leri gerekiyor sadece. 13 | bu yüzden belki prompt da değişiebilir bu noktadan sonra. [TODO] 14 | - constant formate - input ve output örnekleri lazım 15 | INPUT 16 | 17 | ## INPUT BATCH INPUT 18 | {'inputs': ['İncilikaya mahallesi Şehitkamil/Gaziantep Lütfen bu bölgeye acil çadır, ilaç, aspirin desteği insanlar kendi imkanlarıyla olan çadırlara sığmaya çalışıyorlar lütfen yardım edin !! 
@AFADBaskanlik @ahbap @EmniyetGM @jandarma @Kizilay', 19 | 'İncilikaya mahallesi Şehitkamil/Gaziantep Lütfen bu bölgeye acil çadır, ilaç, aspirin desteği insanlar kendi imkanlarıyla olan çadırlara sığmaya çalışıyorlar lütfen yardım edin !! @AFADBaskanlik @ahbap @EmniyetGM @jandarma @Kizilay']} 20 | 21 | ## EXAMPLE BATCH OUTPUT 22 | {'response': [{'string': ['People need [çadır, ilaç, aspirin], tags are [SHELTER, HEALTH, MEDICINE]'], 23 | 'processed': {'intent': ['Barinma', 'Saglik', 'MEDICINE'], 24 | 'detailed_intent_tags': ['çadır', 'ilaç', 'aspirin']}}, 25 | {'string': ['People need [çadır, ilaç, aspirin], tags are [SHELTER, HEALTH, MEDICINE]'], 26 | 'processed': {'intent': ['Barinma', 'Saglik', 'MEDICINE'], 27 | 'detailed_intent_tags': ['çadır', 'ilaç', 'aspirin']}}]} 28 | 29 | örnek request 30 | batch inference nasıl dönüyor 31 | 32 | 3) Prompt fix & automatic CI 33 | prompts/intent_v5_categories.txt 34 | # update after TAG_MAP 35 | 36 | 5) Grad IO? 37 | -------------------------------------------------------------------------------- /prompts/address.txt: -------------------------------------------------------------------------------- 1 | # Address Extraction from Twitter Text 2 | 3 | # Provinces: Gaziantep (Antep), Kahramanmaraş (Maraş), Hatay, Osmaniye, Adıyaman, Malatya, Batman, Bingöl, Elazığ, Kilis, Diyarbakır, Mardin, Siirt, Şırnak, Van, Muş, Bitlis, Hakkari, Adana 4 | 5 | # Acronyms: 6 | - mah., Mah. => mahallesi 7 | - sok., Sok., Sk. => sokak 8 | - cad., Cad. => caddesi 9 | - apt., Apt => apartmanı 10 | 11 | # Examples: 12 | 13 | Input: 14 | """ 15 | {ocr_input} 16 | """ 17 | 18 | Output: 19 | -------------------------------------------------------------------------------- /prompts/detailed_intent.txt: -------------------------------------------------------------------------------- 1 | # Find and categorize what people need from Turkish Tweets 2 | - Ignore links, address info. and hashtags. 
3 | - possible tags: [RESCUE, POWER_SOURCE, HEALTH, CLOTHES, PORTABLE_TOILET, LOGISTICS, FOOD, BURIAL, SHELTER, WATER, HEATING] 4 | - Transportation related words "benzin", "araba", "mazot" are taged as LOGISTICS 5 | 6 | Input: 7 | """ 8 | Feyzanur Alkan YAŞIYOR ACİL YARIM SAAT ÖNCE SES GELMİŞ KURTARILMAYI BEKLİYOR. Atatürk Bulvarı Mehmet Akif Ersoy Mahallesi 505.Sokak No 1 Alkanlar Apartmanı Adıyaman/Merkez - Adnan AlkanACİL HİLTİ İSPİRAL JENERATÖR LAZIM 9 | """ 10 | 11 | Summary: 12 | People need [jeneratör, hilti] to save other people, tags are [POWER_SOURCE, RESCUE] 13 | #END 14 | 15 | Input: 16 | """ 17 | @ProfDemirtas Gıda gereksinimi yokmuş..kadınların pet,iç çamaşırı,çocuk bzi..seyyar tuvalet..en önemlisi de acil ilaç.. 18 | """ 19 | 20 | Summary: 21 | People need [pet, iç çamaşırı, çocuk bezi, seyyar tuvalet], tags are [HEALTH, CLOTHES, PORTABLE_TOILET] 22 | #END 23 | 24 | Input: 25 | """ 26 | Lütfen önerileri dikkate alın ÜLKEMİZDEKİ DEMİR VE KALIP USTALARINI ACİL ACİL CAĞIRIN GERGİN DEMİR KESME USTALIK İŞİDİR LÜTFEN USTALARIMIZI AÇİL TOPLAYIN DEPREM SAHALARINA SEVK EDİN 27 | """ 28 | 29 | Summary: 30 | People need [demir ve kalıp ustası, demir kesme ustası], tags are [RESCUE] 31 | #END 32 | 33 | Intput: 34 | """ 35 | İstanbul Küçükçekmece Arenapark AVM insan gücü lazım koliler var. 
Teyitli bizzat kendim yardımda idim sınava yetişmem gerekiyor.@AFADBaskanlik @ahbap_istanbul 36 | """ 37 | 38 | Summary: 39 | People need [insan gücü] for packaging, tags are [LOGISTICS] 40 | #END 41 | 42 | Input: 43 | """ 44 | ❗VİNÇ VE TERMAL KAMERA LAZIM ❗@AFADBaskanlik @ahbap @ahbaphatay @AFADHatay @ibbhabercomtr @chpgenclikgm @AKUT_Dernegi 45 | """ 46 | 47 | Summary: 48 | People need [vinç, termal kamera], tags are [RESCUE, RESCUE_ELECTRONICS] 49 | #END 50 | 51 | Intput: 52 | """ 53 | "Bölgeden gelen ihtiyaç taleplerine göre; gıda kolisi, hijyen malzemesi, battaniye, el feneri, powerbank, çocuk bezi ve çocuk maması gibi ihtiyaç malzemelerinin toplanmasına devam edeceğiz.Bu kapsamda 2. el giyim malzemesi kabul edilmeyecektir." 54 | """ 55 | 56 | Summary: 57 | People need [hijyen malzemesi, mama, battaniye, el feneri, powerbank, çocuk bezi], tags are [HEALTH, FOOD, CLOTHES, POWER_SOURCE] 58 | #END 59 | 60 | Intput: 61 | """ 62 | ARKADAŞLAR LÜTFEN 20 kişilik aileye çadır lazım lütfen yardım edin 63 | """ 64 | 65 | Summary: 66 | People need [çadır], tags are [SHELTER] 67 | #END 68 | 69 | Input: 70 | """ 71 | Arkadaşlar Hataydan arkadaşım yazdı. Cenazeleri poşetle defnediyorlarmış. Çok acil kefen ihtiyacı varmış. Lütfen yayalı 72 | """ 73 | 74 | Summary: 75 | People need [kefen] for burial, tags are [BURIAL] 76 | #END 77 | 78 | Input: 79 | """ 80 | @DepremDairesi @AKUT_Dernegi @KizilayDestek #depremadres 81 | 82 | https://t.co/FbCmDUSz5I 83 | 84 | Ekip arkadaşlarımıza ulaşamıyoruz. Lütfen binalar kontrol edilsin. Hayrullah mah. Malik Ejder cad. Arıkan sit. A blok Kat:4 Kapı No:10 Onikişubat/Kahramanmaraş 85 | 86 | @AFADBaskanlik 87 | """ 88 | 89 | Summary: 90 | People need [bina kontrolü], tags are [RESCUE] 91 | #END 92 | 93 | Intput: 94 | """ 95 | HATAY ANTAKYA BÖBREK YETMEZLİĞİ 5 YAŞINDA ÇOCUK ACİL İLAÇ Topamax 100 mg.Derince mah. 
Derince çıkmazı bölge trafik yaniHatay AntakyaAhmet Akşun: 05347056864-05322173303 96 | """ 97 | 98 | Summary: 99 | People need [ilaç, Topamax], tags are [HEALTH] 100 | #END 101 | 102 | Input: 103 | """ 104 | KAHRAMANMARAŞ PAZARCIK ÖRDEKDEDE KÖYÜNE ACİL ÇADIR - BATTANİYE - ISITICI GEREKİYOR İNSANLAR 4 GÜNDÜR DIŞARDA SOGUKTA YATIYORLAR ACİLLL. @OguzhanUgur @haluklevent @AFADBaskanlik 105 | """ 106 | 107 | Summary: 108 | People need [ısıtıcı], tags are [HEATING] 109 | #END 110 | 111 | Intput: 112 | """ 113 | "Acil çok acil bu konuma gıda şimdi konuştum #depremsondakika @ProfDemirtas @ahbap @haluklevent @OguzhanUgur" 114 | """ 115 | 116 | Summary: 117 | People need [gıda], tags are [FOOD] 118 | #END 119 | 120 | Intput: 121 | """ 122 | @ProfDemirtas @haluklevent ABİ SULARI YOKMUŞ.ERZAK İHTİYACI ACİL. 123 | """ 124 | 125 | Summary: 126 | People need [su, erzak], tags are [WATER, FOOD] 127 | #END 128 | 129 | Input: 130 | """ 131 | {ocr_input} 132 | """ 133 | 134 | Summary: 135 | -------------------------------------------------------------------------------- /prompts/intent.txt: -------------------------------------------------------------------------------- 1 | # Intent Classification for Turkish Tweets 2 | 3 | Input: 4 | """ 5 | {ocr_input} 6 | """ 7 | 8 | Summary: 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | max-line-length = 88 4 | target-version = ["py36", "py37", "py38", "py39"] 5 | enforce_line_length = true 6 | experimental_string_processing = true 7 | 8 | [tool.isort] 9 | profile = "black" 10 | multi_line_output = 3 11 | lines_after_imports = 2 12 | include_trailing_comma = true 13 | force_grid_wrap = 0 14 | use_parentheses = true 15 | ensure_newline_before_comments = true 16 | lines_between_sections = 0 17 | line_length = 88 18 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | aiohttp==3.8.3 3 | aiosignal==1.3.1 4 | anyio==3.6.2 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | certifi==2022.12.7 8 | charset-normalizer==2.1.1 9 | click==8.1.3 10 | dnspython==2.3.0 11 | email-validator==1.3.1 12 | exceptiongroup==1.1.0 13 | fastapi==0.91.0 14 | frozenlist==1.3.3 15 | h11==0.14.0 16 | httpcore==0.16.3 17 | httptools==0.5.0 18 | httpx==0.23.3 19 | idna==3.4 20 | iniconfig==2.0.0 21 | itsdangerous==2.1.2 22 | Jinja2==3.1.2 23 | joblib==1.2.0 24 | MarkupSafe==2.1.2 25 | multidict==6.0.4 26 | numpy==1.24.2 27 | openai==0.26.5 28 | orjson==3.8.6 29 | packaging==23.0 30 | pluggy==1.0.0 31 | pydantic==1.10.4 32 | pytest==7.2.1 33 | python-dotenv==0.21.1 34 | python-multipart==0.0.5 35 | PyYAML==6.0 36 | requests==2.28.2 37 | rfc3986==1.5.0 38 | scikit-learn==1.2.1 39 | scipy==1.10.0 40 | six==1.16.0 41 | sniffio==1.3.0 42 | starlette==0.24.0 43 | threadpoolctl==3.1.0 44 | tomli==2.0.1 45 | tqdm==4.64.1 46 | typing_extensions==4.4.0 47 | ujson==5.7.0 48 | urllib3==1.26.14 49 | uvicorn==0.20.0 50 | uvloop==0.17.0 51 | watchfiles==0.18.1 52 | websockets==10.4 53 | yarl==1.8.2 54 | transformers==4.26.1 55 | -------------------------------------------------------------------------------- /run_address.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | EXP_NAME="address_test/" 3 | BASE_PATH="/home/akyurek/deprem/" 4 | INPUTFILE="data/test.jsonl" 5 | NUMKEY=5 6 | 7 | 8 | for i in $(seq 0 $((NUMKEY-1))); 9 | do 10 | OUTPUT_PATH=$BASE_PATH/exps/${EXP_NAME}/${i}/ 11 | mkdir -p $OUTPUT_PATH 12 | python converter.py \ 13 | --prompt_file prompts/main.txt \ 14 | --input_file $INPUTFILE \ 15 | --output_file $OUTPUT_PATH/output.jsonl \ 16 | --worker_id $i \ 17 | --geo_location \ 18 | --info="address" \ 19 | 
--num_workers $NUMKEY > $OUTPUT_PATH/out.log 2>&1 & 20 | done 21 | 22 | 23 | -------------------------------------------------------------------------------- /run_intent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source setup.sh 3 | EXP_NAME="new_labels_code_davinci_v4" 4 | BASE_PATH="/home/akyurek/git/deprem/" 5 | #INPUTFILE="data/intent-multilabel-test-v1-2.json" 6 | INPUTFILE="data/testv1.3.json" 7 | NUMKEY=4 8 | EXP_FOLDER=$BASE_PATH/exps/${EXP_NAME}/ 9 | 10 | echo "deleting ${EXP_FOLDER}" 11 | 12 | # rm -rf $EXP_FOLDER 13 | 14 | # for i in $(seq 0 $((NUMKEY-1))); 15 | # do 16 | # OUTPUT_PATH=$EXP_FOLDER/${i}/ 17 | # mkdir -p $OUTPUT_PATH 18 | # python src/converter.py \ 19 | # --prompt_file prompts/detailed_intent.txt \ 20 | # --input_file $INPUTFILE \ 21 | # --output_file $OUTPUT_PATH/output.jsonl \ 22 | # --worker_id $i \ 23 | # --info="detailed_intent" \ 24 | # --max_tokens 100 \ 25 | # --engine="code-davinci-002" \ 26 | # --num_workers $NUMKEY > $OUTPUT_PATH/out.log 2>&1 & 27 | # done 28 | 29 | cat $EXP_FOLDER/**/output.jsonl > $EXP_FOLDER/merged.jsonl 30 | python eval.py --input_file $EXP_FOLDER/merged.jsonl 31 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from pydantic import BaseSettings 3 | 4 | 5 | class Settings(BaseSettings): 6 | address_prompt_file: str = "prompts/address.txt" 7 | detailed_intent_prompt_file: str = "prompts/detailed_intent.txt" 8 | address_template: Optional[str] = None 9 | detailed_intent_template: 
Optional[str] = None 10 | geo_key: Optional[str] = None 11 | openai_keys: Optional[List[str]] = None 12 | address_max_tokens: int = 384 13 | detailed_intent_max_tokens: int = 100 14 | batch_size: int = 20 15 | geo_location: bool = False 16 | num_workers: int = 5 17 | engine: str = "afet-org" 18 | 19 | class Config: 20 | env_file = ".env" 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/converter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import re 5 | import urllib 6 | from typing import List, Optional 7 | import openai 8 | import requests 9 | from absl import app, flags, logging 10 | from tqdm import tqdm 11 | from src.gpt.network_manager import interact_with_api 12 | from src.lm.tokenizer import GPTTokenizer 13 | 14 | FLAGS = flags.FLAGS 15 | 16 | flags.DEFINE_string( 17 | "prompt_file", default=None, help="Prompt file to use for the problem" 18 | ) 19 | 20 | flags.DEFINE_string("input_file", default=None, help="Input file to read data") 21 | 22 | flags.DEFINE_string("output_file", default=None, help="Output file to write to") 23 | 24 | flags.DEFINE_integer("max_tokens", default=384, help="LM max generation length") 25 | 26 | flags.DEFINE_integer("worker_id", default=0, help="Worker id for the job") 27 | 28 | flags.DEFINE_integer("num_workers", default=1, help="number of workers") 29 | 30 | flags.DEFINE_integer("batch_size", default=20, help="batch size for OpenAI queries") 31 | 32 | flags.DEFINE_boolean( 33 | "geo_location", default=False, help="whether to add geo location to the output" 34 | ) 35 | 36 | flags.DEFINE_string("info", default="address", help="address | intent") 37 | 38 | flags.DEFINE_string("engine", "code-davinci-002", help="GPT engines") 39 | 40 | GEO_BASE_URL = "https://maps.googleapis.com/maps/api/geocode/json?" 41 | 42 | # TODO: add more keywords. 
# TODO: add more keywords.
# Turkish words that signal free text (pleas for help, descriptions) rather
# than an address component; a field containing any of these is assumed to be
# a parsing artifact and is blanked.
NON_ADDRESS_WORDS = [
    "arkadaş",
    "bebek",
    "enkaz",
    "deprem",
    "ekipman",
    "araç",
    "kayıp",
    "acil",
    "yardım",
    "kurtarma",
    "aile",
    "baba",
]


def postprocess_for_address(address):
    """Rule-based cleanup of a GPT-parsed address.

    Args:
        address: JSON string produced by the model. Expected to decode to a
            dict of address fields, but may decode to any JSON value.

    Returns:
        The decoded value. When it is a dict, suspicious fields (over-long
        values, or values containing non-address keywords) are blanked to "".

    Raises:
        json.JSONDecodeError: if ``address`` is not valid JSON (the caller
            wraps this call in a try/except).
    """
    address = json.loads(address)
    if isinstance(address, dict):
        for key in (
            "mahallesi | bulvarı",
            "sokak | caddesi | yolu",
            "sitesi | apartmanı",
            "no | blok",
            "kat",
            "phone",
        ):
            # BUG FIX: the original condition was
            #   key in address and len(...) > 50 or any(...)
            # which, by precedence, still evaluated address[key] in the `or`
            # branch when the key was absent, raising KeyError. Group the
            # value checks so the membership test short-circuits them.
            if key in address and (
                len(address[key]) > 50
                or any(word in address[key] for word in NON_ADDRESS_WORDS)
            ):
                address[key] = ""

        # Unit/floor fields should be very short; anything longer is noise.
        for key in ("no | blok", "kat"):
            if key in address and len(address[key]) > 20:
                address[key] = ""

    return address
def get_address_str(address):
    """Flatten a parsed address dict into a single geocoder query string."""
    # Field order mirrors how a Turkish postal address is normally written,
    # from the most specific component out to city/province.
    fields = (
        "mahallesi | bulvarı",
        "sokak | caddesi | yolu",
        "sitesi | apartmanı",
        "no | blok",
        "city",
        "province",
    )
    # Missing fields contribute an empty component; strip() drops the
    # leading/trailing padding exactly as the original concatenation did.
    return " ".join(address.get(field, "") for field in fields).strip()
def setup_geocoding(worker_id: int = 0) -> str:
    """Pick this worker's Google geocoding key from the GEO_KEY_POOL env var.

    Args:
        worker_id: index of this worker; keys are assigned round-robin so
            parallel workers spread load across the pool.

    Returns:
        A single API key with surrounding whitespace stripped.

    Raises:
        KeyError: if GEO_KEY_POOL is not set (logged before re-raising).
    """
    # BUG FIX: os.getenv() returns None for a missing variable and never
    # raises KeyError, so the original except branch was dead and the code
    # crashed later with AttributeError on None.split (or NameError after the
    # log line). Use os.environ[...] so the handler actually fires, and
    # re-raise so callers cannot proceed without a key.
    try:
        geo_keys = os.environ["GEO_KEY_POOL"].split(",")
    except KeyError:
        logging.error("GEO_KEY_POOL environment variable is not specified")
        raise

    assert len(geo_keys) > 0, "No keys specified in the environment variable"

    return geo_keys[worker_id % len(geo_keys)].strip()
def preprocess_tweet(text: str) -> str:
    """Strip @mentions and URLs from a tweet and collapse runs of whitespace."""
    # Mentions are replaced by a space (not removed outright) so that words
    # on either side of the handle do not get glued together.
    without_mentions = re.sub(r"@\w+", " ", text)
    # Matches bare domains as well as schemed URLs (anything with a dot and a
    # short TLD-like suffix), mirroring typical tweet link shapes.
    without_urls = re.sub(
        r"(\w+?://)?(?:www\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\.[a-zA-Z]{1,10}\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
        "",
        without_mentions,
    )
    # Collapse any whitespace runs left behind by the removals above.
    return re.sub(r"\s+", " ", without_urls)
{len(raw_data)}") 307 | text_inputs = [] 308 | raw_inputs = [] 309 | 310 | for index, row in tqdm(enumerate(raw_data)): 311 | # text_inputs.append(template.format(ocr_input=row["Tweet"])) 312 | text_inputs.append(create_prompt(text=row["image_url"], template=template, max_tokens=FLAGS.max_tokens)) 313 | raw_inputs.append(row) 314 | 315 | if (index + 1) % FLAGS.batch_size == 0 or index == len(raw_data) - 1: 316 | # to not throttle api key limits with parallel queries? 317 | outputs = loop.run_until_complete(query_with_retry( 318 | text_inputs, 319 | engine=FLAGS.engine, 320 | max_tokens=FLAGS.max_tokens, 321 | top_p=1, 322 | presence_penalty=0, 323 | stop="#END", 324 | **completion_params, 325 | )) 326 | 327 | with open(FLAGS.output_file, "a+") as handle: 328 | for inp, output_lines in zip(raw_inputs, outputs): 329 | # for output_line in output_lines: 330 | output_line = output_lines[0] 331 | current_input = inp.copy() 332 | try: 333 | current_input[FLAGS.info + "_json"] = postprocess( 334 | FLAGS.info, output_line 335 | ) 336 | current_input[FLAGS.info + "_str"] = output_line 337 | except Exception as e: 338 | logging.warning(f"Parsing error in {output_line},\n {e}") 339 | current_input[FLAGS.info + "_json"] = {} 340 | current_input[FLAGS.info + "_str"] = output_line 341 | 342 | if ( 343 | FLAGS.info == "address" 344 | and FLAGS.geo_location 345 | and type(current_input[FLAGS.info + "_json"]) == dict 346 | ): 347 | current_input["geo"] = get_geo_result( 348 | geo_key, current_input[FLAGS.info + "_json"] 349 | ) 350 | 351 | json_output = json.dumps(current_input) 352 | handle.write(json_output + "\n") 353 | 354 | text_inputs = [] 355 | raw_inputs = [] 356 | 357 | 358 | if __name__ == "__main__": 359 | app.run(main) 360 | -------------------------------------------------------------------------------- /src/gpt/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/src/gpt/__init__.py -------------------------------------------------------------------------------- /src/gpt/network_manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from functools import wraps 4 | from math import ceil, log2 5 | from random import random 6 | from openai import APIError 7 | from openai.error import ( 8 | APIConnectionError, 9 | AuthenticationError, 10 | InvalidRequestError, 11 | OpenAIError, 12 | RateLimitError, 13 | ServiceUnavailableError, 14 | TryAgain, 15 | ) 16 | # from src.concurrent.asynchronous import run_async_tasks 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | OPENAI_MAX_RETRY = 10 23 | # quota is reset in every 60 seconds 24 | OPENAI_REFRESH_QUOTA = 60 25 | OPENAI_EXP_CAP = int(ceil(log2(OPENAI_REFRESH_QUOTA))) 26 | 27 | 28 | class OpenAINetworkManager: 29 | def __init__(self): 30 | raise AssertionError(f"{type(self).__name__} should not be instantiated.") 31 | 32 | @staticmethod 33 | def async_retry_with_exp_backoff(task): 34 | @wraps(task) 35 | async def wrapper(*args, **kwargs): 36 | for i in range(OPENAI_MAX_RETRY + 1): 37 | wait_time = (1 << min(i, OPENAI_EXP_CAP)) + random() / 10 38 | try: 39 | return task(*args, **kwargs) 40 | except ( 41 | RateLimitError, 42 | ServiceUnavailableError, 43 | APIConnectionError, 44 | APIError, 45 | TryAgain, 46 | ) as e: 47 | if i == OPENAI_MAX_RETRY: 48 | logger.error( 49 | f"Retry, TooManyRequests or Server Error. 
class GPTTokenizer:
    """Thin wrapper around the module-level GPT-2 tokenizer for prompt budgeting."""

    # Token budget used when sizing prompts.
    # NOTE(review): presumably the target engine's context window — confirm
    # 4096 is correct for the engines actually used (code-davinci-002 differs).
    MAX_TOKENS = 4096

    @classmethod
    def token_count(cls, text: str) -> int:
        """Return the number of tokens ``text`` encodes to, without truncation."""
        return len(tokenizer(text, truncation=False)["input_ids"])

    @classmethod
    def truncate(cls, text: str, max_tokens: int) -> str:
        """Return ``text`` cut down to at most ``max_tokens`` tokens.

        IDIOM FIX: the first parameter of a @classmethod is conventionally
        named ``cls``; the original used ``self``.
        """
        encoded = tokenizer(text, truncation=True, max_length=max_tokens)
        return tokenizer.decode(encoded["input_ids"])
def setup_logging():
    """Route root-logger output to stdout with a timestamped format at INFO level."""
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')
    )

    root = logging.getLogger()
    # Drop any pre-existing handlers so the format above is the only one used.
    root.handlers.clear()
    root.setLevel(logging.INFO)
    root.addHandler(stream_handler)
class IntentResponse(BaseModel):
    """Response schema: one processed-result dict per input string."""

    # NOTE(review): presumably each dict carries the parsed intent fields for
    # the corresponding request input — confirm against the endpoint handler.
    response: List[dict]
"taskRoleArn": null, 69 | "compatibilities": [ 70 | "EC2", 71 | "FARGATE" 72 | ], 73 | "taskDefinitionArn": "arn:aws:ecs:eu-central-1:366354050833:task-definition/deprem-openai-api-TD:1", 74 | "family": "deprem-openai-api-TD", 75 | "requiresAttributes": [ 76 | { 77 | "targetId": null, 78 | "targetType": null, 79 | "value": null, 80 | "name": "com.amazonaws.ecs.capability.logging-driver.awslogs" 81 | }, 82 | { 83 | "targetId": null, 84 | "targetType": null, 85 | "value": null, 86 | "name": "ecs.capability.execution-role-awslogs" 87 | }, 88 | { 89 | "targetId": null, 90 | "targetType": null, 91 | "value": null, 92 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19" 93 | }, 94 | { 95 | "targetId": null, 96 | "targetType": null, 97 | "value": null, 98 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18" 99 | }, 100 | { 101 | "targetId": null, 102 | "targetType": null, 103 | "value": null, 104 | "name": "ecs.capability.task-eni" 105 | }, 106 | { 107 | "targetId": null, 108 | "targetType": null, 109 | "value": null, 110 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.29" 111 | } 112 | ], 113 | "pidMode": null, 114 | "requiresCompatibilities": [ 115 | "FARGATE" 116 | ], 117 | "networkMode": "awsvpc", 118 | "runtimePlatform": null, 119 | "cpu": "4096", 120 | "revision": 1, 121 | "status": "ACTIVE", 122 | "inferenceAccelerators": null, 123 | "proxyConfiguration": null, 124 | "volumes": [] 125 | } 126 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_ai.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from 
def test_intent():
    """End-to-end check of /intent-extractor/: auth header, 200, and per-item shape."""
    token = os.getenv("NEEDS_RESOLVER_API_KEY")
    auth_headers = {"Authorization": f"Bearer {token}"}

    response = client.post("/intent-extractor/", json=PAYLOAD, headers=auth_headers)
    assert response.status_code == 200

    results = response.json()["response"]
    assert isinstance(results, list)

    # Every input must come back as a dict with both the raw string and a
    # non-empty processed intent payload.
    for item in results:
        logger.debug(item)
        assert isinstance(item, dict)
        assert "string" in item
        assert "processed" in item
        processed = item["processed"]
        assert isinstance(processed["intent"], list)
        assert len(processed["intent"]) > 0
        assert isinstance(processed["detailed_intent_tags"], list)
        assert len(processed["detailed_intent_tags"]) > 0