├── .env ├── .flake8 ├── .github └── workflows │ ├── dev.yml │ └── docker-image.yml ├── .gitignore ├── .vscode └── settings.json ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── data └── test.jsonl ├── eval.py ├── main.py ├── notes.md ├── prompts ├── address.txt ├── detailed_intent.txt └── intent.txt ├── pyproject.toml ├── requirements.txt ├── run_address.sh ├── run_intent.sh ├── src ├── __init__.py ├── config.py ├── converter.py ├── gpt │ ├── __init__.py │ └── network_manager.py ├── lm │ ├── __init__.py │ └── tokenizer.py ├── logger.py └── models.py ├── task-definition.json └── tests ├── __init__.py └── test_ai.py /.env: -------------------------------------------------------------------------------- 1 | address_prompt_file="prompts/address.txt" 2 | detailed_intent_prompt_file="prompts/detailed_intent.txt" 3 | max_tokens=384 4 | batch_size=20 5 | geo_location=false 6 | engine="afet-org" -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 88 4 | ignore = 5 | E203, # Black default for colon/whitespace handling... 6 | E501, # Line length issues... 
black takes care of code, but comments/markdown are not handled 7 | W503, # Line break before binary operatorflake8(W503) 8 | E722 9 | per-file-ignores = 10 | */__init__.py: F401 11 | -------------------------------------------------------------------------------- /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | name: Test DepremOpenAiApi 2 | 3 | on: 4 | push: 5 | branches-ignore: ["main"] 6 | 7 | env: 8 | OPENAI_API_BASE_POOL: ${{ secrets.OPENAI_API_BASE_POOL }} 9 | OPENAI_API_KEY_POOL: ${{ secrets.OPENAI_API_KEY_POOL }} 10 | NEEDS_RESOLVER_API_KEY: ${{ secrets.NEEDS_RESOLVER_API_KEY }} 11 | 12 | jobs: 13 | deploy: 14 | name: test_on_branch 15 | runs-on: ubuntu-latest 16 | environment: prod 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | - name: Build and test 23 | id: build-image 24 | env: 25 | IMAGE_TAG: ${{ github.sha }} 26 | run: | 27 | docker build -t deprem-openai-apis-test:latest . 28 | docker run -e NEEDS_RESOLVER_API_KEY=${NEEDS_RESOLVER_API_KEY} --rm -t deprem-openai-apis-test:latest sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose -s" 29 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Deploy DepremOpenAiApi 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | env: 8 | OPENAI_API_BASE_POOL: ${{ secrets.OPENAI_API_BASE_POOL }} 9 | OPENAI_API_KEY_POOL: ${{ secrets.OPENAI_API_KEY_POOL }} 10 | NEEDS_RESOLVER_API_KEY: ${{ secrets.NEEDS_RESOLVER_API_KEY }} 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: write 18 | id-token: write 19 | environment: prod 20 | 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials 26 | uses: 
aws-actions/configure-aws-credentials@v1 27 | with: 28 | aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} 29 | aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} 30 | aws-region: eu-central-1 31 | 32 | - name: Login to Amazon ECR 33 | id: login-ecr 34 | uses: aws-actions/amazon-ecr-login@v1 35 | 36 | - name: Build, tag, and push image to Amazon ECR 37 | id: build-image 38 | env: 39 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 40 | ECR_REPOSITORY: base-ecr 41 | IMAGE_TAG: ${{ github.sha }} 42 | run: | 43 | # Build a docker container and 44 | # push it to ECR so that it can 45 | # be deployed to ECS. 46 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 47 | docker run -e NEEDS_RESOLVER_API_KEY=${NEEDS_RESOLVER_API_KEY} --rm -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose -s" 48 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 49 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 50 | 51 | - name: Fill in the new image ID in the Amazon ECS task definition 52 | id: task-def 53 | uses: aws-actions/amazon-ecs-render-task-definition@v1 54 | with: 55 | task-definition: task-definition.json 56 | container-name: container-name 57 | image: ${{ steps.build-image.outputs.image }} 58 | 59 | - name: Deploy Amazon ECS task definition 60 | uses: aws-actions/amazon-ecs-deploy-task-definition@v1 61 | with: 62 | task-definition: ${{ steps.task-def.outputs.task-definition }} 63 | service: deprem-openai-api-service 64 | cluster: base-cluster 65 | wait-for-service-stability: false 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | __pycache__/ 3 | 4 | # C extensions 5 | *.so 6 | 7 | # Distribution / packaging 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | 
dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | share/python-wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | cover/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | db.sqlite3-journal 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | .pybuilder/ 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 84 | __pypackages__/ 85 | 86 | # Celery stuff 87 | celerybeat-schedule 88 | celerybeat.pid 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | .dmypy.json 115 | dmypy.json 116 | 117 | # Pyre type checker 118 | .pyre/ 119 | 120 | # pytype static type analyzer 121 | .pytype/ 122 | 123 | # Cython debug symbols 124 | cython_debug/ 125 | 126 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.enabled": true, 5 | "editor.formatOnSave": true, 6 | "python.linting.lintOnSave": true, 7 | "python.analysis.extraPaths": [ 8 | "${workspaceFolder}", 9 | ], 10 | "python.formatting.provider": "black", 11 | "python.formatting.blackArgs": [ 12 | "--line-length=88" 13 | ], 14 | "python.sortImports.args": [ 15 | "--profile", 16 | "black" 17 | ], 18 | "[python]": { 19 | "editor.codeActionsOnSave": { 20 | "source.organizeImports": true 21 | } 22 | }, 23 | "python.linting.pylintEnabled": false 24 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | RUN mkdir -p /usr/src/app 4 | WORKDIR /usr/src/app 5 | 6 | COPY requirements.txt /usr/src/app/requirements.txt 7 | 8 | RUN pip install --no-cache-dir --upgrade -r requirements.txt 9 | 10 | ENV PYTHONUNBUFFERED 0 11 | EXPOSE 80 12 | 13 | COPY . 
/usr/src/app 14 | 15 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] 16 | 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 açık-kaynak.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | local-test: 2 | docker build -t deprem-openai-apis-test:latest . 
3 | docker run --rm -t deprem-openai-apis-test:latest sh -c "OPENAI_API_BASE_POOL=${OPENAI_API_BASE_POOL} OPENAI_API_KEY_POOL=${OPENAI_API_KEY_POOL} pytest --verbose" 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Address and Intent Extractor 2 | 3 | > Please use the active remote: https://github.com/acikkaynak/deprem_openai_apis 4 | 5 | > Prompts in this repo are placeholders for privacy reasons. Please contact us if you'd like to obtain them. 6 | 7 | The code can extract addresses from raw Turkiye earthquake tweets and classify them for intent via OpenAI GPT Codex API by using few-shot prompting. 8 | 9 | # How To Run 10 | 11 | Currently the input format is `.jsonl` where each line has a json string with "Tweet" field, see an example input file here [data/test.jsonl](./data/test.jsonl). 12 | 13 | Export two environment variables as comma separated keys: 14 | 15 | ```SHELL 16 | export OPENAI_API_KEY_POOL=key1,key2,key3... 17 | export GEO_KEY_POOL=key1,key2 18 | ``` 19 | 20 | Optionally, for the afet org API base URLs: 21 | ```SHELL 22 | export OPENAI_API_BASE_POOL= 23 | ``` 24 | 25 | To extract the geo location address information: 26 | - Specify your paths in [run_address.sh](./run_address.sh), then run the script 27 | ```SHELL 28 | ./run_address.sh 29 | ``` 30 | 31 | To extract the intent information: 32 | - Specify your paths in [run_intent.sh](./run_intent.sh), then run the script. 
33 | ```SHELL 34 | ./run_intent.sh 35 | ``` 36 | 37 | # To Run FastAPI Backend 38 | 39 | - To run locally `uvicorn main:app --reload` 40 | 41 | 42 | 43 | Running github actions 44 | -------------------------------------------------------------------------------- /data/test.jsonl: -------------------------------------------------------------------------------- 1 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 2 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 3 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", 
"Kullan\u0131c\u0131 ad\u0131": "circcassian"} 4 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 5 | {"URL": "https://www.twitter.com/circcassian/status/1623400284046168071", "Tarih": 1675894563000, "Tweet": "3 G\u00dcN OLDU NOLUR YARDIM ED\u0130N HATAY/ KIRIKHAN CUMHUR\u0130YET MAHALLES\u0130 G\u00dcL SOKAK NO 14 (erdo\u011fan kebap yan\u0131) ENKAZ ALTINDA KALAN EMEL \u00c7A\u011eLAR VE M\u0130THAT \u00c7A\u011eLAR KURTARILMAYA \u00c7ALI\u015eILIYOR SESLER\u0130 GEL\u0130YOR FAKAT EK\u0130PLER S\u00dcREKL\u0130 ARAMAYI BIRAKIP (\u0131\u015f\u0131k yok,yorulduk vs diyerek) B\u00d6LGEDEN AYRILIYOR", "Kullan\u0131c\u0131 ad\u0131": "circcassian"} 6 | 7 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sklearn 3 | import sklearn.metrics 4 | from absl import app, flags 5 | from sklearn.preprocessing import MultiLabelBinarizer 6 | 7 | 8 | FLAGS = flags.FLAGS 9 | 10 | flags.DEFINE_string( 11 | "input_file", default=None, help="Prompt file to use for the problem" 12 | ) 13 | 14 | 15 | def main(_): 16 | true_values = [] 17 | pred_values = [] 18 | 19 | FILE_NAME = FLAGS.input_file 20 | with open(FILE_NAME.replace("jsonl", "tsv"), "w") as handle: 21 | 22 | for line in open(FILE_NAME): 23 | datum = json.loads(line) 24 | y_true = datum["label"] 25 | y_pred = datum["detailed_intent_json"]["intent"] #.split(",") 26 | if "Alakasiz" 
in y_true: 27 | del y_true[y_true.index("Alakasiz")] 28 | if len(y_true) == 0: 29 | continue 30 | true_values.append(y_true) 31 | pred_values.append(y_pred) 32 | print( 33 | datum["image_url"].replace("\n", ""), "\t", y_true, "\t", y_pred, file=handle 34 | ) 35 | 36 | binarizer = MultiLabelBinarizer().fit(true_values) 37 | 38 | # pdb.set_trace() 39 | true_values = binarizer.transform(true_values) 40 | pred_values = binarizer.transform(pred_values) 41 | 42 | # pdb.set_trace() 43 | print( 44 | sklearn.metrics.classification_report( 45 | true_values, pred_values, target_names=binarizer.classes_ 46 | ), 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | app.run(main) 52 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import re 5 | from functools import lru_cache 6 | from typing import List, Optional 7 | from fastapi import FastAPI, Request, HTTPException 8 | import src.converter as converter 9 | from src.config import Settings 10 | from src.logger import setup_logging 11 | from src.models import IntentResponse, RequestIntent 12 | from src.lm.tokenizer import GPTTokenizer 13 | 14 | 15 | setup_logging() 16 | app = FastAPI() 17 | rotator = 0 18 | lock = asyncio.Lock() 19 | 20 | @lru_cache(maxsize=None) 21 | def get_settings(pid: int): 22 | settings = Settings() 23 | 24 | with open(settings.address_prompt_file) as handle: 25 | settings.address_template = handle.read() 26 | 27 | with open(settings.detailed_intent_prompt_file) as handle: 28 | settings.detailed_intent_template = handle.read() 29 | 30 | if settings.geo_location: 31 | settings.geo_key = converter.setup_geocoding() 32 | 33 | settings.openai_keys = converter.setup_openai(pid % settings.num_workers) 34 | 35 | logging.warning(f"Engine {settings.engine}") 36 | 37 | return settings 38 | 39 | 40 | 41 | 42 | 43 | async def convert( 44 | 
info: str, 45 | inputs: List[str], 46 | settings: Settings, 47 | api_key: Optional[str] = None, 48 | ): 49 | if info == "address": 50 | template = settings.address_template 51 | max_tokens = settings.address_max_tokens 52 | completion_params = dict(temperature=0.1, frequency_penalty=0.3) 53 | elif info == "detailed_intent": 54 | template = settings.detailed_intent_template 55 | max_tokens = settings.detailed_intent_max_tokens 56 | completion_params = dict(temperature=0.0, frequency_penalty=0.0) 57 | else: 58 | raise ValueError("Unknown information extraction requested") 59 | 60 | text_inputs = [] 61 | for tweet in inputs: 62 | text_inputs.append(converter.create_prompt(text=tweet, template=template, max_tokens=max_tokens)) 63 | 64 | outputs = await converter.query_with_retry( 65 | text_inputs, 66 | api_key=api_key, 67 | engine=settings.engine, 68 | top_p=1, 69 | max_tokens=max_tokens, 70 | stop="#END", 71 | **completion_params, 72 | ) 73 | 74 | returned = [] 75 | for output in outputs: 76 | returned_dict = {} 77 | returned_dict["string"] = output 78 | try: 79 | returned_dict["processed"] = converter.postprocess(info, output[0]) 80 | except Exception as e: 81 | returned_dict["processed"] = { 82 | "intent": [], 83 | "detailed_intent_tags": [], 84 | } 85 | logging.warning(f"Parsing error in {output},\n {e}") 86 | 87 | if info == "address" and settings.geo_location and returned_dict["processed"]: 88 | returned_dict["processed"]["geo"] = converter.get_geo_result( 89 | settings.geo_key, returned_dict["processed"] 90 | ) 91 | returned.append(returned_dict) 92 | 93 | return returned 94 | 95 | 96 | @app.post("/intent-extractor/", response_model=IntentResponse) 97 | async def intent(payload: RequestIntent, req: Request): 98 | # correct_token = os.getenv("NEEDS_RESOLVER_API_KEY", None) 99 | # if correct_token is None: 100 | # raise Exception("token not found in env files!") 101 | # coming_token = req.headers["Authorization"] 102 | # # Here your code for verifying the token or 
whatever you use 103 | # if coming_token != 'Bearer ' + correct_token: 104 | # raise HTTPException( 105 | # status_code=401, 106 | # detail="Unauthorized" 107 | # ) 108 | 109 | settings = get_settings(os.getpid()) 110 | 111 | inputs = payload.dict()["inputs"] 112 | 113 | global rotator 114 | async with lock: 115 | rotator = (rotator + 1) % len(settings.openai_keys) 116 | 117 | api_key = settings.openai_keys[rotator] 118 | 119 | outputs = await convert("detailed_intent", inputs, settings, api_key=api_key) 120 | return {"response": outputs} 121 | 122 | 123 | @app.get("/health") 124 | async def health(): 125 | return {"status": "living the dream"} -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # TODOs 2 | 3 | 1) Check the error handling 4 | - batch size > 20, give error --- # başka bir endpoint gelirse apikey ile artırılabilir. current limit <= 20 5 | - token size limit prompt token + context token <= 4097, current prompt ile 2000 olabilir ama 1000 ok. 6 | - current prompt v5 categories token size --> Tokens = 1,472 7 | - json.loads() # 8 | - eval() # eval kullanmayacakmısız 9 | 10 | # fix postprocess_for_intent_v2 11 | - regex.match -> eger match olmazsa ne oluyor, 12 | INTENT için halucination olursa sorun değil çünkü intent ler sadece bert den gelecek, ilgili text in needs leri gerekiyor sadece. 13 | bu yüzden belki prompt da değişiebilir bu noktadan sonra. [TODO] 14 | - constant formate - input ve output örnekleri lazım 15 | INPUT 16 | 17 | ## INPUT BATCH INPUT 18 | {'inputs': ['İncilikaya mahallesi Şehitkamil/Gaziantep Lütfen bu bölgeye acil çadır, ilaç, aspirin desteği insanlar kendi imkanlarıyla olan çadırlara sığmaya çalışıyorlar lütfen yardım edin !! 
@AFADBaskanlik @ahbap @EmniyetGM @jandarma @Kizilay', 19 | 'İncilikaya mahallesi Şehitkamil/Gaziantep Lütfen bu bölgeye acil çadır, ilaç, aspirin desteği insanlar kendi imkanlarıyla olan çadırlara sığmaya çalışıyorlar lütfen yardım edin !! @AFADBaskanlik @ahbap @EmniyetGM @jandarma @Kizilay']} 20 | 21 | ## EXAMPLE BATCH OUTPUT 22 | {'response': [{'string': ['People need [çadır, ilaç, aspirin], tags are [SHELTER, HEALTH, MEDICINE]'], 23 | 'processed': {'intent': ['Barinma', 'Saglik', 'MEDICINE'], 24 | 'detailed_intent_tags': ['çadır', 'ilaç', 'aspirin']}}, 25 | {'string': ['People need [çadır, ilaç, aspirin], tags are [SHELTER, HEALTH, MEDICINE]'], 26 | 'processed': {'intent': ['Barinma', 'Saglik', 'MEDICINE'], 27 | 'detailed_intent_tags': ['çadır', 'ilaç', 'aspirin']}}]} 28 | 29 | örnek request 30 | batch inference nasıl dönüyor 31 | 32 | 3) Prompt fix & automatic CI 33 | prompts/intent_v5_categories.txt 34 | # update after TAG_MAP 35 | 36 | 5) Grad IO? 37 | -------------------------------------------------------------------------------- /prompts/address.txt: -------------------------------------------------------------------------------- 1 | # Address Extraction from Twitter Text 2 | 3 | # Provinces: Gaziantep (Antep), Kahramanmaraş (Maraş), Hatay, Osmaniye, Adıyaman, Malatya, Batman, Bingöl, Elazığ, Kilis, Diyarbakır, Mardin, Siirt, Şırnak, Van, Muş, Bitlis, Hakkari, Adana 4 | 5 | # Acronyms: 6 | - mah., Mah. => mahallesi 7 | - sok., Sok., Sk. => sokak 8 | - cad., Cad. => caddesi 9 | - apt., Apt => apartmanı 10 | 11 | # Examples: 12 | 13 | Input: 14 | """ 15 | {ocr_input} 16 | """ 17 | 18 | Output: 19 | -------------------------------------------------------------------------------- /prompts/detailed_intent.txt: -------------------------------------------------------------------------------- 1 | # Find and categorize what people need from Turkish Tweets 2 | - Ignore links, address info. and hashtags. 
3 | - possible tags: [RESCUE, POWER_SOURCE, HEALTH, CLOTHES, PORTABLE_TOILET, LOGISTICS, FOOD, BURIAL, SHELTER, WATER, HEATING] 4 | - Transportation related words "benzin", "araba", "mazot" are taged as LOGISTICS 5 | 6 | Input: 7 | """ 8 | Feyzanur Alkan YAŞIYOR ACİL YARIM SAAT ÖNCE SES GELMİŞ KURTARILMAYI BEKLİYOR. Atatürk Bulvarı Mehmet Akif Ersoy Mahallesi 505.Sokak No 1 Alkanlar Apartmanı Adıyaman/Merkez - Adnan AlkanACİL HİLTİ İSPİRAL JENERATÖR LAZIM 9 | """ 10 | 11 | Summary: 12 | People need [jeneratör, hilti] to save other people, tags are [POWER_SOURCE, RESCUE] 13 | #END 14 | 15 | Input: 16 | """ 17 | @ProfDemirtas Gıda gereksinimi yokmuş..kadınların pet,iç çamaşırı,çocuk bzi..seyyar tuvalet..en önemlisi de acil ilaç.. 18 | """ 19 | 20 | Summary: 21 | People need [pet, iç çamaşırı, çocuk bezi, seyyar tuvalet], tags are [HEALTH, CLOTHES, PORTABLE_TOILET] 22 | #END 23 | 24 | Input: 25 | """ 26 | Lütfen önerileri dikkate alın ÜLKEMİZDEKİ DEMİR VE KALIP USTALARINI ACİL ACİL CAĞIRIN GERGİN DEMİR KESME USTALIK İŞİDİR LÜTFEN USTALARIMIZI AÇİL TOPLAYIN DEPREM SAHALARINA SEVK EDİN 27 | """ 28 | 29 | Summary: 30 | People need [demir ve kalıp ustası, demir kesme ustası], tags are [RESCUE] 31 | #END 32 | 33 | Intput: 34 | """ 35 | İstanbul Küçükçekmece Arenapark AVM insan gücü lazım koliler var. 
Teyitli bizzat kendim yardımda idim sınava yetişmem gerekiyor.@AFADBaskanlik @ahbap_istanbul 36 | """ 37 | 38 | Summary: 39 | People need [insan gücü] for packaging, tags are [LOGISTICS] 40 | #END 41 | 42 | Input: 43 | """ 44 | ❗VİNÇ VE TERMAL KAMERA LAZIM ❗@AFADBaskanlik @ahbap @ahbaphatay @AFADHatay @ibbhabercomtr @chpgenclikgm @AKUT_Dernegi 45 | """ 46 | 47 | Summary: 48 | People need [vinç, termal kamera], tags are [RESCUE, RESCUE_ELECTRONICS] 49 | #END 50 | 51 | Intput: 52 | """ 53 | "Bölgeden gelen ihtiyaç taleplerine göre; gıda kolisi, hijyen malzemesi, battaniye, el feneri, powerbank, çocuk bezi ve çocuk maması gibi ihtiyaç malzemelerinin toplanmasına devam edeceğiz.Bu kapsamda 2. el giyim malzemesi kabul edilmeyecektir." 54 | """ 55 | 56 | Summary: 57 | People need [hijyen malzemesi, mama, battaniye, el feneri, powerbank, çocuk bezi], tags are [HEALTH, FOOD, CLOTHES, POWER_SOURCE] 58 | #END 59 | 60 | Intput: 61 | """ 62 | ARKADAŞLAR LÜTFEN 20 kişilik aileye çadır lazım lütfen yardım edin 63 | """ 64 | 65 | Summary: 66 | People need [çadır], tags are [SHELTER] 67 | #END 68 | 69 | Input: 70 | """ 71 | Arkadaşlar Hataydan arkadaşım yazdı. Cenazeleri poşetle defnediyorlarmış. Çok acil kefen ihtiyacı varmış. Lütfen yayalı 72 | """ 73 | 74 | Summary: 75 | People need [kefen] for burial, tags are [BURIAL] 76 | #END 77 | 78 | Input: 79 | """ 80 | @DepremDairesi @AKUT_Dernegi @KizilayDestek #depremadres 81 | 82 | https://t.co/FbCmDUSz5I 83 | 84 | Ekip arkadaşlarımıza ulaşamıyoruz. Lütfen binalar kontrol edilsin. Hayrullah mah. Malik Ejder cad. Arıkan sit. A blok Kat:4 Kapı No:10 Onikişubat/Kahramanmaraş 85 | 86 | @AFADBaskanlik 87 | """ 88 | 89 | Summary: 90 | People need [bina kontrolü], tags are [RESCUE] 91 | #END 92 | 93 | Intput: 94 | """ 95 | HATAY ANTAKYA BÖBREK YETMEZLİĞİ 5 YAŞINDA ÇOCUK ACİL İLAÇ Topamax 100 mg.Derince mah. 
Derince çıkmazı bölge trafik yaniHatay AntakyaAhmet Akşun: 05347056864-05322173303 96 | """ 97 | 98 | Summary: 99 | People need [ilaç, Topamax], tags are [HEALTH] 100 | #END 101 | 102 | Input: 103 | """ 104 | KAHRAMANMARAŞ PAZARCIK ÖRDEKDEDE KÖYÜNE ACİL ÇADIR - BATTANİYE - ISITICI GEREKİYOR İNSANLAR 4 GÜNDÜR DIŞARDA SOGUKTA YATIYORLAR ACİLLL. @OguzhanUgur @haluklevent @AFADBaskanlik 105 | """ 106 | 107 | Summary: 108 | People need [ısıtıcı], tags are [HEATING] 109 | #END 110 | 111 | Intput: 112 | """ 113 | "Acil çok acil bu konuma gıda şimdi konuştum #depremsondakika @ProfDemirtas @ahbap @haluklevent @OguzhanUgur" 114 | """ 115 | 116 | Summary: 117 | People need [gıda], tags are [FOOD] 118 | #END 119 | 120 | Intput: 121 | """ 122 | @ProfDemirtas @haluklevent ABİ SULARI YOKMUŞ.ERZAK İHTİYACI ACİL. 123 | """ 124 | 125 | Summary: 126 | People need [su, erzak], tags are [WATER, FOOD] 127 | #END 128 | 129 | Input: 130 | """ 131 | {ocr_input} 132 | """ 133 | 134 | Summary: 135 | -------------------------------------------------------------------------------- /prompts/intent.txt: -------------------------------------------------------------------------------- 1 | # Intent Classification for Turkish Tweets 2 | 3 | Input: 4 | """ 5 | {ocr_input} 6 | """ 7 | 8 | Summary: 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | max-line-length = 88 4 | target-version = ["py36", "py37", "py38", "py39"] 5 | enforce_line_length = true 6 | experimental_string_processing = true 7 | 8 | [tool.isort] 9 | profile = "black" 10 | multi_line_output = 3 11 | lines_after_imports = 2 12 | include_trailing_comma = true 13 | force_grid_wrap = 0 14 | use_parentheses = true 15 | ensure_newline_before_comments = true 16 | lines_between_sections = 0 17 | line_length = 88 18 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | aiohttp==3.8.3 3 | aiosignal==1.3.1 4 | anyio==3.6.2 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | certifi==2022.12.7 8 | charset-normalizer==2.1.1 9 | click==8.1.3 10 | dnspython==2.3.0 11 | email-validator==1.3.1 12 | exceptiongroup==1.1.0 13 | fastapi==0.91.0 14 | frozenlist==1.3.3 15 | h11==0.14.0 16 | httpcore==0.16.3 17 | httptools==0.5.0 18 | httpx==0.23.3 19 | idna==3.4 20 | iniconfig==2.0.0 21 | itsdangerous==2.1.2 22 | Jinja2==3.1.2 23 | joblib==1.2.0 24 | MarkupSafe==2.1.2 25 | multidict==6.0.4 26 | numpy==1.24.2 27 | openai==0.26.5 28 | orjson==3.8.6 29 | packaging==23.0 30 | pluggy==1.0.0 31 | pydantic==1.10.4 32 | pytest==7.2.1 33 | python-dotenv==0.21.1 34 | python-multipart==0.0.5 35 | PyYAML==6.0 36 | requests==2.28.2 37 | rfc3986==1.5.0 38 | scikit-learn==1.2.1 39 | scipy==1.10.0 40 | six==1.16.0 41 | sniffio==1.3.0 42 | starlette==0.24.0 43 | threadpoolctl==3.1.0 44 | tomli==2.0.1 45 | tqdm==4.64.1 46 | typing_extensions==4.4.0 47 | ujson==5.7.0 48 | urllib3==1.26.14 49 | uvicorn==0.20.0 50 | uvloop==0.17.0 51 | watchfiles==0.18.1 52 | websockets==10.4 53 | yarl==1.8.2 54 | transformers==4.26.1 55 | -------------------------------------------------------------------------------- /run_address.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | EXP_NAME="address_test/" 3 | BASE_PATH="/home/akyurek/deprem/" 4 | INPUTFILE="data/test.jsonl" 5 | NUMKEY=5 6 | 7 | 8 | for i in $(seq 0 $((NUMKEY-1))); 9 | do 10 | OUTPUT_PATH=$BASE_PATH/exps/${EXP_NAME}/${i}/ 11 | mkdir -p $OUTPUT_PATH 12 | python converter.py \ 13 | --prompt_file prompts/main.txt \ 14 | --input_file $INPUTFILE \ 15 | --output_file $OUTPUT_PATH/output.jsonl \ 16 | --worker_id $i \ 17 | --geo_location \ 18 | --info="address" \ 19 | 
--num_workers $NUMKEY > $OUTPUT_PATH/out.log 2>&1 & 20 | done 21 | 22 | 23 | -------------------------------------------------------------------------------- /run_intent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source setup.sh 3 | EXP_NAME="new_labels_code_davinci_v4" 4 | BASE_PATH="/home/akyurek/git/deprem/" 5 | #INPUTFILE="data/intent-multilabel-test-v1-2.json" 6 | INPUTFILE="data/testv1.3.json" 7 | NUMKEY=4 8 | EXP_FOLDER=$BASE_PATH/exps/${EXP_NAME}/ 9 | 10 | echo "deleting ${EXP_FOLDER}" 11 | 12 | # rm -rf $EXP_FOLDER 13 | 14 | # for i in $(seq 0 $((NUMKEY-1))); 15 | # do 16 | # OUTPUT_PATH=$EXP_FOLDER/${i}/ 17 | # mkdir -p $OUTPUT_PATH 18 | # python src/converter.py \ 19 | # --prompt_file prompts/detailed_intent.txt \ 20 | # --input_file $INPUTFILE \ 21 | # --output_file $OUTPUT_PATH/output.jsonl \ 22 | # --worker_id $i \ 23 | # --info="detailed_intent" \ 24 | # --max_tokens 100 \ 25 | # --engine="code-davinci-002" \ 26 | # --num_workers $NUMKEY > $OUTPUT_PATH/out.log 2>&1 & 27 | # done 28 | 29 | cat $EXP_FOLDER/**/output.jsonl > $EXP_FOLDER/merged.jsonl 30 | python eval.py --input_file $EXP_FOLDER/merged.jsonl 31 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from pydantic import BaseSettings 3 | 4 | 5 | class Settings(BaseSettings): 6 | address_prompt_file: str = "prompts/address.txt" 7 | detailed_intent_prompt_file: str = "prompts/detailed_intent.txt" 8 | address_template: Optional[str] = None 9 | detailed_intent_template: 
Optional[str] = None 10 | geo_key: Optional[str] = None 11 | openai_keys: Optional[List[str]] = None 12 | address_max_tokens: int = 384 13 | detailed_intent_max_tokens: int = 100 14 | batch_size: int = 20 15 | geo_location: bool = False 16 | num_workers: int = 5 17 | engine: str = "afet-org" 18 | 19 | class Config: 20 | env_file = ".env" 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/converter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import re 5 | import urllib 6 | from typing import List, Optional 7 | import openai 8 | import requests 9 | from absl import app, flags, logging 10 | from tqdm import tqdm 11 | from src.gpt.network_manager import interact_with_api 12 | from src.lm.tokenizer import GPTTokenizer 13 | 14 | FLAGS = flags.FLAGS 15 | 16 | flags.DEFINE_string( 17 | "prompt_file", default=None, help="Prompt file to use for the problem" 18 | ) 19 | 20 | flags.DEFINE_string("input_file", default=None, help="Input file to read data") 21 | 22 | flags.DEFINE_string("output_file", default=None, help="Output file to write to") 23 | 24 | flags.DEFINE_integer("max_tokens", default=384, help="LM max generation length") 25 | 26 | flags.DEFINE_integer("worker_id", default=0, help="Worker id for the job") 27 | 28 | flags.DEFINE_integer("num_workers", default=1, help="number of workers") 29 | 30 | flags.DEFINE_integer("batch_size", default=20, help="batch size for OpenAI queries") 31 | 32 | flags.DEFINE_boolean( 33 | "geo_location", default=False, help="whether to add geo location to the output" 34 | ) 35 | 36 | flags.DEFINE_string("info", default="address", help="address | intent") 37 | 38 | flags.DEFINE_string("engine", "code-davinci-002", help="GPT engines") 39 | 40 | GEO_BASE_URL = "https://maps.googleapis.com/maps/api/geocode/json?" 41 | 42 | # TODO: add more keywords. 
# TODO: add more keywords.
# Turkish words that signal free text (pleas for help, descriptions) rather
# than an address component; a field containing any of these is assumed to be
# a parsing artifact and is blanked.
NON_ADDRESS_WORDS = [
    "arkadaş",
    "bebek",
    "enkaz",
    "deprem",
    "ekipman",
    "araç",
    "kayıp",
    "acil",
    "yardım",
    "kurtarma",
    "aile",
    "baba",
]


def postprocess_for_address(address):
    """Rule-based cleanup of a GPT-parsed address.

    Args:
        address: JSON string produced by the model. Expected to decode to a
            dict of address fields, but may decode to any JSON value.

    Returns:
        The decoded value. When it is a dict, suspicious fields (over-long
        values, or values containing non-address keywords) are blanked to "".

    Raises:
        json.JSONDecodeError: if ``address`` is not valid JSON (the caller
            wraps this call in a try/except).
    """
    address = json.loads(address)
    if isinstance(address, dict):
        for key in (
            "mahallesi | bulvarı",
            "sokak | caddesi | yolu",
            "sitesi | apartmanı",
            "no | blok",
            "kat",
            "phone",
        ):
            # BUG FIX: the original condition was
            #   key in address and len(...) > 50 or any(...)
            # which, by precedence, still evaluated address[key] in the `or`
            # branch when the key was absent, raising KeyError. Group the
            # value checks so the membership test short-circuits them.
            if key in address and (
                len(address[key]) > 50
                or any(word in address[key] for word in NON_ADDRESS_WORDS)
            ):
                address[key] = ""

        # Unit/floor fields should be very short; anything longer is noise.
        for key in ("no | blok", "kat"):
            if key in address and len(address[key]) > 20:
                address[key] = ""

    return address
def get_address_str(address):
    """Flatten a parsed address dict into a single geocoder query string."""
    # Field order mirrors how a Turkish postal address is normally written,
    # from the most specific component out to city/province.
    fields = (
        "mahallesi | bulvarı",
        "sokak | caddesi | yolu",
        "sitesi | apartmanı",
        "no | blok",
        "city",
        "province",
    )
    # Missing fields contribute an empty component; strip() drops the
    # leading/trailing padding exactly as the original concatenation did.
    return " ".join(address.get(field, "") for field in fields).strip()
def setup_geocoding(worker_id: int = 0) -> str:
    """Pick this worker's Google geocoding key from the GEO_KEY_POOL env var.

    Args:
        worker_id: index of this worker; keys are assigned round-robin so
            parallel workers spread load across the pool.

    Returns:
        A single API key with surrounding whitespace stripped.

    Raises:
        KeyError: if GEO_KEY_POOL is not set (logged before re-raising).
    """
    # BUG FIX: os.getenv() returns None for a missing variable and never
    # raises KeyError, so the original except branch was dead and the code
    # crashed later with AttributeError on None.split (or NameError after the
    # log line). Use os.environ[...] so the handler actually fires, and
    # re-raise so callers cannot proceed without a key.
    try:
        geo_keys = os.environ["GEO_KEY_POOL"].split(",")
    except KeyError:
        logging.error("GEO_KEY_POOL environment variable is not specified")
        raise

    assert len(geo_keys) > 0, "No keys specified in the environment variable"

    return geo_keys[worker_id % len(geo_keys)].strip()
def preprocess_tweet(text: str) -> str:
    """Strip @mentions and URLs from a tweet and collapse runs of whitespace."""
    # Mentions are replaced by a space (not removed outright) so that words
    # on either side of the handle do not get glued together.
    without_mentions = re.sub(r"@\w+", " ", text)
    # Matches bare domains as well as schemed URLs (anything with a dot and a
    # short TLD-like suffix), mirroring typical tweet link shapes.
    without_urls = re.sub(
        r"(\w+?://)?(?:www\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\.[a-zA-Z]{1,10}\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
        "",
        without_mentions,
    )
    # Collapse any whitespace runs left behind by the removals above.
    return re.sub(r"\s+", " ", without_urls)
{len(raw_data)}") 307 | text_inputs = [] 308 | raw_inputs = [] 309 | 310 | for index, row in tqdm(enumerate(raw_data)): 311 | # text_inputs.append(template.format(ocr_input=row["Tweet"])) 312 | text_inputs.append(create_prompt(text=row["image_url"], template=template, max_tokens=FLAGS.max_tokens)) 313 | raw_inputs.append(row) 314 | 315 | if (index + 1) % FLAGS.batch_size == 0 or index == len(raw_data) - 1: 316 | # to not throttle api key limits with parallel queries? 317 | outputs = loop.run_until_complete(query_with_retry( 318 | text_inputs, 319 | engine=FLAGS.engine, 320 | max_tokens=FLAGS.max_tokens, 321 | top_p=1, 322 | presence_penalty=0, 323 | stop="#END", 324 | **completion_params, 325 | )) 326 | 327 | with open(FLAGS.output_file, "a+") as handle: 328 | for inp, output_lines in zip(raw_inputs, outputs): 329 | # for output_line in output_lines: 330 | output_line = output_lines[0] 331 | current_input = inp.copy() 332 | try: 333 | current_input[FLAGS.info + "_json"] = postprocess( 334 | FLAGS.info, output_line 335 | ) 336 | current_input[FLAGS.info + "_str"] = output_line 337 | except Exception as e: 338 | logging.warning(f"Parsing error in {output_line},\n {e}") 339 | current_input[FLAGS.info + "_json"] = {} 340 | current_input[FLAGS.info + "_str"] = output_line 341 | 342 | if ( 343 | FLAGS.info == "address" 344 | and FLAGS.geo_location 345 | and type(current_input[FLAGS.info + "_json"]) == dict 346 | ): 347 | current_input["geo"] = get_geo_result( 348 | geo_key, current_input[FLAGS.info + "_json"] 349 | ) 350 | 351 | json_output = json.dumps(current_input) 352 | handle.write(json_output + "\n") 353 | 354 | text_inputs = [] 355 | raw_inputs = [] 356 | 357 | 358 | if __name__ == "__main__": 359 | app.run(main) 360 | -------------------------------------------------------------------------------- /src/gpt/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/src/gpt/__init__.py -------------------------------------------------------------------------------- /src/gpt/network_manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from functools import wraps 4 | from math import ceil, log2 5 | from random import random 6 | from openai import APIError 7 | from openai.error import ( 8 | APIConnectionError, 9 | AuthenticationError, 10 | InvalidRequestError, 11 | OpenAIError, 12 | RateLimitError, 13 | ServiceUnavailableError, 14 | TryAgain, 15 | ) 16 | # from src.concurrent.asynchronous import run_async_tasks 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | OPENAI_MAX_RETRY = 10 23 | # quota is reset in every 60 seconds 24 | OPENAI_REFRESH_QUOTA = 60 25 | OPENAI_EXP_CAP = int(ceil(log2(OPENAI_REFRESH_QUOTA))) 26 | 27 | 28 | class OpenAINetworkManager: 29 | def __init__(self): 30 | raise AssertionError(f"{type(self).__name__} should not be instantiated.") 31 | 32 | @staticmethod 33 | def async_retry_with_exp_backoff(task): 34 | @wraps(task) 35 | async def wrapper(*args, **kwargs): 36 | for i in range(OPENAI_MAX_RETRY + 1): 37 | wait_time = (1 << min(i, OPENAI_EXP_CAP)) + random() / 10 38 | try: 39 | return task(*args, **kwargs) 40 | except ( 41 | RateLimitError, 42 | ServiceUnavailableError, 43 | APIConnectionError, 44 | APIError, 45 | TryAgain, 46 | ) as e: 47 | if i == OPENAI_MAX_RETRY: 48 | logger.error( 49 | f"Retry, TooManyRequests or Server Error. 
class GPTTokenizer:
    """Thin wrapper around the module-level GPT-2 tokenizer for prompt budgeting."""

    # Token budget used when sizing prompts.
    # NOTE(review): presumably the target engine's context window — confirm
    # 4096 is correct for the engines actually used (code-davinci-002 differs).
    MAX_TOKENS = 4096

    @classmethod
    def token_count(cls, text: str) -> int:
        """Return the number of tokens ``text`` encodes to, without truncation."""
        return len(tokenizer(text, truncation=False)["input_ids"])

    @classmethod
    def truncate(cls, text: str, max_tokens: int) -> str:
        """Return ``text`` cut down to at most ``max_tokens`` tokens.

        IDIOM FIX: the first parameter of a @classmethod is conventionally
        named ``cls``; the original used ``self``.
        """
        encoded = tokenizer(text, truncation=True, max_length=max_tokens)
        return tokenizer.decode(encoded["input_ids"])
def setup_logging():
    """Route root-logger output to stdout with a timestamped format at INFO level."""
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')
    )

    root = logging.getLogger()
    # Drop any pre-existing handlers so the format above is the only one used.
    root.handlers.clear()
    root.setLevel(logging.INFO)
    root.addHandler(stream_handler)
class IntentResponse(BaseModel):
    """Response schema: one processed-result dict per input string."""

    # NOTE(review): presumably each dict carries the parsed intent fields for
    # the corresponding request input — confirm against the endpoint handler.
    response: List[dict]
"taskRoleArn": null, 69 | "compatibilities": [ 70 | "EC2", 71 | "FARGATE" 72 | ], 73 | "taskDefinitionArn": "arn:aws:ecs:eu-central-1:366354050833:task-definition/deprem-openai-api-TD:1", 74 | "family": "deprem-openai-api-TD", 75 | "requiresAttributes": [ 76 | { 77 | "targetId": null, 78 | "targetType": null, 79 | "value": null, 80 | "name": "com.amazonaws.ecs.capability.logging-driver.awslogs" 81 | }, 82 | { 83 | "targetId": null, 84 | "targetType": null, 85 | "value": null, 86 | "name": "ecs.capability.execution-role-awslogs" 87 | }, 88 | { 89 | "targetId": null, 90 | "targetType": null, 91 | "value": null, 92 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19" 93 | }, 94 | { 95 | "targetId": null, 96 | "targetType": null, 97 | "value": null, 98 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18" 99 | }, 100 | { 101 | "targetId": null, 102 | "targetType": null, 103 | "value": null, 104 | "name": "ecs.capability.task-eni" 105 | }, 106 | { 107 | "targetId": null, 108 | "targetType": null, 109 | "value": null, 110 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.29" 111 | } 112 | ], 113 | "pidMode": null, 114 | "requiresCompatibilities": [ 115 | "FARGATE" 116 | ], 117 | "networkMode": "awsvpc", 118 | "runtimePlatform": null, 119 | "cpu": "4096", 120 | "revision": 1, 121 | "status": "ACTIVE", 122 | "inferenceAccelerators": null, 123 | "proxyConfiguration": null, 124 | "volumes": [] 125 | } 126 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acikyazilimagi/deprem_openai_apis/53aeaa4984b8616cbf7faef2d7c946de451c9e6e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_ai.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from 
def test_intent():
    """End-to-end check of /intent-extractor/: auth header, 200, and per-item shape."""
    token = os.getenv("NEEDS_RESOLVER_API_KEY")
    auth_headers = {"Authorization": f"Bearer {token}"}

    response = client.post("/intent-extractor/", json=PAYLOAD, headers=auth_headers)
    assert response.status_code == 200

    results = response.json()["response"]
    assert isinstance(results, list)

    # Every input must come back as a dict with both the raw string and a
    # non-empty processed intent payload.
    for item in results:
        logger.debug(item)
        assert isinstance(item, dict)
        assert "string" in item
        assert "processed" in item
        processed = item["processed"]
        assert isinstance(processed["intent"], list)
        assert len(processed["intent"]) > 0
        assert isinstance(processed["detailed_intent_tags"], list)
        assert len(processed["detailed_intent_tags"]) > 0