├── cache └── cache_db.db ├── dataset ├── processed.xlsx ├── Full Dataset Kotor updated 2.0.xlsx └── README.md ├── pipeline-diagram-stable ├── pipeline.png └── interface.jpeg ├── src ├── secret.py └── extract_enrichment.py ├── dags ├── crawler │ ├── twitter_scrapper.py │ ├── news_scrapper.py │ └── final_dataset │ │ └── twitter_prabowo_subianto.csv └── preprocess │ ├── stopwords.txt │ ├── extraction.py │ ├── process.py │ └── enrich.py ├── README.md ├── .gitignore ├── requirements-data-engineer.txt ├── app.py ├── LICENSE └── notebook ├── enrichment.ipynb └── enrichment - 2.ipynb /cache/cache_db.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/cache/cache_db.db -------------------------------------------------------------------------------- /dataset/processed.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/dataset/processed.xlsx -------------------------------------------------------------------------------- /pipeline-diagram-stable/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/pipeline-diagram-stable/pipeline.png -------------------------------------------------------------------------------- /src/secret.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | load_dotenv() 5 | OPENAI_KEY = os.getenv("OPENAI_API_KEY", default = None) -------------------------------------------------------------------------------- /pipeline-diagram-stable/interface.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/pipeline-diagram-stable/interface.jpeg 
-------------------------------------------------------------------------------- /dataset/Full Dataset Kotor updated 2.0.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/dataset/Full Dataset Kotor updated 2.0.xlsx -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | ### Disclaimer 2 | 3 | Dataset merupakan hasil pengambilan dari twitter yang tersedia di kaggle dengan notebook scrapping data tertera. 4 | Special regards untuk mas [@andree cy](https://www.kaggle.com/andreecy) dan mas [@Mc Affandi](https://www.kaggle.com/mcaffandi) atas hasil scrapping data nya, sehingga bisa kami manfaatkan untuk kelancaran pengerjaan final project kami. 5 | 6 | - https://www.kaggle.com/code/andreecy/paslon-2024-tweet-data/notebook -------------------------------------------------------------------------------- /dags/crawler/twitter_scrapper.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import tweepy 3 | import pandas as pd 4 | 5 | 6 | class TwitterScrapper: 7 | def __init__(self, token, secret, api_key, api_secret): 8 | auth = tweepy.OAuthHandler(api_key, api_secret) 9 | auth.set_access_token(token, secret) 10 | self.api = tweepy.API(auth, wait_on_rate_limit=True) 11 | def search(self, keyword): 12 | c, i, u, t = [], [], [], [] 13 | output_filename = open( 14 | keyword + ".csv", mode="a+", 15 | newline="", encoding="utf-8" 16 | ) 17 | csv_file = csv.writer(output_filename) 18 | 19 | for tweet in tweepy.Cursos( 20 | self.api.search_tweets, 21 | q=keyword, count=15, lang="id", 22 | start_time="2023-01-01T00:00:00Z", end_time="2023-11-30T23:59:59Z" 23 | ).items(): 24 | c.append(tweet.created_at) 25 | i.append(tweet.id) 26 | u.append(tweet.user.name) 27 | t.append(tweet.text.encode("utf-8")) 
28 | tweets = [tweet.created_at, tweet.id, tweet.user.name, tweet.text.encode("utf-8")] 29 | csv_file.writerow(tweets) 30 | 31 | dictTweets = {"waktu": c, "id": i, "username": u, "teks": t} 32 | df = pd.DataFrame(dictTweets, columns=["waktu", "id", "username", "teks"]) 33 | 34 | 35 | if __name__ == "__main__": 36 | scrapping = TwitterScrapper(..., ..., ..., ...) 37 | scrapping.search('...') -------------------------------------------------------------------------------- /dags/preprocess/stopwords.txt: -------------------------------------------------------------------------------- 1 | kah 2 | oh 3 | sebagai 4 | kami 5 | tanpa 6 | daripada 7 | sambil 8 | sementara 9 | kecuali 10 | sekitar 11 | ke 12 | ia 13 | sampai 14 | ini 15 | bisa 16 | secara 17 | untuk 18 | pula 19 | yakni 20 | dst 21 | demi 22 | walau 23 | sudah 24 | kemana 25 | ok 26 | nanti 27 | saja 28 | bahwa 29 | telah 30 | yang 31 | ada 32 | seterusnya 33 | serta 34 | dahulu 35 | saat 36 | akan 37 | itulah 38 | saya 39 | sehingga 40 | karena 41 | adalah 42 | dua 43 | toh 44 | ya 45 | sesuatu 46 | nggak 47 | tidak 48 | para 49 | kepada 50 | jika 51 | melainkan 52 | anda 53 | atau 54 | dari 55 | pun 56 | pada 57 | dsb 58 | amat 59 | begitu 60 | sebelum 61 | menurut 62 | sebab 63 | seraya 64 | hanya 65 | antara 66 | sebetulnya 67 | seperti 68 | seolah 69 | selain 70 | di 71 | namun 72 | kembali 73 | setiap 74 | ketika 75 | maka 76 | mengapa 77 | dengan 78 | selagi 79 | lagi 80 | anu 81 | agak 82 | supaya 83 | dapat 84 | tapi 85 | masih 86 | tentu 87 | pasti 88 | bagi 89 | seharusnya 90 | tentang 91 | agar 92 | boleh 93 | ingin 94 | guna 95 | tolong 96 | apalagi 97 | utk 98 | kenapa 99 | yaitu 100 | dll 101 | dulunya 102 | itu 103 | dimana 104 | sedangkan 105 | lain 106 | kita 107 | dan 108 | mereka 109 | harus 110 | belum 111 | dia 112 | tetapi 113 | sesudah 114 | mari 115 | setidaknya 116 | oleh 117 | terhadap 118 | hal 119 | apakah 120 | demikian 121 | juga 122 | bagaimanapun 123 | setelah 124 | dalam 125 
| -------------------------------------------------------------------------------- /dags/preprocess/extraction.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from random import choice 4 | pd.set_option("display.max_columns", None) 5 | 6 | 7 | list_sentiment = ['positive', 'negative', 'neutral'] 8 | list_validity = ['fake', 'real'] 9 | 10 | data = pd.read_csv("final_dataset/clean_data.csv") 11 | 12 | source, keyword, timestamp, author, content, topic = [], [], [], [], [], [] 13 | for i, row in tqdm(data.iterrows()): 14 | _source = row['source'] 15 | _keyword = row['keyword'] 16 | _timestamp = row['timestamp'] 17 | _author = row['author'] 18 | _content = row['clean_content'] 19 | topics = row['topic_extract'] 20 | if isinstance(topics, float): 21 | continue 22 | topics = topics.split(", ") 23 | 24 | for _topic in topics: 25 | source.append(_source) 26 | keyword.append(_keyword) 27 | timestamp.append(_timestamp) 28 | author.append(_author) 29 | content.append(_content) 30 | topic.append(_topic) 31 | 32 | data_final = pd.DataFrame({ 33 | "source": source, 34 | "keyword": keyword, 35 | "timestamp": timestamp, 36 | "author": author, 37 | "content": content, 38 | "topic": topic 39 | }) 40 | 41 | data_final['timestamp'] = pd.to_datetime(data_final['timestamp']) 42 | data_final['timestamp'] = data_final['timestamp'].dt.strftime("%Y-%m-%d %H:%M:%S") 43 | data_final['timestamp'] = pd.to_datetime(data_final['timestamp'], format="%Y-%m-%d %H:%M:%S") 44 | data_final['sentiment'] = list(map(lambda _: choice(list_sentiment), range(len(data_final)))) 45 | data_final['source_validity'] = list(map(lambda _: choice(list_validity), range(len(data_final)))) 46 | data_final.to_csv("final_dataset/result.csv", index=False) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
ElectionAspectAnalyzer 2 | Indonesian-AI Final Project aimed at providing in-depth insights into the 2024 election through social network analysis and sentiment assessment. We employ Social Network Analysis (SNA) and Aspect-Based Sentiment Analysis (ABSA) techniques to understand public interactions and opinions regarding the election. 3 | 4 |
5 | Dashboard Interface 6 |
7 | 8 | ## TODO's 9 | 10 | - [x] Collecting Dataset 11 | - [x] Find Open Source Model to Enhance Insight from Dataset 12 | - [x] Model Integration 13 | - [x] Dashboard Creation 14 | - [x] Internal Testing 15 | - [ ] Write Proper Project Documentation 16 | 17 | ## How To Use 18 | 19 | ``` 20 | # Incoming... 21 | ``` 22 | 23 | ## Project Outcome 24 | Gain a comprehensive understanding of the 2024 election in Indonesia through the use of Social Network Analysis (SNA) and Aspect-Based Sentiment Analysis (ABSA) techniques. 25 | 26 | ## Impact Outcome 27 | 28 | 1. **Understanding Public Aspects regarding several Election Candidates** 29 | 2. **Assessing Public Sentiments** 30 | 3. **Proper Information using Interactive QA** 31 | 32 | ## Features 33 | 34 | - [x] SNA Graph 35 | - [x] Interactive QA 36 | 37 | ## Techniques Used 38 | 39 | - Prompting: One Shot Learning. 40 | - Part-Of-Speech for Aspect Detection. 41 | - Aspect Pair Classification for Sentiment Detection per Sentence & Aspect. 42 | - Named Entity Recognition for Person or Organization Recognition in Tweet Comments. 43 | 44 | ## Tech Stack 45 | 46 | - [OpenAI](https://github.com/openai/openai-python) 47 | - [Langchain](https://python.langchain.com/docs/get_started/introduction) 48 | - [Streamlit](https://streamlit.io/) 49 | - [Streamlit Agraph](https://github.com/ChrisDelClea/streamlit-agraph) 50 | 51 | ## Contributors 52 | 53 | - [Muhammad Alif Ramadhan](https://github.com/NnA301023) 54 | - [Devandra Alandra Wicaksono](https://github.com/DevaraAlandra) 55 | - [Eko Prasetyo](https://github.com/eko-prstyw) 56 | - [Yuliana](https://github.com/yuliana4763) 57 | - [Raphon Galuh C.](https://github.com/RaphonGaluh) 58 | 59 | ## Future Improvements 60 | 61 | Continue developing the MVP prototype into the mentioned diagram 62 | 
63 | Dashboard Interface 64 |
65 | -------------------------------------------------------------------------------- /src/extract_enrichment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | from typing import Union, List, Tuple 4 | 5 | 6 | def read_file(filename: str) -> Union[pd.DataFrame, None]: 7 | data = None 8 | try: 9 | if filename.endswith("csv"): 10 | data = pd.read_csv(filename) 11 | if filename.endswith("xlsx"): 12 | data = pd.read_excel(filename) 13 | data = data.dropna().reset_index(drop = True) 14 | except Exception as E: 15 | print(f"[ERROR] - {E}") 16 | return data 17 | 18 | def clean_result(result: str) -> List[str]: 19 | return result.split(": ")[-1].replace('[','').replace(']', '').replace('"', '').split(", ") 20 | 21 | def parse_result(result: str) -> Tuple[List[str], List[str], List[str]]: 22 | aspect, entity, sentiment = [], [], [] 23 | try: 24 | tokenize = [res for res in result.split("\n") if res != ""] 25 | entity_raw = clean_result(tokenize[0]) 26 | if len(entity_raw) >= 1 and entity_raw[0] != "": 27 | entity.extend(entity_raw) 28 | aspect_raw = clean_result(tokenize[1]) 29 | if len(aspect_raw) >= 1 and aspect_raw[0] != "": 30 | for asp_sen in aspect_raw: 31 | try: 32 | asp, sen = asp_sen.split("(") 33 | sen = sen.replace(")", "") 34 | aspect.append(asp.strip()) 35 | sentiment.append(sen.strip()) 36 | except ValueError as E: 37 | pass 38 | except AttributeError as E: 39 | print(f"[ERROR] - {E}") 40 | print(result) 41 | # print(f"[ENTITY]: {entity} - [ASPECT]: {aspect} - [SENTIMENT]: {sentiment}") 42 | return aspect, entity, sentiment 43 | 44 | def main(): 45 | 46 | parser = argparse.ArgumentParser( 47 | description = "Read and ELT .csv file using pandas" 48 | ) 49 | parser.add_argument("--filename", help = "Name of .csv file based on extracted dataset", required = True) 50 | args = parser.parse_args() 51 | filename = args.filename 52 | 53 | path = "dataset/data_clean.csv" 54 | data = 
read_file(filename = filename) 55 | aspects, entities, sentiments = [], [], [] 56 | for result in data['result extraction']: 57 | aspect, entity, sentiment = parse_result(result = result) 58 | aspects.append(",".join(aspect)) 59 | entities.append(",".join(entity)) 60 | sentiments.append(",".join(sentiment)) 61 | 62 | data["entity"] = entities 63 | data["aspect"] = aspects 64 | data["sentiment"] = sentiments 65 | 66 | data.to_csv(path, index = False) 67 | print(f'[INFO] - Save File into {path}') 68 | 69 | if __name__ == "__main__": 70 | """ 71 | Usage: 72 | python src/extract_enrichment.py --filename dataset/data_twitter_pemilu_2024_enrich.csv 73 | """ 74 | main() -------------------------------------------------------------------------------- /dags/preprocess/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from enrich import ( 6 | TopicGenerator, HoaxDetection, 7 | AspectDetection, KeywordExtraction 8 | ) 9 | tqdm.pandas() 10 | pd.set_option("display.max_columns", None) 11 | 12 | 13 | def clean_text(text, min_length=3): 14 | try: 15 | text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 16 | text = re.sub(r'\s+', ' ', text).strip() 17 | text = re.sub(r'[^a-zA-Z0-9.\s]', '', text) 18 | text = ' '.join(word for word in text.split() if len(word) >= min_length) 19 | except Exception as E: 20 | print(f'[ERROR] - {E}') 21 | print(text) 22 | return text 23 | 24 | 25 | def integration(folder): 26 | dataset = [] 27 | column_used = { 28 | "news": ["keyword", "publish_date", "publisher", "content"], 29 | "twitter": ["created_at", "username", "full_text"] 30 | } 31 | column_normalization = ["keyword", "timestamp", "author", "content"] 32 | for fname in os.listdir(folder): 33 | filename = os.path.join(folder, fname) 34 | source = fname.split("_")[0] 35 | column_required = column_used[source] 36 | df = pd.read_csv(filename, 
usecols=column_required) 37 | if len(df.columns) != len(column_normalization): 38 | keywords = " ".join(fname.split(".")[0].split("_")[1:]).title() 39 | keywords = [keywords] * len(df) 40 | df = df[column_required] 41 | df.insert(0, column_normalization[0], keywords) 42 | else: 43 | df = df[column_required] 44 | df.columns = column_normalization 45 | df.insert(0, 'source', source) 46 | dataset.append(df) 47 | data = pd.concat(dataset) 48 | data = data.dropna().reset_index(drop=True) 49 | return data 50 | 51 | 52 | if __name__ == "__main__": 53 | """ 54 | Usage 55 | ~/ElectionAspectAnalyzer/dags >>> python process.py 56 | """ 57 | 58 | result_data = integration("../crawler/final_dataset") 59 | result_data['clean_content'] = result_data['content'].progress_apply(clean_text) 60 | 61 | # Hoax Detection 62 | # hoax = HoaxDetection() 63 | # list_hoax = hoax.batch_inference(result_data['clean_content'].tolist()) 64 | # result_data['hoax_extract'] = list_hoax 65 | 66 | # Keyword Extraction 67 | # kw_extract = KeywordExtraction() 68 | # result_data['keyword_extract'] = result_data['clean_content'].progress_apply( 69 | # lambda i: ", ".join(kw_extract.single_inference(i)) 70 | # ) 71 | 72 | # Topic generation 73 | topic_gen = TopicGenerator() 74 | result_data['topic_extract'] = result_data['clean_content'].progress_apply( 75 | lambda i: ", ".join(topic_gen.generate_topic(i)) 76 | ) 77 | 78 | # Sentiment Extraction 79 | # ... 
80 | 81 | result_data.to_csv("final_dataset/clean_data.csv", index=False) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Taxonomy corpus private 163 | dags/preprocess/corpus 164 | dags/preprocess/final_dataset/result.csv 165 | dataset/result.csv -------------------------------------------------------------------------------- /requirements-data-engineer.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.25.0 2 | aiodns==3.1.1 3 | aiohttp==3.8.6 4 | aiohttp-socks==0.8.4 5 | aiosignal==1.3.1 6 | altair==4.0.0 7 | anyio==3.7.1 8 | astor==0.8.1 9 | asttokens==2.4.0 10 | async-generator==1.10 11 | async-timeout==4.0.3 12 | attrs==21.4.0 13 | backcall==0.2.0 14 | beautifulsoup4==4.11.1 15 | blinker==1.6.3 16 | blis==0.7.11 17 | bs4==0.0.1 18 | cachetools==5.3.2 19 | catalogue==2.0.10 20 | cchardet==2.1.7 21 | certifi==2022.5.18.1 22 | cffi==1.15.0 23 | charset-normalizer==2.0.12 24 | click==8.1.7 25 | cloudpathlib==0.16.0 26 | colorama==0.4.6 27 | comm==0.1.4 28 | confection==0.1.4 29 | contourpy==1.1.1 30 | cssselect==1.2.0 31 | cycler==0.12.1 32 | cymem==2.0.8 33 | dataclasses-json==0.6.1 34 | debugpy==1.8.0 35 | decorator==5.1.1 36 | distlib==0.3.4 37 | distro==1.7.0 38 | dnspython==1.16.0 39 | duckdb==0.8.1 40 | elastic-transport==8.10.0 41 | elasticsearch==8.11.0 42 | entrypoints==0.4 43 | et-xmlfile==1.1.0 44 | exceptiongroup==1.1.3 45 | executing==2.0.0 46 | fake-useragent==1.4.0 47 | feedfinder2==0.0.4 48 | feedparser==6.0.10 49 | filelock==3.7.0 50 | fonttools==4.43.1 51 | frozenlist==1.4.0 52 | fsspec==2023.12.1 53 | future==0.18.3 54 | geographiclib==2.0 55 | geopy==2.4.1 56 | gitdb==4.0.11 57 | GitPython==3.1.40 58 | gnews==0.3.6 59 | googletransx==2.4.2 60 | greenlet==3.0.1 61 | h11==0.13.0 62 | huggingface-hub==0.19.4 63 | idna==3.3 64 | importlib-metadata==4.11.3 65 | importlib-resources==6.1.0 66 | ipykernel==6.25.2 67 | ipython==8.16.1 68 | isodate==0.6.1 69 | 
jedi==0.19.1 70 | jieba3k==0.35.1 71 | Jinja2==3.1.2 72 | joblib==1.3.2 73 | jsonpatch==1.33 74 | jsonpointer==2.4 75 | jsonschema==4.19.1 76 | jsonschema-specifications==2023.7.1 77 | jupyter_client==8.4.0 78 | jupyter_core==5.4.0 79 | keybert==0.8.3 80 | kiwisolver==1.4.5 81 | langcodes==3.3.0 82 | langsmith==0.0.51 83 | lxml==4.9.3 84 | markdown-it-py==3.0.0 85 | MarkupSafe==2.1.3 86 | marshmallow==3.20.1 87 | matplotlib==3.8.0 88 | matplotlib-inline==0.1.6 89 | mdurl==0.1.2 90 | mozfile==2.1.0 91 | mpmath==1.3.0 92 | multidict==6.0.4 93 | murmurhash==1.0.10 94 | mypy-extensions==1.0.0 95 | nest-asyncio==1.5.8 96 | networkx==3.2 97 | newspaper3k==0.2.8 98 | nltk==3.8.1 99 | numpy==1.26.2 100 | openai==0.27.10 101 | openpyxl==3.1.2 102 | packaging==21.3 103 | pandas==1.3.5 104 | parso==0.8.3 105 | pep517==0.12.0 106 | pickleshare==0.7.5 107 | Pillow==10.1.0 108 | pke @ git+https://github.com/boudinfl/pke.git@69871ffdb720b83df23684fea53ec8776fd87e63 109 | platformdirs==2.5.2 110 | platinfo==0.15.0 111 | preshed==3.0.9 112 | progressbar2==4.0.0 113 | prompt-toolkit==3.0.39 114 | protobuf==3.20.3 115 | psutil==5.9.6 116 | pure-eval==0.2.2 117 | pyarrow==13.0.0 118 | pycares==4.4.0 119 | pycparser==2.21 120 | pydantic==1.10.13 121 | pydeck==0.8.1b0 122 | Pygments==2.16.1 123 | pymongo==3.12.3 124 | Pympler==1.0.1 125 | pyparsing==3.0.9 126 | PySocks==1.7.1 127 | python-dateutil==2.8.2 128 | python-dotenv==0.19.2 129 | python-socks==2.4.3 130 | python-utils==3.3.0 131 | pytz==2022.1 132 | pywin32==306 133 | pywin32-ctypes==0.2.0 134 | PyYAML==6.0.1 135 | pyzmq==25.1.1 136 | rdflib==7.0.0 137 | redo==2.0.3 138 | referencing==0.30.2 139 | regex==2023.10.3 140 | requests==2.27.1 141 | requests-file==1.5.1 142 | rich==13.6.0 143 | rpds-py==0.10.6 144 | safetensors==0.4.1 145 | schedule==1.2.1 146 | scikit-learn==1.3.2 147 | scipy==1.11.3 148 | semver==3.0.2 149 | sentence-transformers==2.2.2 150 | sentencepiece==0.1.99 151 | sgmllib3k==1.0.0 152 | six==1.16.0 153 | 
smart-open==6.4.0 154 | smmap==5.0.1 155 | sniffio==1.2.0 156 | sortedcontainers==2.4.0 157 | soupsieve==2.3.2.post1 158 | spacy==3.7.2 159 | spacy-legacy==3.0.12 160 | spacy-loggers==1.0.5 161 | SQLAlchemy==1.4.49 162 | srsly==2.4.8 163 | stack-data==0.6.3 164 | streamlit==1.12.0 165 | streamlit-agraph==0.0.45 166 | style==1.1.0 167 | sympy==1.12 168 | tabulate==0.9.0 169 | tenacity==8.2.3 170 | thinc==8.2.1 171 | threadpoolctl==3.2.0 172 | tinysegmenter==0.3 173 | tldextract==5.1.1 174 | tokenizers==0.15.0 175 | toml==0.10.2 176 | tomli==2.0.1 177 | toolz==0.12.0 178 | torch==2.1.1 179 | torchvision==0.16.1 180 | tornado==6.3.3 181 | tqdm==4.66.1 182 | traitlets==5.11.2 183 | transformers==4.35.2 184 | twint==2.1.20 185 | typer==0.9.0 186 | typing-inspect==0.9.0 187 | typing_extensions==4.2.0 188 | tzdata==2023.3 189 | tzlocal==5.2 190 | Unidecode==1.3.7 191 | update==0.0.1 192 | urllib3==1.26.9 193 | validators==0.22.0 194 | virtualenv==20.14.1 195 | wasabi==1.1.2 196 | watchdog==3.0.0 197 | wcwidth==0.2.8 198 | weasel==0.3.4 199 | wsproto==1.1.0 200 | yarl==1.9.2 201 | zipp==3.8.0 202 | -------------------------------------------------------------------------------- /dags/crawler/news_scrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | from tqdm import tqdm 4 | from gnews import GNews 5 | from bs4 import BeautifulSoup 6 | from datetime import timedelta, date 7 | from newspaper import Config, Article 8 | from urllib.parse import unquote, urlparse 9 | 10 | 11 | # Instantiate object 12 | config = Config() 13 | config.browser_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" 14 | 15 | def parsing_rss_url(rss_url): 16 | parse_url = None 17 | try: 18 | resp = requests.get(rss_url) 19 | soup = BeautifulSoup(resp.content, "html.parser") 20 | links = soup.find_all("link") 21 | for link in links: 22 | link 
= link.get("href") 23 | # TODO: Implement proper logic. 24 | try: 25 | if "embed" in link: 26 | parse_url = unquote(urlparse(link).query.split("=")[1]) 27 | break 28 | if "amp" in link: 29 | parse_url = unquote(link) 30 | except Exception as E: 31 | print(f"Error Occurede, Required Improve Link Parser: {links}") 32 | except Exception as E: 33 | print(f"Connection Broken with Error: {E}") 34 | return parse_url 35 | 36 | # NOTE: This solution is generated based on @alearjun comment on Gnews Issue. 37 | def crawling_news(keyword, start_date=date(2023, 1, 1), total_news=1000): 38 | list_url = [] 39 | list_title = [] 40 | list_article = [] 41 | list_authors = [] 42 | list_publisher = [] 43 | list_description = [] 44 | list_published_date = [] 45 | n_news = 0 46 | n_tolerance = 0 47 | google_news = GNews(language="id", country="ID") 48 | max_tolerance = 250 49 | while n_news < total_news: 50 | scope_date = start_date + timedelta(days = 8) 51 | google_news.start_date = (start_date.year, start_date.month, start_date.day) 52 | google_news.end_date = (scope_date.year, scope_date.month, scope_date.day) 53 | results = google_news.get_news(keyword) 54 | for res in results: 55 | print('[INFO] - Tolerance: ', n_tolerance) 56 | if n_tolerance > max_tolerance: 57 | n_news = 10_000 58 | break 59 | print(f'Total News: {n_news}') 60 | url = res['url'] 61 | url = parsing_rss_url(url) 62 | if url is None: 63 | n_tolerance += 1 64 | continue 65 | try: 66 | article = Article(url, config=config) 67 | article.download() 68 | article.parse() 69 | except Exception: 70 | n_tolerance += 1 71 | pass 72 | if n_news >= total_news: 73 | break 74 | if ( 75 | url in list_url or 76 | article is None or 77 | len(article.text.strip()) == 0 78 | ): 79 | n_tolerance += 1 80 | continue 81 | else: 82 | list_url.append(url) 83 | list_title.append(res['title']) 84 | list_article.append(article.text) 85 | list_authors.append(", ".join(article.authors)) 86 | list_publisher.append(res['publisher']['title']) 87 | 
list_description.append(res['description']) 88 | list_published_date.append(res['published date']) 89 | n_news += 1 90 | start_date += timedelta(days = 7) 91 | return ( 92 | list_url, list_title, list_article, 93 | list_authors, list_publisher, list_description, 94 | list_published_date 95 | ) 96 | 97 | if __name__ == "__main__": 98 | 99 | # Define List of keywords 100 | list_keywords = [ 101 | "Anies Baswedan", 102 | "Muhaimin Iskandar", 103 | "Ganjar Pranowo", 104 | "Mahfud MD", 105 | "Gibran Rakabuming", 106 | "Prabowo Subianto" 107 | ] 108 | for keyword in tqdm(list_keywords, desc="Crawling Google News..."): 109 | keywords = [] 110 | urls, titles, contents, authors, publisher, description, publish_date = [], [], [], [], [], [], [] 111 | ( 112 | list_url, list_title, list_article, 113 | list_authors, list_publisher, 114 | list_description, list_published_date 115 | ) = crawling_news(keyword=keyword) 116 | urls.extend(list_url) 117 | titles.extend(list_title) 118 | contents.extend(list_article) 119 | authors.extend(list_authors) 120 | publisher.extend(list_publisher) 121 | description.extend(list_description) 122 | publish_date.extend(list_published_date) 123 | keywords.extend([keyword] * len(list_url)) 124 | 125 | # Check dimension 126 | print( 127 | len(urls), len(titles), len(contents), len(authors), 128 | len(publisher), len(description), len(publish_date) 129 | ) 130 | 131 | # Save into csv 132 | df = pd.DataFrame({ 133 | "keyword": keywords, 134 | "url": urls, 135 | "title": titles, 136 | "content": contents, 137 | "author": authors, 138 | "publisher": publisher, 139 | "description": description, 140 | "publish_date": publish_date 141 | }) 142 | df.to_csv(f"result_news_{keyword.replace(' ', '_')}.csv", index=False) -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import pandas as pd 3 | import streamlit as st 4 | 
from src.secret import OPENAI_KEY 5 | from typing import Tuple, List, Dict 6 | from streamlit_agraph import agraph, Node, Edge, Config 7 | openai.api_key = OPENAI_KEY 8 | st.set_page_config(layout = "wide") 9 | 10 | @st.cache 11 | def load_clean_data(dataset: str = "dataset/result.csv") -> pd.DataFrame: 12 | data = pd.read_csv(dataset) 13 | data = data.drop_duplicates(subset=['keyword', 'author', 'topic']) 14 | data['keyword'] = data["keyword"].replace("Mahfud Md", "Mahfud MD") 15 | # data = data.sample(500).reset_index(drop=True) 16 | data = pd.concat([df.sample(100, replace=True) for _, df in data.groupby("keyword")]) 17 | return data 18 | 19 | @st.cache 20 | def create_graph(data_filter: pd.DataFrame, use_sentiment_aspect: bool = False, mapping_sentiment: Dict[str, str] = { 21 | "positive" : "#B3FFAE", "negative" : "#FF7D7D", "neutral" : "#F8FFDB" 22 | }) -> Tuple[List[str], List[str]]: 23 | # aspect_global = [] 24 | # nodes, edges = [], [] 25 | # candidates, organizations = [], [] 26 | # for _, i in data_filter.iterrows(): 27 | # candidate_name = i['name'] 28 | # organization = i['entity'] 29 | # if organization != "" and isinstance(organization, str): 30 | # for person in organization.split(","): 31 | # if candidate_name not in candidates: 32 | # nodes.append(Node(id = candidate_name, label = candidate_name, symbolType = "diamond", color = "#FFF6F6", size = 20)) 33 | # candidates.append(candidate_name) 34 | # if person not in organizations and person not in candidates and person not in aspect_global: 35 | # nodes.append(Node(id = person, label = person, color = "#A7D397", size = 15)) 36 | # organizations.append(person) 37 | # edges.append(Edge(source = person, target = candidate_name)) 38 | # if use_sentiment_aspect: 39 | # sentiments = i['sentiment'] 40 | # aspects = i['aspect'] 41 | # if aspects != "" and isinstance(aspects, str): 42 | # for aspect, sentiment in zip(aspects.split(","), sentiments.split(",")): 43 | # if aspect not in aspect_global and aspect 
not in organizations and aspect not in candidates: 44 | # # print(f'[ASPECT] - {aspect} is not available in, orgs: {aspect not in organizations} asp: {aspect not in aspect_global}') 45 | # nodes.append(Node(id = aspect, label = aspect, size = 10, color = mapping_sentiment.get(sentiment))) 46 | # edges.append(Edge(source = aspect, target = person, label = sentiment)) 47 | # aspect_global.append(aspect) 48 | aspect_global = [] 49 | nodes, edges = [], [] 50 | candidates, authors = [], [] 51 | for _, i in data_filter.iterrows(): 52 | candidate = i['keyword'] 53 | author = i['author'] 54 | if author != "" and isinstance(author, str): 55 | if candidate not in candidates: 56 | nodes.append(Node(id = candidate, label = candidate, symbolType = "diamond", color = "#FFF6F6", size = 25)) 57 | candidates.append(candidate) 58 | elif author not in authors: 59 | nodes.append(Node(id = author, label = author, symbolType = "diamond", color = "#A7D397", size = 15)) 60 | authors.append(author) 61 | # elif author in authors and candidate in candidates: 62 | edges.append(Edge(source = author, target = candidate)) 63 | if use_sentiment_aspect: 64 | sentiment = i['sentiment'] 65 | aspect = i['topic'] 66 | if aspect != "" and isinstance(aspect, str) and aspect not in aspect_global: 67 | nodes.append(Node(id = aspect, label = aspect, size = 10, color = mapping_sentiment.get(sentiment))) 68 | edges.append(Edge(source = aspect, target = author, label = sentiment)) 69 | aspect_global.append(aspect) 70 | return nodes, edges 71 | 72 | def prompt_qa(data: pd.DataFrame, query: str) -> str: 73 | prompt = \ 74 | f""" 75 | data = {data.to_dict('records')} 76 | 77 | jawaban pertanyaan berikut berdasarkan informasi diatas yang diolah sesuai reasoning yang masuk akal. 78 | pertanyaan: Siapa Pemenang pemilu 2024? 
79 | jawaban: Ganjar Pranowo 80 | 81 | pertanyaan: {query} 82 | 83 | dengan format dibawah: 84 | jawaban: 85 | """ 86 | return prompt 87 | 88 | 89 | def agent_qa_zero_shot(data: pd.DataFrame, query: str, model_base: str = "gpt-3.5-turbo-16k"): 90 | token_usage = 0 91 | response_extraction = "" 92 | try: 93 | response = openai.ChatCompletion.create( 94 | model = model_base, 95 | messages = [{"role" : "user", "content" : prompt_qa(data, query)}], 96 | temperature = 0.5, max_tokens = 512, top_p = 1.0, 97 | frequency_penalty = 0.0, presence_penalty = 0.0 98 | ) 99 | response_extraction = response["choices"][0]["message"]["content"] 100 | token_usage = response["usage"]["total_tokens"] 101 | except Exception as E: 102 | print(f"[ERROR] - {E}") 103 | print("Retry with Recursive Func") 104 | # agent_qa_zero_shot(data, query) 105 | return response_extraction, token_usage 106 | 107 | 108 | def app(data: pd.DataFrame, config: Config): 109 | 110 | # Interface section 111 | st.sidebar.header("ElectionAspectAnalyzer v.0.1") 112 | 113 | # Sidebar section 114 | candidates = data["keyword"].unique().tolist() 115 | filter_candidate = st.sidebar.multiselect( 116 | "Select Candidates:", 117 | options = candidates, 118 | default = candidates[:3] 119 | ) 120 | filter_data = data[data['keyword'].isin(filter_candidate)].reset_index(drop = True) 121 | use_aspect_sentiment = st.sidebar.checkbox("Use Aspect-Sentiment") 122 | 123 | # Graph section 124 | with st.spinner("Preprocess Data..."): 125 | filter_node, filter_edge = create_graph(filter_data, use_sentiment_aspect = use_aspect_sentiment) 126 | st.success("Total Nodes Loaded: " + str(len(filter_node))) 127 | return_value = agraph( 128 | nodes = filter_node, 129 | edges = filter_edge, 130 | config = config 131 | ) 132 | 133 | # QnA section 134 | # NOTE: Reduce token usage OpenAI Cost :) 135 | data_sample = pd.concat([df.sample(3, replace=False) for _, df in filter_data.groupby("keyword")]) 136 | query = st.sidebar.text_input(label = "Any 
Question about Election 2024?") 137 | if query != "": 138 | response, _ = agent_qa_zero_shot(data = data_sample, query = query) 139 | st.sidebar.success(response) 140 | 141 | if __name__ == "__main__": 142 | config = Config( 143 | width = 1000, height = 500, 144 | directed = True, physics = True, hierarchical = False 145 | ) 146 | data = load_clean_data() 147 | app(data = data, config = config) -------------------------------------------------------------------------------- /dags/preprocess/enrich.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from tqdm import tqdm 4 | from keybert import KeyBERT 5 | from typing import List, Tuple 6 | from nltk.corpus import stopwords 7 | from warnings import filterwarnings 8 | from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification 9 | filterwarnings("ignore") 10 | 11 | 12 | class TopicGenerator: 13 | def __init__(self, corpus_path: str = "corpus/taxo.xlsx"): 14 | self.corpus_path = corpus_path 15 | taxonomy_general = self._load_taxonomy("General ID") 16 | taxonomy_specific = self._load_taxonomy("Government ID") 17 | self.attr_noun = pd.concat([taxonomy_specific, taxonomy_general]) 18 | 19 | def _load_taxonomy(self, sheet_name: str) -> pd.DataFrame: 20 | required_columns = ['category', 'attr_noun'] 21 | normalize_columns = ['category', 'attr_noun'] 22 | return self._process_corpus(sheet_name, required_columns, normalize_columns) 23 | 24 | def _process_corpus(self, sheet_name: str, required_columns: List[str], normalize_columns: List[str]) -> pd.DataFrame: 25 | data = pd.read_excel(self.corpus_path, usecols=required_columns, sheet_name=sheet_name) 26 | data.columns = normalize_columns 27 | return data.dropna().drop_duplicates().reset_index(drop=True) 28 | 29 | @staticmethod 30 | def _apply_regex(phrase: List[str]) -> str: 31 | return r"\b(?:" + "|".join(map(re.escape, phrase)) + r")\b" 32 | 33 | @staticmethod 34 | def 
_process_term(list_term: List[str]) -> Tuple[List[str], List[str]]: 35 | single_term, phrase_term = [], [] 36 | for word in list_term: 37 | try: 38 | if word == "": 39 | continue 40 | if len(word.split()) == 1: 41 | single_term.append(word) 42 | else: 43 | phrase_term.append(word) 44 | except AttributeError: 45 | continue 46 | return single_term, phrase_term 47 | 48 | def generate_topic(self, content: str) -> List[str]: 49 | attr_noun_single, attr_noun_phrase = self._process_term(self.attr_noun['attr_noun']) 50 | attr_noun_single = self.attr_noun[self.attr_noun['attr_noun'].isin(attr_noun_single)].reset_index(drop=True) 51 | attr_noun_phrase = self.attr_noun[self.attr_noun['attr_noun'].isin(attr_noun_phrase)].reset_index(drop=True) 52 | regex_noun_phrase = self._apply_regex(attr_noun_phrase['attr_noun'].str.lower().tolist()) 53 | regex_noun_single = self._apply_regex(attr_noun_single['attr_noun'].str.lower().tolist()) 54 | list_noun_phrase = attr_noun_phrase['attr_noun'].str.lower().tolist() 55 | list_noun_single = attr_noun_single['attr_noun'].str.lower().tolist() 56 | 57 | content = content.lower() 58 | aspect_term, aspect_category = [], [] 59 | phrase_noun = re.findall(regex_noun_phrase, content) 60 | single_noun = re.findall(regex_noun_single, content) 61 | if phrase_noun: 62 | for phrase in phrase_noun: 63 | for word in phrase.split(): 64 | if word in single_noun: 65 | single_noun.remove(word) 66 | if phrase not in aspect_term: 67 | aspect_term.append(phrase) 68 | aspect_category.append(attr_noun_phrase['category'][list_noun_phrase.index(phrase)]) 69 | if single_noun: 70 | for word in single_noun: 71 | if word not in aspect_term: 72 | aspect_term.append(word) 73 | aspect_category.append(attr_noun_single['category'][list_noun_single.index(word)]) 74 | 75 | aspect_category = list(set(aspect_category)) 76 | return aspect_category 77 | 78 | 79 | class AspectDetection: 80 | 81 | def __init__(self, model="mdhugol/indonesia-bert-sentiment-classification", task = 
"text-classification"): 82 | tokenizer = AutoTokenizer.from_pretrained(model) 83 | finetune_model = AutoModelForSequenceClassification.from_pretrained(model) 84 | self.pipe = pipeline(task, model=finetune_model, tokenizer=tokenizer) 85 | self.label_index = {'LABEL_0': 'positif', 'LABEL_1': 'netral', 'LABEL_2': 'negatif'} 86 | 87 | def preprocess(self, content, aspect_category): 88 | return f'[CLS] {content} [ASP] {aspect_category} [ASP]' 89 | 90 | def single_inference(self, content, aspect_category): 91 | results = self.pipe(self.preprocess(content, aspect_category)) 92 | results = list(map(lambda i: self.label_index[i['label']], results)) 93 | return results 94 | 95 | 96 | class HoaxDetection: 97 | 98 | def __init__(self, model="khavitidala/xlmroberta-large-fine-tuned-indo-hoax-classification", task = "text-classification"): 99 | self.pipe = pipeline(task, model=model, max_length=512, truncation=True) 100 | 101 | def batch_inference(self, contents): 102 | results = [] 103 | for content in tqdm(contents): 104 | result = self.pipe(content) 105 | res = list(map(lambda i: i['label'], result)) 106 | results.extend(res) 107 | return results 108 | 109 | class KeywordExtraction: 110 | 111 | def __init__(self): 112 | self.extract = KeyBERT(model="indobenchmark/indobert-base-p1") 113 | stop_words = list(set(stopwords.words('indonesian'))) 114 | data = 'https://raw.githubusercontent.com/Braincore-id/IndoTWEEST/main/stopwords_twitter.csv' 115 | df_stopwords = pd.read_csv(data, names=['stopword']) 116 | stop_words = stop_words + df_stopwords['stopword'].unique().tolist() 117 | self.stopwords = list(set(stop_words)) 118 | 119 | def single_inference(self, content): 120 | result = self.extract.extract_keywords( 121 | content, keyphrase_ngram_range = (1, 1), 122 | stop_words=self.stopwords 123 | ) 124 | result = list(map(lambda i: i[0], result)) 125 | return result 126 | 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | """ 132 | Aspect Category | Aspect Sentiment | 
Keywords (top 5) | Hoax Classification
    """
    # Define example content
    content = "VIVA Politik Anies Baswedan akan kembali melakukan safari politik seluruh Tanah Air. Akhir Januari 2023 ini Anies akan mengunjungi NTB. Beberapa wilayah Pulau Lombok dan Sumbawa telah disiapkan. Bakal calon presiden yang diusung Partai NasDem Anies Baswedan dijadwalkan akan mengunjungi Nusa Tenggara Barat pada 3031 Januari 2023 mendatang. Agendanya satu hari Pulau Lombok dan satu hari Pulau Sumbawa kata Ketua Panitia yang juga Ketua DPD Partai NasDem Lombok Timur Rumaksi Kantor DPW Partai NasDem NTB Kota Mataram Jumat dikutip dari Antara. mengatakan dalam kunjungannya NTB Anies Baswedan dijadwalkan akan bertemu dengan sejumlah tokoh agama tokoh masyarakat pemuda dan relawan dari lintas agama baik yang ada Pulau Lombok dan Pulau Sumbawa. Selain bertemu dengan tokoh lintas agama dan relawan. Anies Baswedan juga akan mengunjungi Desa Wisata Sade Lombok Tengah. Kemudian juga Pondok Pesantren Yatofa Bodak Lombok Tengah. Ponpes Yatofa Pak Anies akan bersilaturahmi dan melakukan pengajian bersama pimpinan pondok pesantren dan masyarakat ujarnya. Setelah itu dilanjutkan dengan mengunjungi peternak sapi Desa Wanaseba Kabupaten Lombok Timur. Tidak hanya itu bakal calon presiden Partai NasDem itu juga akan melaksanakan shalat berjemaah Masjid Jamik Masbagik Lombok Timur. Setelah dari Masjid Masbagik Pak Anies akan diarak pakai kuda menuju Lapangan Gotong Royong Masbagik. Pak Anies akan mengukuhkan pengurus ranting Partai Nasdem sePulau Lombok. Dari situ melanjutkan perjalanan Lombok Barat untuk silaturahmi dengan tokoh agama lintas agama bersama Bupati Lombok Barat selaku Ketua Dewan Pakar Partai NasDem terang Rumaksi. Kemudian pada Januari Anies Baswedan akan terbang Pulau Sumbawa. Pulau Sumbawa Anies Baswedan akan mengunjungi Kota Bima. Setelah itu akan bergerak menuju Kabupaten Sumbawa. Jadi kegiatan Pulau Sumbawa juga sama dengan yang ada Lombok ucapnya didampingi anggota DPR dapil NTB dari Fraksi Partai NasDem Syamsul Luthfi Ketua DPD NasDem Lombok Tengah Syamsul Hadi Ketua DPD NasDem Lombok Barat Tarmizi dan Ketua DPD NasDem Kota Bima Muthmainnah. Menurut dia panitia daerah siap mengawal kunjungan mantan Gubernur DKI Jakarta tersebut selama mengunjungi NTB bahkan DPW Partai NasDem memastikan bahwa kedatangan bakal calon presiden Anies Baswedan NTB akan berjalan aman dan kondusif. Karena itu pihaknya mempersilakan bagi siapapun khususnya tim relawan untuk melakukan komunikasi dengan DPW Partai NasDem terkait rencana kedatangan Anies Baswedan NTB. Khusus saat pertemuan Ponpes Yatofa akan ada kejutan yang disampaikan. Cuma apa kejutan itu nanti disampaikan saat Pak Anies Baswedan datang NTB katanya. Ant"

    # Example usage of TopicGenerator class
    generator = TopicGenerator()
    topics = generator.generate_topic(content)
    print(topics)
    # >>> ['Community', 'Institution', 'Public Figure', 'Leader', 'Transportation', 'Networking', 'Activities', 'Spokesperson', 'Social Media', 'Product Launch', 'Public Rally', 'Volunteer', 'Public Opinion', 'Election', 'Tourism', 'Geographical', 'Political Parties']

    # Example usage of HoaxDetection class
    hoax_detection = HoaxDetection()
    result = hoax_detection.batch_inference([content])
    print(result)
    # >>> ['Fakta']

    # Example usage of AspectDetection class
    aspect_detection = AspectDetection()
    for topic in topics:
        res = aspect_detection.single_inference(content, topic)
        print(f"{topic} - {res}")
    # >>> Institution - ['netral']
    # >>> Networking - ['netral']
    # >>> Public Rally - ['netral']

    # Example usage of KeywordExtraction class (comment previously mislabeled "HoaxDetection")
    keyword_extraction = KeywordExtraction()
    result = keyword_extraction.single_inference(content)
    print(result)
    # >>> ['agendanya', 'anies', 
'nasdem', 'sumbawa', 'ntb'] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /notebook/enrichment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "```\n", 8 | "Proof on Concept:\n", 9 | "\n", 10 | "Twitter Post (1 baris):\n", 11 | "\n", 12 | " (ideal) \n", 13 | " - Filtrasi keyword (udah ada di dalam dataset)\n", 14 | " - Ekstraksi Person (by PoS atau @)\n", 15 | " - Ekstraksi Aspect (by Noun dari NER / PoS / KBBI)\n", 16 | " - Generate Sentiment (per aspect terdeteksi) -> translate ke bahasa inggris (bert pair cls)\n", 17 | " \n", 18 | " (mvp)\n", 19 | " - utilize openAI at all. :D\n", 20 | "\n", 21 | "Expected Result (tabular format)\n", 22 | "\n", 23 | "source data:\n", 24 | "| name | tweets | re-tweets | ... 
|\n", 25 | "\n", 26 | "result enrichment:\n", 27 | "| name | tweets | re-tweets | Person / Organization (NER) | Aspect - Sentiment (ABSA) | Topic - (input by user)\n", 28 | "```" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 36, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Load libraries \n", 38 | "import os\n", 39 | "import re\n", 40 | "import time\n", 41 | "import openai \n", 42 | "import pandas as pd \n", 43 | "from tqdm import tqdm\n", 44 | "from typing import Tuple\n", 45 | "from dotenv import load_dotenv\n", 46 | "\n", 47 | "\n", 48 | "load_dotenv()\n", 49 | "pd.set_option(\"display.max_columns\", None)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 11, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# Setting credentials\n", 59 | "OPENAI_KEY = os.getenv(\"OPENAI_API_KEY\", default = None) \n", 60 | "openai.api_key = OPENAI_KEY" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 37, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | "
nametextrtid
0prabowoMegawati Soekarnoputri, diyakini akan menjadik...01552261054964461568
1prabowoDiremehkan, Citra Pak @prabowo menjadi terting...31551415694738313216
2prabowoDulu Tuhan disuruh menangin Prabowo atau kagak...01551415694738313216
3prabowo@SantorinisSun Loh miss valak masih menyembah ...01551415694738313216
4prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
\n", 133 | "
" 134 | ], 135 | "text/plain": [ 136 | " name text rt \\\n", 137 | "0 prabowo Megawati Soekarnoputri, diyakini akan menjadik... 0 \n", 138 | "1 prabowo Diremehkan, Citra Pak @prabowo menjadi terting... 3 \n", 139 | "2 prabowo Dulu Tuhan disuruh menangin Prabowo atau kagak... 0 \n", 140 | "3 prabowo @SantorinisSun Loh miss valak masih menyembah ... 0 \n", 141 | "4 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 142 | "\n", 143 | " id \n", 144 | "0 1552261054964461568 \n", 145 | "1 1551415694738313216 \n", 146 | "2 1551415694738313216 \n", 147 | "3 1551415694738313216 \n", 148 | "4 1552234605419237376 " 149 | ] 150 | }, 151 | "execution_count": 37, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# Load dataset\n", 158 | "data = pd.read_csv(\"../dataset/data_twitter_pemilu_2024.csv\")\n", 159 | "data.head()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 38, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "False 625\n", 171 | "True 160\n", 172 | "Name: count, dtype: int64" 173 | ] 174 | }, 175 | "execution_count": 38, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "# Data Duplicate checking\n", 182 | "data.duplicated(subset = ['text', 'id', 'rt']).value_counts()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 39, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/html": [ 193 | "
\n", 194 | "\n", 207 | "\n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | "
nametextrtid
10prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
15prabowoKapolri Jenderal Listyo Sigit Prabowo mengatak...11552476373855244289
21prabowoNgopi daring tayang siang ini di Youtube @kemh...21551476092749447168
22prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
23prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
33prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
37prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
42prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
45prabowoCatat nih, Pak Prabowo menduduki tempat dipunc...21551415694880677891
49prabowoYth bapak Presiden republik Indonesia Ir Haji ...391552234605419237376
\n", 290 | "
" 291 | ], 292 | "text/plain": [ 293 | " name text rt \\\n", 294 | "10 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 295 | "15 prabowo Kapolri Jenderal Listyo Sigit Prabowo mengatak... 1 \n", 296 | "21 prabowo Ngopi daring tayang siang ini di Youtube @kemh... 2 \n", 297 | "22 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 298 | "23 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 299 | "33 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 300 | "37 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 301 | "42 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 302 | "45 prabowo Catat nih, Pak Prabowo menduduki tempat dipunc... 2 \n", 303 | "49 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n", 304 | "\n", 305 | " id \n", 306 | "10 1552234605419237376 \n", 307 | "15 1552476373855244289 \n", 308 | "21 1551476092749447168 \n", 309 | "22 1552234605419237376 \n", 310 | "23 1552234605419237376 \n", 311 | "33 1552234605419237376 \n", 312 | "37 1552234605419237376 \n", 313 | "42 1552234605419237376 \n", 314 | "45 1551415694880677891 \n", 315 | "49 1552234605419237376 " 316 | ] 317 | }, 318 | "execution_count": 39, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "# Overview duplicated data\n", 325 | "data[data.duplicated(subset = ['text', 'id', 'rt'])].head(10)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 40, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "# Duplicate data filtering\n", 335 | "data = data.drop_duplicates(subset = ['text', 'id', 'rt'])" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 41, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "False 625\n", 347 | "Name: count, dtype: int64" 348 | ] 349 | }, 350 | "execution_count": 41, 351 | "metadata": {}, 352 | 
"output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "# Data Duplicate checking - validation\n", 357 | "data.duplicated(subset = ['text', 'id', 'rt']).value_counts()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 42, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "# Define prompt and ingestion script\n", 367 | "def prompt_enrichment(tweet_comment: str) -> str:\n", 368 | " prompt = \\\n", 369 | " f\"\"\"\n", 370 | " Ekstraksi informasi yang dibutuhkan berdasarkan komentar twitter dibawah, dengan response cukup sesuai yang di definisikan tanpa penjelasan tambahan.\n", 371 | "\n", 372 | " komentar_twitter: \"{tweet_comment}\"\n", 373 | "\n", 374 | " Untuk response cukup isi dengan format dibawah.\n", 375 | " named_entity_recognition: [Jawaban anda: cakupan NER sesuai label \"PERSON\" atau \"ORGANIZATION\" saja]\n", 376 | " aspect_sentiment: [Identifikasi verb / noun-phrase hasil dari part-of-speech di dalam komentar, disertai dengan nilai sentiment masing-masing aspect dengan format ]\n", 377 | " \"\"\"\n", 378 | " return prompt\n", 379 | "\n", 380 | "def ingest_openai(tweet_comment: str, model_base: str = \"gpt-3.5-turbo\") -> Tuple[str, int]: \n", 381 | " token_usage = 0\n", 382 | " response_extraction = \"\"\n", 383 | " try:\n", 384 | " response = openai.ChatCompletion.create(\n", 385 | " model = model_base, \n", 386 | " messages = [{\"role\" : \"user\", \"content\" : prompt_enrichment(tweet_comment)}], \n", 387 | " temperature = 0.1, max_tokens = 512, top_p = 1.0, \n", 388 | " frequency_penalty = 0.0, presence_penalty = 0.0\n", 389 | " )\n", 390 | " response_extraction = response[\"choices\"][0][\"message\"][\"content\"]\n", 391 | " token_usage = response[\"usage\"][\"total_tokens\"]\n", 392 | " except Exception as E:\n", 393 | " print(f\"[ERROR] - {E}\")\n", 394 | " print(\"Retry with Recursive Func\")\n", 395 | " time.sleep(5)\n", 396 | " ingest_openai(tweet_comment = tweet_comment)\n", 397 | 
" return response_extraction, token_usage" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 45, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "[COMMENT]\n", 410 | "Puan tak masalah bahkan Ganjar jadi salah satu bacapres. Waktunya Puan Maharani\n", 411 | "[RESULT - Token Usage: 216]\n", 412 | "named_entity_recognition: [Puan, Ganjar, Puan Maharani]\n", 413 | "aspect_sentiment: [Puan (positive), Ganjar (positive), bacapres (positive), Waktunya Puan Maharani (neutral)]\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "# Test ingestion\n", 419 | "comment = data['text'].sample(1).values[0]\n", 420 | "extraction, token_usage = ingest_openai(tweet_comment = comment)\n", 421 | "print(f\"[COMMENT]\\n{comment}\\n[RESULT - Token Usage: {token_usage}]\\n{extraction}\")" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 46, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# Apply on entire dataset\n", 431 | "final_result_extraction, final_token_usage = [], []" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 48, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stderr", 441 | "output_type": "stream", 442 | "text": [ 443 | "Ingestion Start: 13%|█▎ | 84/625 [10:42<46:56, 5.21s/it] " 444 | ] 445 | }, 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 451 | "Retry with Recursive Func\n" 452 | ] 453 | }, 454 | { 455 | "name": "stderr", 456 | "output_type": "stream", 457 | "text": [ 458 | "Ingestion Start: 25%|██▌ | 158/625 [28:13<33:08, 4.26s/it] " 459 | ] 460 | }, 461 | { 462 | "name": "stdout", 463 | "output_type": "stream", 464 | "text": [ 465 | "[ERROR] - The server is overloaded or not ready yet.\n", 466 | "Retry with Recursive Func\n" 467 | ] 468 | }, 469 | { 470 | "name": "stderr", 471 | "output_type": "stream", 472 | "text": [ 473 | "Ingestion Start: 45%|████▌ | 284/625 [40:18<22:14, 3.91s/it] " 474 | ] 475 | }, 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 481 | "Retry with Recursive Func\n" 482 | ] 483 | }, 484 | { 485 | "name": "stderr", 486 | "output_type": "stream", 487 | "text": [ 488 | "Ingestion Start: 61%|██████ | 379/625 [59:05<35:12, 8.59s/it] " 489 | ] 490 | }, 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 496 | "Retry with Recursive Func\n" 497 | ] 498 | }, 499 | { 500 | "name": "stderr", 501 | "output_type": "stream", 502 | "text": [ 503 | "Ingestion Start: 61%|██████ | 380/625 [1:09:17<12:53:12, 189.36s/it]" 504 | ] 505 | }, 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "[ERROR] - HTTP code 502 from API (\n", 511 | "502 Bad Gateway\n", 512 | "\n", 513 | "

502 Bad Gateway

\n", 514 | "
cloudflare
\n", 515 | "\n", 516 | "\n", 517 | ")\n", 518 | "Retry with Recursive Func\n" 519 | ] 520 | }, 521 | { 522 | "name": "stderr", 523 | "output_type": "stream", 524 | "text": [ 525 | "Ingestion Start: 100%|██████████| 625/625 [1:38:27<00:00, 9.45s/it] \n" 526 | ] 527 | } 528 | ], 529 | "source": [ 530 | "# Iter and push into array\n", 531 | "for comment in tqdm(data[\"text\"], desc = \"Ingestion Start\"):\n", 532 | " result, token = ingest_openai(tweet_comment = comment)\n", 533 | " final_result_extraction.append(result)\n", 534 | " final_token_usage.append(token)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 49, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "# Assign result into dataframe\n", 544 | "data['result extraction'] = final_result_extraction\n", 545 | "data['token usage'] = final_token_usage" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 50, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "# Save into dataframe\n", 555 | "data.to_csv(\"../dataset/data_twitter_pemilu_2024_enrich.csv\", index = False)" 556 | ] 557 | } 558 | ], 559 | "metadata": { 560 | "kernelspec": { 561 | "display_name": "Python 3 (ipykernel)", 562 | "language": "python", 563 | "name": "python3" 564 | }, 565 | "language_info": { 566 | "codemirror_mode": { 567 | "name": "ipython", 568 | "version": 3 569 | }, 570 | "file_extension": ".py", 571 | "mimetype": "text/x-python", 572 | "name": "python", 573 | "nbconvert_exporter": "python", 574 | "pygments_lexer": "ipython3", 575 | "version": "3.9.7" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 2 580 | } 581 | -------------------------------------------------------------------------------- /dags/crawler/final_dataset/twitter_prabowo_subianto.csv: -------------------------------------------------------------------------------- 1 | 
created_at,id_str,full_text,quote_count,reply_count,retweet_count,favorite_count,lang,user_id_str,conversation_id_str,username,tweet_url 2 | Wed Nov 29 23:55:15 +0000 2023,1730012580595528120,"yang menegaskan agar anggota KORPRI Kemhan dapat terus berkontribusi, berprestasi dan berinovasi dalam mewujudkan Indonesia yang lebih baik dan maju. #Prabowo #PrabowoSubianto #MenhanPrabowo #Herindra #WamenhanRI #Kemhan #KemhanRI #HUTKorpri2023",0,0,0,5,in,714719021816541184,1730012575465889999,Kemhan_RI,https://twitter.com/Kemhan_RI/status/1730012580595528120 3 | Wed Nov 29 23:55:14 +0000 2023,1730012575465889999,"Momen Wamenhan, M. Herindra @herindra87, memimpin Upacara HUT ke-52 KORPRI, di Lapangan Bhinneka Tunggal Ika, Kemhan Jakarta (29/11). Wamenhan M. Herindra membacakan Amanat Menteri Pertahanan Prabowo Subianto @prabowo, https://t.co/UN6DBPUbKI",0,1,5,27,in,714719021816541184,1730012575465889999,Kemhan_RI,https://twitter.com/Kemhan_RI/status/1730012575465889999 4 | Wed Nov 29 23:55:10 +0000 2023,1730012558126682481,"Akhir pekan ini capres dan cawapres nomor urut dua, Prabowo Subianto dan Gibran Rakabuming Raka dikabarkan akan mulai berkampanye di sejumlah daerah. https://t.co/VpjCI5Ntgc",0,0,0,1,in,154102750,1730012558126682481,Beritasatu,https://twitter.com/Beritasatu/status/1730012558126682481 5 | Wed Nov 29 23:50:41 +0000 2023,1730011427820454264,"Presiden Jokowi adalah seorang presiden, mentor , Guru , tauladan dan Bpk yg sangat baik rendah hati . Beliau bisa mengajarkan seorang Prabowo Subianto menjadi seorang yang humble . Jauh dari sosok Prabowo yg sebelum 2019 . Terima Kasih Pak @jokowi 🙏🏼✌🏼🫰🏼 https://t.co/d0TkIoEKqa",0,0,0,0,in,1598344201409699841,1730011427820454264,TitaMar80993092,https://twitter.com/TitaMar80993092/status/1730011427820454264 6 | Wed Nov 29 23:49:10 +0000 2023,1730011046113632550,@barubikinlol Ganjar Pranowo? Prabowo Subianto? Anis Baswedano? 
O semua ini saya rubah.,0,2,1,42,in,593490220,1729852949877137761,Goodwindology,https://twitter.com/Goodwindology/status/1730011046113632550 7 | Wed Nov 29 23:46:44 +0000 2023,1730010434306977842,Sri Mulyani mengungkap isi pertemuan antara Presiden Joko Widodo (Jokowi) dan Menteri Pertahanan Prabowo Subianto di Istana Bogor pada Selasa (28/11). https://t.co/mXhR6BJNjj,2,11,7,41,in,69183155,1730010434306977842,detikcom,https://twitter.com/detikcom/status/1730010434306977842 8 | Wed Nov 29 23:35:08 +0000 2023,1730007517206577450,"@DedynurPalakka @jokowi Cuma mengingatkan di surat suara pilpres tidak ada foto Jokowi. Yang ada Prabowo Subianto, Anis Baswedan dan Ganjar Pranowo, yang salah satunya menggantikan Jokowi. Setelah Oktober 2024 Jokowi pensiun dari presiden dan pulang ke Solo, menempati rumah hadiah negara'.",0,0,0,0,in,1589160626101764096,1729771749959426239,surosohariyamt3,https://twitter.com/surosohariyamt3/status/1730007517206577450 9 | Wed Nov 29 23:30:38 +0000 2023,1730006384488648796,Jaringan sudah terstruktur dengan baik yaa #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/T7db460FfE,0,0,0,1,in,1615964192313606144,1730006384488648796,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006384488648796 10 | Wed Nov 29 23:30:22 +0000 2023,1730006315685347793,Keterlibatan Pratikno dalam operasi pengkondisian NU dan merah #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/kPcqJBWJrJ,0,0,0,1,in,1615964192313606144,1730006315685347793,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006315685347793 11 | Wed Nov 29 23:29:12 +0000 2023,1730006025024258141,Ternyata sudah terstruktur ya Polri #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran 
https://t.co/lOEd2h27Bm,0,0,0,1,in,1615964192313606144,1730006025024258141,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006025024258141 12 | Wed Nov 29 23:29:32 +0000 2023,1730006106762789310,Polri emang beda #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/bYaW3FdryC,0,0,0,1,in,1615964192313606144,1730006106762789310,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006106762789310 13 | Wed Nov 29 23:29:58 +0000 2023,1730006216083112261,Kompaknya Kapolri dan wakapolri sokong Gibran meresahkan tubuh Kepolisian #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/cPdedKFENn,0,0,0,1,in,1615964192313606144,1730006216083112261,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006216083112261 14 | Wed Nov 29 23:27:47 +0000 2023,1730005665496887795,Udah jelas salah malah mau menyangkal #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/fSdPOKN4zF,0,0,0,1,in,1615919266729193473,1730005665496887795,infovalidd,https://twitter.com/infovalidd/status/1730005665496887795 15 | Wed Nov 29 23:26:15 +0000 2023,1730005279465693301,"Baliho Besar Gagasan Kecil, wkwkwkw #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/OxGqczxYGY",0,0,0,1,in,1615919266729193473,1730005279465693301,infovalidd,https://twitter.com/infovalidd/status/1730005279465693301 16 | Wed Nov 29 23:24:21 +0000 2023,1730004804267573740,"Rayyanza Diajak Raffi Ahmad Ketemu Prabowo Subianto, Netizen: Apakah Gibran Akan Digantikan Cipung? 
https://t.co/dHzUgzY6JH",0,0,0,0,in,1349806712,1730004804267573740,nurasmi69,https://twitter.com/nurasmi69/status/1730004804267573740 17 | Wed Nov 29 23:21:24 +0000 2023,1730004061460791336,Banyak Netizen yang menilai Prabowo - GIbran maju capres cawapres tapi minim gagasan #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/XUk1m7XuKZ,0,0,0,1,in,1615919266729193473,1730004061460791336,infovalidd,https://twitter.com/infovalidd/status/1730004061460791336 18 | Wed Nov 29 23:20:41 +0000 2023,1730003878131970490,"wkwkwk wajar sering tidak nyambung umurnya udah tua, tidak cocok dipilih #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/CmzS22Lzhj",0,0,0,2,in,1615919266729193473,1730003878131970490,infovalidd,https://twitter.com/infovalidd/status/1730003878131970490 19 | Wed Nov 29 23:20:03 +0000 2023,1730003719331397700,Rakyat Paham mana yang tulus mana rakus #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/ftlmcYBnPO,0,0,0,1,in,1615919266729193473,1730003719331397700,infovalidd,https://twitter.com/infovalidd/status/1730003719331397700 20 | Wed Nov 29 22:53:16 +0000 2023,1729996981492486216,@zy_zy_its_me @adri_7i Prabowo Subianto BUKAN AKI AKI ? SUDAH AKI AKI JADI IDOLAMU 🤮,1,4,0,1,in,1629815240790773760,1729868053201252726,1973Suharjito,https://twitter.com/1973Suharjito/status/1729996981492486216 21 | Wed Nov 29 22:11:42 +0000 2023,1729986518704590915,"Sekjen Partai Gerindra Ahmad Muzani menegaskan, proses pencapresan Prabowo Subianto tidak dilakukan secara instan. Ia mengatakan, Prabowo merupakan sosok pemimpin yang matang dan kaya pengalaman. 
< #Prabowo #Gerindra https://t.co/2kaOoujsmS",5,51,1,10,in,23343960,1729986518704590915,kompascom,https://twitter.com/kompascom/status/1729986518704590915 22 | Wed Nov 29 22:09:06 +0000 2023,1729985865697681425,"Ketua TKN Prabowo Subianto-Gibran Rakabuming Raka, Rosan Perkasa Roeslani memastikan Prabowo dan Presiden Jokowi tidak berbicara mengenai politik saat bertemu di Istana Kepresidenan Bogor, Jawa Barat, Selasa (28/11/2023) kemarin. < #Prabowo #Jokowi https://t.co/vkKjTHvBXX",0,3,0,0,in,23343960,1729985865697681425,kompascom,https://twitter.com/kompascom/status/1729985865697681425 23 | Wed Nov 29 22:07:56 +0000 2023,1729985571014213740,"Juru bicara TKN pasangan capres dan cawapres nomor urut 2, Prabowo Subianto-Gibran Rakabuming, Dedek Prayudi mengatakan, narasi politik “gemoy” dipakai untuk menggaet milenial dan gen Z. < #Prabowo #Gibran #PrabowoGibran #Pemilu2024KCM #JernihMemilih https://t.co/wAUyuhpJTt",0,5,0,0,in,23343960,1729985571014213740,kompascom,https://twitter.com/kompascom/status/1729985571014213740 24 | Wed Nov 29 22:07:09 +0000 2023,1729985372501971034,"Prabowo Subianto TELAH MELAKUKAN PELANGGARAN HAM BERAT ...dan dipecat (dipaksa pensiun dini) dari TNI (SK DKP)..!! SBY , sebagai anggota DKP ikut tanda tangan ...tapi sekarang PLIN PLAN ,ingin cari kekuasaan, ..SBY malah mendukung Prabowo... BOTOL dan KOPLAK...!! https://t.co/jtFhqNCUDY",0,0,1,0,in,1583805020910551042,1729985372501971034,AsharIbnu,https://twitter.com/AsharIbnu/status/1729985372501971034 25 | Wed Nov 29 22:05:34 +0000 2023,1729984977943818417,Prabowo Subianto Diprediksi Menang Pilpres 2024 Gara-gara Fuji: Ada Uti Makin Komplit https://t.co/UIliVnrR9S,30,277,27,182,in,41730943,1729984977943818417,VIVAcoid,https://twitter.com/VIVAcoid/status/1729984977943818417 26 | Wed Nov 29 21:42:44 +0000 2023,1729979231499608236,Pasangan Prabowo Subianto-Gibran Rakabuming Raka akan memaksimalkan kampanye di akhir pekan. 
#Polhuk #AdadiKompas https://t.co/j374nlirXp,0,0,0,0,in,771030588,1729979231499608236,KompasData,https://twitter.com/KompasData/status/1729979231499608236 27 | Wed Nov 29 21:40:04 +0000 2023,1729978560428093455,"Sukses Meremajakan C-130H, Kinerja GMF AeroAsia Diapresiasi Prabowo Subianto: Tujuh Hercules Juga Melakukan Upgrade & nbsp https://t.co/8I543qlbTY",0,0,0,0,in,1219947729155047424,1729978560428093455,zonajakarta1,https://twitter.com/zonajakarta1/status/1729978560428093455 28 | Wed Nov 29 21:38:11 +0000 2023,1729978086437818734,Pasangan Prabowo Subianto-Gibran Rakabuming Raka akan memaksimalkan kampanye di akhir pekan. #Polhuk #AdadiKompas https://t.co/eD7IqkVBud,0,0,0,2,in,255866913,1729978086437818734,hariankompas,https://twitter.com/hariankompas/status/1729978086437818734 29 | Wed Nov 29 21:35:17 +0000 2023,1729977355622584723,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/J3W7YDi2jM,0,0,0,0,in,763658966098403328,1729977355622584723,ayangmn,https://twitter.com/ayangmn/status/1729977355622584723 30 | Wed Nov 29 21:35:10 +0000 2023,1729977323552915581,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/928w3yizOD",0,0,0,0,in,763658966098403328,1729977323552915581,ayangmn,https://twitter.com/ayangmn/status/1729977323552915581 31 | Wed Nov 29 21:34:42 +0000 2023,1729977208851374383,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/L5bq40pVwi,0,0,0,0,in,763658966098403328,1729977208851374383,ayangmn,https://twitter.com/ayangmn/status/1729977208851374383 32 | Wed Nov 29 21:34:20 +0000 2023,1729977117906227415,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. 
taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/4RBX35MWwu",0,0,0,0,in,763658966098403328,1729977117906227415,ayangmn,https://twitter.com/ayangmn/status/1729977117906227415 33 | Wed Nov 29 21:34:05 +0000 2023,1729977053167157685,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/5og08arFkH,0,0,0,0,in,763658966098403328,1729977053167157685,ayangmn,https://twitter.com/ayangmn/status/1729977053167157685 34 | Wed Nov 29 21:32:57 +0000 2023,1729976767685996827,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/ZJD1p1tasx,0,0,0,0,in,763658966098403328,1729976767685996827,ayangmn,https://twitter.com/ayangmn/status/1729976767685996827 35 | Wed Nov 29 21:32:47 +0000 2023,1729976724996464694,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/QbDp7wutfx,0,0,0,0,in,763658966098403328,1729976724996464694,ayangmn,https://twitter.com/ayangmn/status/1729976724996464694 36 | Wed Nov 29 21:32:44 +0000 2023,1729976715173388738,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/dbLCKBYkUb",0,0,0,0,in,763658966098403328,1729976715173388738,ayangmn,https://twitter.com/ayangmn/status/1729976715173388738 37 | Wed Nov 29 21:29:36 +0000 2023,1729975924882567660,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. 
pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/whn7yl7HeD,0,0,0,0,in,2877262597,1729975924882567660,raninuran_,https://twitter.com/raninuran_/status/1729975924882567660 38 | Wed Nov 29 21:29:30 +0000 2023,1729975901331640521,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/DlS7pBmure",0,0,0,0,in,2877262597,1729975901331640521,raninuran_,https://twitter.com/raninuran_/status/1729975901331640521 39 | Wed Nov 29 21:29:13 +0000 2023,1729975829101465689,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/hWnJuf1xBU,0,0,0,0,in,2877262597,1729975829101465689,raninuran_,https://twitter.com/raninuran_/status/1729975829101465689 40 | Wed Nov 29 21:29:04 +0000 2023,1729975791218475425,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/x69jnJ2GQq",0,0,0,0,in,2877262597,1729975791218475425,raninuran_,https://twitter.com/raninuran_/status/1729975791218475425 41 | Wed Nov 29 21:28:47 +0000 2023,1729975718418026634,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/a3T4lDEARE,0,0,0,0,in,2877262597,1729975718418026634,raninuran_,https://twitter.com/raninuran_/status/1729975718418026634 42 | Wed Nov 29 21:27:48 +0000 2023,1729975471394480622,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/MSrJt19igo,0,0,0,0,in,2877262597,1729975471394480622,raninuran_,https://twitter.com/raninuran_/status/1729975471394480622 43 | Wed Nov 29 21:27:46 +0000 2023,1729975463463014868,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. 
Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/Ju5Ka0X6gq,0,0,0,0,in,2877262597,1729975463463014868,raninuran_,https://twitter.com/raninuran_/status/1729975463463014868 44 | Wed Nov 29 21:27:44 +0000 2023,1729975453476323519,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/OmPIHbfoBW",0,0,0,0,in,2877262597,1729975453476323519,raninuran_,https://twitter.com/raninuran_/status/1729975453476323519 45 | Wed Nov 29 21:21:10 +0000 2023,1729973802518606068,"Tegas, Susi Pudjiastuti Tolak Bergabung dengan Partai Gerindra Besutan Prabowo Subianto - Wartakota https://t.co/Z2hfsCFnSV #Prabowo #BangkitBersama",0,0,0,0,in,465423257,1729973802518606068,haelamarie,https://twitter.com/haelamarie/status/1729973802518606068 46 | Wed Nov 29 21:15:52 +0000 2023,1729972468264427713,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/em1QAuxlst,0,0,0,0,in,326270868,1729972468264427713,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972468264427713 47 | Wed Nov 29 21:15:45 +0000 2023,1729972437155193078,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/JhrpYp0pGM",0,0,0,0,in,326270868,1729972437155193078,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972437155193078 48 | Wed Nov 29 21:15:21 +0000 2023,1729972340547805288,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. 
im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/Z04fZUkMRy,0,0,0,0,in,326270868,1729972340547805288,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972340547805288 49 | Wed Nov 29 21:15:13 +0000 2023,1729972305118507294,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/rEnxXouR8v",0,0,0,0,in,326270868,1729972305118507294,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972305118507294 50 | Wed Nov 29 21:14:52 +0000 2023,1729972217587638641,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/qSqbCyhW9Q,0,0,0,0,in,326270868,1729972217587638641,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972217587638641 51 | Wed Nov 29 21:13:57 +0000 2023,1729971984715710964,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/4KLg9qWX46,0,0,0,0,in,326270868,1729971984715710964,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971984715710964 52 | Wed Nov 29 21:13:53 +0000 2023,1729971970941518260,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/3JoFiStBbe,0,0,0,0,in,326270868,1729971970941518260,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971970941518260 53 | Wed Nov 29 21:13:50 +0000 2023,1729971957632995579,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. 
z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/WwZNJ5gsnv",0,0,0,0,in,326270868,1729971957632995579,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971957632995579 54 | Wed Nov 29 21:02:25 +0000 2023,1729969082626380119,@hasyimmah Dron sodron si GEMOY (GEMBROT & LETOY) alias PRABOWO SUBIANTO tdk akn Menang slm msh ada org Jawa asli yg nyalon Presiden . Kecuali dlm keadaan darurat.,0,1,0,0,in,1713944739043811328,1729742804077944956,SuciptoHad53960,https://twitter.com/SuciptoHad53960/status/1729969082626380119 55 | Wed Nov 29 20:18:12 +0000 2023,1729957955208921492,Wahlkampf in Indonesien gestartet - Wer folgt Präsident Jokowi? 3 Kandidaten: Umfrage: ehem. General u Verteidigungsminister Prabowo Subianto (72): 40% Ganjar Pranowo (Gouverneur Zentraljava): 28% Anies Baswedan: 24% https://t.co/O6fbEAnWvE,0,0,0,1,de,1238447220,1729957955208921492,Milatrud11,https://twitter.com/Milatrud11/status/1729957955208921492 56 | Wed Nov 29 19:21:19 +0000 2023,1729943639810568341,Pak Prabowo Subianto Rangkul SDM Putra Bangsa yang berprestasi Kelas Dunia .. Salam Prabowo Subianto Presiden RI 2024 .. #PrabowoGibranJuara2024 #rk08 #rkrijuara #JabarRumahPrabowo #JabarTetapPrabowo #JabarSolidPrabowo Sembilan Partai All-in @prabowo @gibran_tweet https://t.co/0tCxs6Opyc,1,0,3,1,in,1531971112929615872,1729943639810568341,rkrijuara,https://twitter.com/rkrijuara/status/1729943639810568341 57 | Wed Nov 29 19:05:40 +0000 2023,1729939700620915150,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/6CHcTQxIgk",0,0,0,0,in,1493150581602467840,1729939700620915150,kariimm_4,https://twitter.com/kariimm_4/status/1729939700620915150 58 | Wed Nov 29 18:55:03 +0000 2023,1729937028811518181,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. 
taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/b12hOnKPDt",0,0,0,0,in,69874051,1729937028811518181,setiawann_24,https://twitter.com/setiawann_24/status/1729937028811518181 59 | Wed Nov 29 18:51:09 +0000 2023,1729936049642836017,Napro 08 siap bergerak menangkan Prabowo Subianto - ANTARA https://t.co/xO33Gojl73 #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,4579831158,1729936049642836017,ChairudinN6548,https://twitter.com/ChairudinN6548/status/1729936049642836017 60 | Wed Nov 29 18:48:10 +0000 2023,1729935297381855658,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/hCaldL1uM4",0,0,0,0,in,69891770,1729935297381855658,nadianurulazmii,https://twitter.com/nadianurulazmii/status/1729935297381855658 61 | Wed Nov 29 18:43:44 +0000 2023,1729934182078673278,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/MYlUbIRSkk,0,0,0,0,in,1378473318,1729934182078673278,samueleto78,https://twitter.com/samueleto78/status/1729934182078673278 62 | Wed Nov 29 18:43:41 +0000 2023,1729934171240591417,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/ewT4rQmGSd",0,0,0,0,in,1378473318,1729934171240591417,samueleto78,https://twitter.com/samueleto78/status/1729934171240591417 63 | Wed Nov 29 18:43:18 +0000 2023,1729934074528329947,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/n5MneCU1qv,0,0,0,0,in,1378473318,1729934074528329947,samueleto78,https://twitter.com/samueleto78/status/1729934074528329947 64 | Wed Nov 29 18:43:12 +0000 2023,1729934047214997582,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. 
taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/U3kUFLr2f0",0,0,0,0,in,1378473318,1729934047214997582,samueleto78,https://twitter.com/samueleto78/status/1729934047214997582 65 | Wed Nov 29 18:42:53 +0000 2023,1729933968617959710,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/J28OunKEBS,0,0,0,0,in,1378473318,1729933968617959710,samueleto78,https://twitter.com/samueleto78/status/1729933968617959710 66 | Wed Nov 29 18:14:56 +0000 2023,1729926933146956016,@theo12_ini PRABOWO SUBIANTO,0,1,0,1,pl,1702935093701201920,1729925957979664726,BeyondTheJoker,https://twitter.com/BeyondTheJoker/status/1729926933146956016 67 | Wed Nov 29 18:41:56 +0000 2023,1729933727973961946,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/SyZdBJPhv4,0,0,0,0,in,1378473318,1729933727973961946,samueleto78,https://twitter.com/samueleto78/status/1729933727973961946 68 | Wed Nov 29 18:41:51 +0000 2023,1729933710869590122,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/NXlBC0AbSB",0,0,0,0,in,1378473318,1729933710869590122,samueleto78,https://twitter.com/samueleto78/status/1729933710869590122 69 | Wed Nov 29 18:28:44 +0000 2023,1729930406072611170,"Capres no.urut 2 Prabowo Subianto dicecar habis²an oleh National Corruption Watch"" 😢😢 Apa gak bahaya tah!! 
https://t.co/P2B0q3dczd""",9,14,164,403,in,1186243531037270017,1729930406072611170,msobri99,https://twitter.com/msobri99/status/1729930406072611170 70 | Wed Nov 29 18:24:07 +0000 2023,1729929245953290351,"Serikat Pekerja Nusantara Dukung Prabowo Subianto di Pilpres 2024, Siap Ikut Memenangkan https://t.co/NxMoX2ziNj",0,4,1,2,in,41730943,1729929245953290351,VIVAcoid,https://twitter.com/VIVAcoid/status/1729929245953290351 71 | Wed Nov 29 18:41:58 +0000 2023,1729933738048655587,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/dq22FLYUtI,0,0,0,0,in,1378473318,1729933738048655587,samueleto78,https://twitter.com/samueleto78/status/1729933738048655587 72 | Wed Nov 29 18:14:08 +0000 2023,1729926735792345406,@siregar_elang Apa yang diharapkan dari Prabowo Subianto... TNI sudah PECAT dia artinya sudah tak dibutuhkan... lantas ada yang ingin dia jadi panglima tertinggi di Indonesia...olala... sudah tidak ada lagi kah rakyat di negeri sebesar ini...????,0,0,0,0,in,1656204151590883328,1729310357582479617,CMediio63787,https://twitter.com/CMediio63787/status/1729926735792345406 73 | Wed Nov 29 17:36:58 +0000 2023,1729917379025760741,@dsvwaikdjns @tvOneNews Kejujuran Pak Prabowo Subianto adalah landasan kuat yang membedakannya sebagai pemimpin yang dapat diandalkan dan dapat dipercaya.,0,0,0,0,in,2388963246,1729652211435860474,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729917379025760741 74 | Wed Nov 29 17:33:33 +0000 2023,1729916521714192423,Sri Mulyani: Pinjaman Luar Negeri (UTANG) Kementerian Prabowo Tembus Rp385 Triliun. https://t.co/wBuIumeO4F Menteri Keuangan Sri Mulyani mengungkapkan HASIL RAPAT bersama Menhan Prabowo Subianto terkait BELANJA Alat Utama Sistem Pertahanan (ALUTSISTA) dari PINJAMAN LUAR NEGERI. 
https://t.co/8rEVFD8qhM,0,1,0,0,in,4517167514,1729916521714192423,sirajapadoha,https://twitter.com/sirajapadoha/status/1729916521714192423 75 | Wed Nov 29 17:33:21 +0000 2023,1729916468585005182,"@hddfsysgdcoghu @habiburokhman @prabowo @gibran_tweet Dalam memilih pemimpin, kejujuran adalah kualitas yang tak ternilai, dan saya yakin bahwa Pak Prabowo Subianto adalah sosok yang memenuhi kriteria tersebut.",0,0,0,0,in,707639560843468800,1729705301106913318,dsvwaikdjns,https://twitter.com/dsvwaikdjns/status/1729916468585005182 76 | Wed Nov 29 17:31:08 +0000 2023,1729915914420941148,"@hddfsysgdcoghu @ajengcute16__ Pilihannya untuk Pak Prabowo Subianto adalah pilihan untuk memiliki pemimpin yang jujur dan tulus, yang diharapkan membawa integritas dan kejujuran dalam kepemimpinan.",0,1,0,0,in,707639560843468800,1729821879903375505,dsvwaikdjns,https://twitter.com/dsvwaikdjns/status/1729915914420941148 77 | Wed Nov 29 17:29:49 +0000 2023,1729915580780835287,Mana nih yang belum gabung ke barisan Pak @prabowo ? 
#prabowosubianto #GibranRakabuming #prabowopresiden #PrabowoGibran2024 #indonesia #indonesian #viral #fyp #politik #ganjarpranowo #aniesbaswedan #pinterpolitik https://t.co/QTuDH4qkv1,3,21,19,47,in,1579126996537290752,1729915580780835287,mypresidentid,https://twitter.com/mypresidentid/status/1729915580780835287 78 | Wed Nov 29 17:29:10 +0000 2023,1729915416355705123,@ruyasagit900 @RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya semakin yakin bahwa memilih Pak Prabowo Subianto adalah langkah cerdas untuk membawa Indonesia ke tingkat kejayaan yang baru dan lebih baik.,0,0,0,0,in,2170027031,1729652171434541424,kingslandsjay,https://twitter.com/kingslandsjay/status/1729915416355705123 79 | Wed Nov 29 17:28:11 +0000 2023,1729915171257380927,"Prabowo Cinta Rakyat Prabowo Subianto Sumringah Dapat Dukungang Langsung oleh Dua Presiden, Termotivasi Menang di Pilpres - https://t.co/2aLVkgOAqp https://t.co/iVMbNlgMe8 #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy",0,0,8,0,in,814012104768307201,1729915171257380927,RondangKomariah,https://twitter.com/RondangKomariah/status/1729915171257380927 80 | Wed Nov 29 17:23:44 +0000 2023,1729914052191907910,@Gojokaisen86 @tvOneNews Saya semakin yakin bahwa Pak Prabowo Subianto adalah sosok yang memiliki kepekaan terhadap berbagai isu sosial dan ekonomi yang dihadapi oleh masyarakat Indonesia.,0,0,0,0,in,4153801512,1729652211435860474,mirashahabudin,https://twitter.com/mirashahabudin/status/1729914052191907910 81 | Wed Nov 29 17:22:18 +0000 2023,1729913691678871690,"@Gojokaisen86 @Chaves1305 Setiap kali saya mendalami visi dan misi Pak Prabowo Subianto, semakin kuat keyakinan saya bahwa beliau adalah pemimpin yang memiliki pandangan jauh ke depan.",1,0,0,0,in,4153801512,1729796437863518249,mirashahabudin,https://twitter.com/mirashahabudin/status/1729913691678871690 82 | Wed Nov 29 17:21:16 +0000 2023,1729913427743928498,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... 
- ANTARA https://t.co/5eUxBb88yl #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,797615281455525888,1729913427743928498,AzzrielAzaryahu,https://twitter.com/AzzrielAzaryahu/status/1729913427743928498 83 | Wed Nov 29 17:21:15 +0000 2023,1729913427127366076,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... - ANTARA https://t.co/WkscP4NgZU #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,4579831158,1729913427127366076,ChairudinN6548,https://twitter.com/ChairudinN6548/status/1729913427127366076 84 | Wed Nov 29 17:21:15 +0000 2023,1729913425281871905,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... - ANTARA https://t.co/XhI7hOxE6U #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,795445014864203777,1729913425281871905,AzhareAkiba,https://twitter.com/AzhareAkiba/status/1729913425281871905 85 | Wed Nov 29 17:20:54 +0000 2023,1729913338363310372,"@Gojokaisen86 @ajengcute16__ Semakin saya mendalami visi dan misi Pak Prabowo Subianto, semakin yakin bahwa beliau memiliki komitmen yang kuat untuk membawa Indonesia ke arah yang lebih baik.",0,0,0,0,in,4153801512,1729821879903375505,mirashahabudin,https://twitter.com/mirashahabudin/status/1729913338363310372 86 | Wed Nov 29 17:17:37 +0000 2023,1729912510743245236,@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Semoga dukungan kita terhadap Pak Prabowo Subianto dapat memberikan inspirasi bagi masyarakat untuk tetap bersatu dan menjaga persatuan dalam perbedaan.,0,0,0,0,in,4153801512,1729652171434541424,mirashahabudin,https://twitter.com/mirashahabudin/status/1729912510743245236 87 | Wed Nov 29 17:10:05 +0000 2023,1729910615807005061,@DaengWahidin2 Biarkan rakyat memilih pilihannya yang jelas aku pilih Prabowo Subianto aja oke,0,0,0,0,in,1675018752772149249,1729364216082231802,KangAnom377663,https://twitter.com/KangAnom377663/status/1729910615807005061 88 | Wed Nov 29 17:08:35 +0000 2023,1729910239070449982,"@tvOneNews Dalam mendukung Pak Prabowo Subianto, 
saya mengajak semua pihak untuk menghormati perbedaan pendapat dan membangun dialog yang konstruktif.",0,0,0,0,in,2170027031,1729652211435860474,kingslandsjay,https://twitter.com/kingslandsjay/status/1729910239070449982 89 | Wed Nov 29 17:08:06 +0000 2023,1729910116856820046,"Tiba-tiba Hentikan Langkah Prabowo Subianto, Aksi Ajudan Ganteng Ini Bikin Heboh - https://t.co/BwJCocZuOd https://t.co/UETxOatZAb #Prabowo #PrabowoUnggul",0,0,0,0,in,1110932693426954240,1729910116856820046,fathw25,https://twitter.com/fathw25/status/1729910116856820046 90 | Wed Nov 29 17:06:10 +0000 2023,1729909631127056760,"Menhan Prabowo Subianto Hadiri Pelantikan KSAD Maruli Simanjuntak: Jakarta – Menteri Pertahanan Prabowo Subianto menghadiri pelantikan Kepala Staf Angkatan Darat (KASAD) di Istana Negara, Jakarta, Rabu (29/11). Presiden Joko Widodo… https://t.co/mHUItX9YYF via @kabar_tangsel",0,0,0,0,in,1115363858,1729909631127056760,tangselupdates,https://twitter.com/tangselupdates/status/1729909631127056760 91 | Wed Nov 29 17:06:07 +0000 2023,1729909617612984495,@ajengcute16__ Dukungan saya untuk Pak Prabowo Subianto adalah hasil dari keyakinan saya akan visi dan komitmen beliau untuk membawa perubahan positif di Indonesia.,0,7,0,0,in,2170027031,1729821879903375505,kingslandsjay,https://twitter.com/kingslandsjay/status/1729909617612984495 92 | Wed Nov 29 16:56:57 +0000 2023,1729907310036972000,Semua akan all in pak Prabowo Subianto pada waktunya ☺ #Allinprabowo #PrabowoGibranIstimewa,0,0,0,0,in,780718575271055360,1729907310036972000,hddfsysgdcoghu,https://twitter.com/hddfsysgdcoghu/status/1729907310036972000 93 | Wed Nov 29 16:53:00 +0000 2023,1729906315638796643,"@ajengcute16__ Semakin banyak anak muda yang memilih Pak Prabowo Subianto sebagai calon presidennya, semakin terasa semangat perubahan yang diinginkan generasi muda untuk masa depan Indonesia.",0,5,0,2,in,780718575271055360,1729821879903375505,hddfsysgdcoghu,https://twitter.com/hddfsysgdcoghu/status/1729906315638796643 94 | Wed 
Nov 29 16:49:30 +0000 2023,1729905433685692855,"Dari APBN Gak Cukup, Menhan Prabowo Utang ke Luar Negri Rp. 385 Triliun //.. Sri Mulyani: Pinjaman Luar Negeri Kementerian Prabowo Tembus Rp385 Triliun Menkeu Sri Mulyani buka suara soal pinjaman luar negeri Kementerian Pertahanan yang dipimpin Prabowo Subianto. https://t.co/mvbn0p9QpA",0,2,2,1,in,1319684366591639552,1729905433685692855,DahonoB,https://twitter.com/DahonoB/status/1729905433685692855 95 | Wed Nov 29 16:49:25 +0000 2023,1729905414412890420,"Bersama Indonesia Maju, @prabowo - @gibran_tweet satu putaran 💛 #airlanggahartarto #prabowosubianto #gibranrakabumingraka #golkarprabowo #gerindra #DUAsejolipaketkomplet https://t.co/yoo20na3Vm",0,0,0,0,in,1412673973129646084,1729905414412890420,g_politik2024,https://twitter.com/g_politik2024/status/1729905414412890420 96 | Wed Nov 29 16:46:09 +0000 2023,1729904592899707166,@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya yakin bahwa mendukung Pak Prabowo Subianto adalah langkah yang tepat untuk menciptakan perubahan positif yang kita inginkan dalam perjalanan menuju masa depan Indonesia yang lebih baik.,0,1,0,1,in,983772276222029825,1729652171434541424,ruyasagit900,https://twitter.com/ruyasagit900/status/1729904592899707166 97 | Wed Nov 29 16:44:35 +0000 2023,1729904198043758664,@tvOneNews Saya yakin bahwa Pak Prabowo Subianto adalah pemimpin yang memiliki pandangan jauh ke depan dan mampu menghadirkan solusi inovatif untuk masalah-masalah yang kompleks,0,2,0,1,in,983772276222029825,1729652211435860474,ruyasagit900,https://twitter.com/ruyasagit900/status/1729904198043758664 98 | Wed Nov 29 16:39:19 +0000 2023,1729902870659764680,"@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya semakin percaya bahwa Pak Prabowo Subianto adalah pemimpin yang memiliki integritas tinggi, dan kesetiaan beliau terhadap nilai-nilai moral adalah nilai tambah yang luar 
biasa.",0,3,0,1,in,2388963246,1729652171434541424,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729902870659764680 99 | Wed Nov 29 16:38:35 +0000 2023,1729902689310679408,Semangat capres 2024 KIM Prabowo Subianto tidak pernah padam untuk Indonesia 🔥 @prabowo @Gibran_rakabuming @Gerindra Gerindrajateng #gerindrajateng #kim #gerindrajawatengah #gerindrakotasemarang https://t.co/b5ZkYvxWJn,0,0,0,0,in,1610207637513318404,1729902689310679408,gerindrakotasmg,https://twitter.com/gerindrakotasmg/status/1729902689310679408 100 | Wed Nov 29 16:36:39 +0000 2023,1729902201982849523,"@Leonita_Lestari PDI-P no. 3 , Ganjar Mahfud no. 3, PS kalah 3 x. Tanda tanda alam semesta mendukung Prabowo Subianto melakukan hattrick kekalahan",0,0,0,0,in,1491438677674717186,1729745613947060471,Krisna0902022,https://twitter.com/Krisna0902022/status/1729902201982849523 101 | Wed Nov 29 16:34:14 +0000 2023,1729901591921332734,"@ajengcute16__ Semakin saya memahami visi dan misi Pak Prabowo Subianto, semakin yakin bahwa beliau memiliki rencana konkret untuk memajukan Indonesia ke arah yang lebih baik.",0,8,0,2,in,2388963246,1729821879903375505,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729901591921332734 102 | Wed Nov 29 21:21:27 +0000 2023,1729973875046482407,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. 
taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/ykGOW4vGsS",0,0,0,0,in,2757822406,1729973875046482407,deriskyafriza,https://twitter.com/deriskyafriza/status/1729973875046482407 103 | Wed Nov 29 20:31:37 +0000 2023,1729961333096108096,"@ChrisJ_2211 @Gus_Raharjo @PDI_Perjuangan @adearmando61 2019 Prabowo Subianto dapat award kebohongan terlebay dari PSI, 2024 PSI akan dapat penghargaan dari Rakyat Partai paling Menggibrani....!!!",0,1,0,1,in,1717967580336721920,1729454331379040575,Melly4505483263,https://twitter.com/Melly4505483263/status/1729961333096108096 104 | Wed Nov 29 19:29:55 +0000 2023,1729945805107425495,Pemimpin zalim sudah merusak demokrasi yang masih seumur jagung #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka https://t.co/wtjPo0mYQx,0,0,0,0,in,1710266831326621697,1729945805107425495,inumalica,https://twitter.com/inumalica/status/1729945805107425495 105 | Wed Nov 29 18:08:21 +0000 2023,1729925278133350703,Kekecewaan mahasiswa atas ketidakpahaman Jokowi terhadap demokrasi dan konstitusi #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/4MI3MSN70a,0,0,0,229,in,1710266831326621697,1729925278133350703,inumalica,https://twitter.com/inumalica/status/1729925278133350703 106 | Wed Nov 29 18:04:59 +0000 2023,1729924432813289928,Jokowi dan kelompok yang telah merusak demokrasi harus taat konstitusi #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/L25Zlx2PCj,0,0,0,0,in,1710266831326621697,1729924432813289928,inumalica,https://twitter.com/inumalica/status/1729924432813289928 107 | Wed Nov 29 17:59:42 +0000 2023,1729923100798808155,Aksi mahasiswa yang peduli terhadap demokrasi dan benci terhadap nepotisme #gibran 
#prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/MzIDdKKptU,0,0,0,0,in,1710266831326621697,1729923100798808155,inumalica,https://twitter.com/inumalica/status/1729923100798808155 108 | Wed Nov 29 17:50:01 +0000 2023,1729920665439105138,Gerakan mahasiswa untuk mencegah demokrasi semakin hancur #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/F8hzMXQT8U,0,0,0,0,in,1710258220605022208,1729920665439105138,f4jar_mjaya,https://twitter.com/f4jar_mjaya/status/1729920665439105138 109 | -------------------------------------------------------------------------------- /notebook/enrichment - 2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "a6b8a7b4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Load libraries \n", 11 | "import os\n", 12 | "import re\n", 13 | "import time\n", 14 | "import openai \n", 15 | "import pandas as pd \n", 16 | "from tqdm import tqdm\n", 17 | "from typing import Tuple\n", 18 | "from dotenv import load_dotenv\n", 19 | "\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "pd.set_option(\"display.max_columns\", None)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "7f7ccfa1", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Setting credentials\n", 33 | "OPENAI_KEY = os.getenv(\"OPENAI_API_KEY\", default = None) \n", 34 | "openai.api_key = OPENAI_KEY" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "id": "e205895d", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": [ 46 | "
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
created_atid_strfull_textquote_countreply_countretweet_countfavorite_countlanguser_id_strconversation_id_strusernametweet_url
0Fri Nov 17 23:58:21 +0000 20231.725665e+18@gogo74070675957 @iina_surbakti @gibran_tweet ...0.00.00.01.0in1.213867e+181.725499e+18ArjunaOntheskyhttps://twitter.com/ArjunaOnthesky/status/1725...
1Fri Nov 17 23:58:19 +0000 20231.725665e+18@vendie7 Btw sy msh ingat omelanmu ttg prof MD...NaNNaNNaNNaNNaNNaNNaNNaNNaN
2Fri Nov 17 23:57:49 +0000 20231.725665e+18🔴⚪️ PEMILU terutama PILPRES adalah SATU...1.07.012.029.0in1.378303e+181.725665e+18_BungHerwinhttps://twitter.com/_BungHerwin/status/1725664...
3Fri Nov 17 23:57:34 +0000 20231.725665e+18@ekagumilars Indonesia aman &amp damai tanpa ...0.00.00.00.0in2.537213e+091.725384e+18irfandjayhttps://twitter.com/irfandjay/status/172566450...
4Fri Nov 17 23:57:31 +0000 20231.725664e+18Pilpres kali iniNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " created_at id_str \\\n", 160 | "0 Fri Nov 17 23:58:21 +0000 2023 1.725665e+18 \n", 161 | "1 Fri Nov 17 23:58:19 +0000 2023 1.725665e+18 \n", 162 | "2 Fri Nov 17 23:57:49 +0000 2023 1.725665e+18 \n", 163 | "3 Fri Nov 17 23:57:34 +0000 2023 1.725665e+18 \n", 164 | "4 Fri Nov 17 23:57:31 +0000 2023 1.725664e+18 \n", 165 | "\n", 166 | " full_text quote_count \\\n", 167 | "0 @gogo74070675957 @iina_surbakti @gibran_tweet ... 0.0 \n", 168 | "1 @vendie7 Btw sy msh ingat omelanmu ttg prof MD... NaN \n", 169 | "2 🔴⚪️ PEMILU terutama PILPRES adalah SATU... 1.0 \n", 170 | "3 @ekagumilars Indonesia aman & damai tanpa ... 0.0 \n", 171 | "4 Pilpres kali ini NaN \n", 172 | "\n", 173 | " reply_count retweet_count favorite_count lang user_id_str \\\n", 174 | "0 0.0 0.0 1.0 in 1.213867e+18 \n", 175 | "1 NaN NaN NaN NaN NaN \n", 176 | "2 7.0 12.0 29.0 in 1.378303e+18 \n", 177 | "3 0.0 0.0 0.0 in 2.537213e+09 \n", 178 | "4 NaN NaN NaN NaN NaN \n", 179 | "\n", 180 | " conversation_id_str username \\\n", 181 | "0 1.725499e+18 ArjunaOnthesky \n", 182 | "1 NaN NaN \n", 183 | "2 1.725665e+18 _BungHerwin \n", 184 | "3 1.725384e+18 irfandjay \n", 185 | "4 NaN NaN \n", 186 | "\n", 187 | " tweet_url \n", 188 | "0 https://twitter.com/ArjunaOnthesky/status/1725... \n", 189 | "1 NaN \n", 190 | "2 https://twitter.com/_BungHerwin/status/1725664... \n", 191 | "3 https://twitter.com/irfandjay/status/172566450... 
\n", 192 | "4 NaN " 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# Load dataset\n", 202 | "data = pd.read_excel(\"../dataset/Full Dataset Kotor updated 2.0.xlsx\")\n", 203 | "data.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 6, 209 | "id": "4065ce80", 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "False 1564\n", 216 | "True 172\n", 217 | "dtype: int64" 218 | ] 219 | }, 220 | "execution_count": 6, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "# Data Duplicate checking\n", 227 | "data.duplicated(subset = ['full_text', 'id_str', 'retweet_count']).value_counts()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "id": "bc07851b", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/html": [ 239 | "
\n", 240 | "\n", 253 | "\n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " 
\n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
created_atid_strfull_textquote_countreply_countretweet_countfavorite_countlanguser_id_strconversation_id_strusernametweet_url
151Fri Nov 17 21:00:46 +0000 20231.725620e+18@NenkMonica @gibran_tweet Salahnya dmn dateng ...NaNNaNNaNNaNNaNNaNNaNNaNNaN
152Fri Nov 17 20:49:46 +0000 20231.725617e+18Kreatif balon #GanjarMahfud2024 anti di Bongka...0.03.07.024.0in7.945097e+171.725617e+18Jayabay19479190https://twitter.com/Jayabay19479190/status/172...
352Fri Nov 17 16:24:41 +0000 20231.725551e+18@DPP_PKB @ninikwafiroh Maju bersama AMIN memen...0.00.00.00.0in1.618235e+181.725482e+18IndahRahalebhttps://twitter.com/IndahRahaleb/status/172555...
353Fri Nov 17 16:23:57 +0000 20231.725550e+18@triwul82 Mahfud MD mengingatkan bahwa tanggun...NaNNaNNaNNaNNaNNaNNaNNaNNaN
529Fri Nov 17 23:56:55 +0000 20231.725664e+18@Mdy_Asmara1701 Para Kabinda di banyak daerah ...NaNNaNNaNNaNNaNNaNNaNNaNNaN
586Fri Nov 17 23:58:21 +0000 20231.725665e+18@gogo74070675957 @iina_surbakti @gibran_tweet ...0.00.00.01.0in1.213867e+181.725499e+18ArjunaOntheskyhttps://twitter.com/ArjunaOnthesky/status/1725...
587Fri Nov 17 23:56:33 +0000 20231.725664e+18Ini Daftar Nama dan Struktur Lengkap TKN Prabo...0.00.00.00.0in1.110933e+181.725664e+18fathw25https://twitter.com/fathw25/status/17256642523...
588Fri Nov 17 23:53:50 +0000 20231.725664e+18Kecuali Jokowi jadi ketum Golkar dan Gibran ja...0.00.00.00.0in8.486730e+071.725664e+18tualanghttps://twitter.com/tualang/status/17256635676...
589Fri Nov 17 23:35:18 +0000 20231.725659e+18Temukan dan dapatkan Kaos Baju prabowo gemoy -...0.00.00.00.0in2.993486e+091.725659e+18aris_jenanghttps://twitter.com/aris_jenang/status/1725658...
590Fri Nov 17 23:35:18 +0000 20231.725659e+18Masyarakat Menilai Gibran Tidak Mempunyai Kapa...0.00.00.01.0in1.618289e+181.725659e+18Liza16144812https://twitter.com/Liza16144812/status/172565...
\n", 424 | "
" 425 | ], 426 | "text/plain": [ 427 | " created_at id_str \\\n", 428 | "151 Fri Nov 17 21:00:46 +0000 2023 1.725620e+18 \n", 429 | "152 Fri Nov 17 20:49:46 +0000 2023 1.725617e+18 \n", 430 | "352 Fri Nov 17 16:24:41 +0000 2023 1.725551e+18 \n", 431 | "353 Fri Nov 17 16:23:57 +0000 2023 1.725550e+18 \n", 432 | "529 Fri Nov 17 23:56:55 +0000 2023 1.725664e+18 \n", 433 | "586 Fri Nov 17 23:58:21 +0000 2023 1.725665e+18 \n", 434 | "587 Fri Nov 17 23:56:33 +0000 2023 1.725664e+18 \n", 435 | "588 Fri Nov 17 23:53:50 +0000 2023 1.725664e+18 \n", 436 | "589 Fri Nov 17 23:35:18 +0000 2023 1.725659e+18 \n", 437 | "590 Fri Nov 17 23:35:18 +0000 2023 1.725659e+18 \n", 438 | "\n", 439 | " full_text quote_count \\\n", 440 | "151 @NenkMonica @gibran_tweet Salahnya dmn dateng ... NaN \n", 441 | "152 Kreatif balon #GanjarMahfud2024 anti di Bongka... 0.0 \n", 442 | "352 @DPP_PKB @ninikwafiroh Maju bersama AMIN memen... 0.0 \n", 443 | "353 @triwul82 Mahfud MD mengingatkan bahwa tanggun... NaN \n", 444 | "529 @Mdy_Asmara1701 Para Kabinda di banyak daerah ... NaN \n", 445 | "586 @gogo74070675957 @iina_surbakti @gibran_tweet ... 0.0 \n", 446 | "587 Ini Daftar Nama dan Struktur Lengkap TKN Prabo... 0.0 \n", 447 | "588 Kecuali Jokowi jadi ketum Golkar dan Gibran ja... 0.0 \n", 448 | "589 Temukan dan dapatkan Kaos Baju prabowo gemoy -... 0.0 \n", 449 | "590 Masyarakat Menilai Gibran Tidak Mempunyai Kapa... 
0.0 \n", 450 | "\n", 451 | " reply_count retweet_count favorite_count lang user_id_str \\\n", 452 | "151 NaN NaN NaN NaN NaN \n", 453 | "152 3.0 7.0 24.0 in 7.945097e+17 \n", 454 | "352 0.0 0.0 0.0 in 1.618235e+18 \n", 455 | "353 NaN NaN NaN NaN NaN \n", 456 | "529 NaN NaN NaN NaN NaN \n", 457 | "586 0.0 0.0 1.0 in 1.213867e+18 \n", 458 | "587 0.0 0.0 0.0 in 1.110933e+18 \n", 459 | "588 0.0 0.0 0.0 in 8.486730e+07 \n", 460 | "589 0.0 0.0 0.0 in 2.993486e+09 \n", 461 | "590 0.0 0.0 1.0 in 1.618289e+18 \n", 462 | "\n", 463 | " conversation_id_str username \\\n", 464 | "151 NaN NaN \n", 465 | "152 1.725617e+18 Jayabay19479190 \n", 466 | "352 1.725482e+18 IndahRahaleb \n", 467 | "353 NaN NaN \n", 468 | "529 NaN NaN \n", 469 | "586 1.725499e+18 ArjunaOnthesky \n", 470 | "587 1.725664e+18 fathw25 \n", 471 | "588 1.725664e+18 tualang \n", 472 | "589 1.725659e+18 aris_jenang \n", 473 | "590 1.725659e+18 Liza16144812 \n", 474 | "\n", 475 | " tweet_url \n", 476 | "151 NaN \n", 477 | "152 https://twitter.com/Jayabay19479190/status/172... \n", 478 | "352 https://twitter.com/IndahRahaleb/status/172555... \n", 479 | "353 NaN \n", 480 | "529 NaN \n", 481 | "586 https://twitter.com/ArjunaOnthesky/status/1725... \n", 482 | "587 https://twitter.com/fathw25/status/17256642523... \n", 483 | "588 https://twitter.com/tualang/status/17256635676... \n", 484 | "589 https://twitter.com/aris_jenang/status/1725658... \n", 485 | "590 https://twitter.com/Liza16144812/status/172565... 
" 486 | ] 487 | }, 488 | "execution_count": 7, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "# Overview duplicated data\n", 495 | "data[data.duplicated(subset = ['full_text', 'id_str', 'retweet_count'])].head(10)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 8, 501 | "id": "f1d94386", 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# Duplicate data filtering\n", 506 | "data = data.drop_duplicates(subset = ['full_text', 'id_str', 'retweet_count'])" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 9, 512 | "id": "f5e82215", 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "False 1564\n", 519 | "dtype: int64" 520 | ] 521 | }, 522 | "execution_count": 9, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "# Data Duplicate checking - validation\n", 529 | "data.duplicated(subset = ['full_text', 'id_str', 'retweet_count']).value_counts()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 10, 535 | "id": "b8e3d826", 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "# Define prompt and ingestion script\n", 540 | "def prompt_enrichment(tweet_comment: str) -> str:\n", 541 | " prompt = \\\n", 542 | " f\"\"\"\n", 543 | " Ekstraksi informasi yang dibutuhkan berdasarkan komentar twitter dibawah, dengan response cukup sesuai yang di definisikan tanpa penjelasan tambahan.\n", 544 | "\n", 545 | " komentar_twitter: \"{tweet_comment}\"\n", 546 | "\n", 547 | " Untuk response cukup isi dengan format dibawah.\n", 548 | " named_entity_recognition: [Jawaban anda: cakupan NER sesuai label \"PERSON\" atau \"ORGANIZATION\" saja]\n", 549 | " aspect_sentiment: [Identifikasi verb / noun-phrase hasil dari part-of-speech di dalam komentar, disertai dengan nilai sentiment masing-masing aspect dengan format ]\n", 550 | " 
\"\"\"\n", 551 | " return prompt\n", 552 | "\n", 553 | "def ingest_openai(tweet_comment: str, model_base: str = \"gpt-3.5-turbo\") -> Tuple[str, int]: \n", 554 | " token_usage = 0\n", 555 | " response_extraction = \"\"\n", 556 | " try:\n", 557 | " response = openai.ChatCompletion.create(\n", 558 | " model = model_base, \n", 559 | " messages = [{\"role\" : \"user\", \"content\" : prompt_enrichment(tweet_comment)}], \n", 560 | " temperature = 0.1, max_tokens = 512, top_p = 1.0, \n", 561 | " frequency_penalty = 0.0, presence_penalty = 0.0\n", 562 | " )\n", 563 | " response_extraction = response[\"choices\"][0][\"message\"][\"content\"]\n", 564 | " token_usage = response[\"usage\"][\"total_tokens\"]\n", 565 | " except Exception as E:\n", 566 | " print(f\"[ERROR] - {E}\")\n", 567 | " print(\"Retry with Recursive Func\")\n", 568 | " time.sleep(5)\n", 569 | " ingest_openai(tweet_comment = tweet_comment)\n", 570 | " return response_extraction, token_usage" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 13, 576 | "id": "6447e99d", 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "Total Rows: 1564\n" 584 | ] 585 | } 586 | ], 587 | "source": [ 588 | "# Check total rows\n", 589 | "print(f\"Total Rows: {data.shape[0]}\")" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 11, 595 | "id": "e8a8dc81", 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "name": "stdout", 600 | "output_type": "stream", 601 | "text": [ 602 | "[COMMENT]\n", 603 | "Komisi Pemilihan Umum (KPU) RI telah menetapkan nomor urut terhadap tiga calon presiden dan wakil presiden di Pilpres 2024. 
#BersamaIndonesiaMaju #PrabowoGibranIstimewa #PrabowoGemoy Prabowo Subianto\n", 604 | "[RESULT - Token Usage: 279]\n", 605 | "named_entity_recognition: [\"Komisi Pemilihan Umum (KPU) RI\", \"Prabowo Subianto\"]\n", 606 | "aspect_sentiment: [\"menetapkan nomor urut (positive)\", \"tiga calon presiden dan wakil presiden (neutral)\"]\n" 607 | ] 608 | } 609 | ], 610 | "source": [ 611 | "# Test ingestion\n", 612 | "comment = data['full_text'].sample(1).values[0]\n", 613 | "extraction, token_usage = ingest_openai(tweet_comment = comment)\n", 614 | "print(f\"[COMMENT]\\n{comment}\\n[RESULT - Token Usage: {token_usage}]\\n{extraction}\")" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 14, 620 | "id": "36092a07", 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "# Apply on entire dataset\n", 625 | "final_result_extraction, final_token_usage = [], []" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 15, 631 | "id": "7072d5e2", 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "name": "stderr", 636 | "output_type": "stream", 637 | "text": [ 638 | "Ingestion Start: 1%|▋ | 19/1564 [07:40<6:43:15, 15.66s/it]" 639 | ] 640 | }, 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 646 | "Retry with Recursive Func\n" 647 | ] 648 | }, 649 | { 650 | "name": "stderr", 651 | "output_type": "stream", 652 | "text": [ 653 | "Ingestion Start: 2%|█▍ | 36/1564 [29:06<18:26:39, 43.46s/it]" 654 | ] 655 | }, 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 661 | "Retry with Recursive Func\n" 662 | ] 663 | }, 664 | { 665 | "name": "stderr", 666 | "output_type": "stream", 667 | "text": [ 668 | "Ingestion Start: 4%|██▍ | 63/1564 [44:38<6:37:56, 15.91s/it]" 669 | ] 670 | }, 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 676 | "Retry with Recursive Func\n" 677 | ] 678 | }, 679 | { 680 | "name": "stderr", 681 | "output_type": "stream", 682 | "text": [ 683 | "Ingestion Start: 5%|██▉ | 76/1564 [57:51<6:08:41, 14.87s/it]" 684 | ] 685 | }, 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 691 | "Retry with Recursive Func\n" 692 | ] 693 | }, 694 | { 695 | "name": "stderr", 696 | "output_type": "stream", 697 | "text": [ 698 | "Ingestion Start: 7%|███▊ | 102/1564 [1:19:19<3:43:36, 9.18s/it]" 699 | ] 700 | }, 701 | { 702 | "name": "stdout", 703 | "output_type": "stream", 704 | "text": [ 705 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 706 | "Retry with Recursive Func\n", 707 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 708 | "Retry with Recursive Func\n" 709 | ] 710 | }, 711 | { 712 | "name": "stderr", 713 | "output_type": "stream", 714 | "text": [ 715 | "Ingestion Start: 8%|████▋ | 126/1564 [1:51:21<4:05:45, 10.25s/it]" 716 | ] 717 | }, 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "[ERROR] - The server is overloaded or not ready yet.\n", 723 | "Retry with Recursive Func\n" 724 | ] 725 | }, 726 | { 727 | "name": "stderr", 728 | "output_type": "stream", 729 | "text": [ 730 | "Ingestion Start: 10%|█████▌ | 149/1564 [2:08:17<4:53:27, 12.44s/it]" 731 | ] 732 | }, 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "[ERROR] - The server is overloaded or not ready yet.\n", 738 | "Retry with Recursive Func\n" 739 | ] 740 | }, 741 | { 742 | "name": "stderr", 743 | "output_type": "stream", 744 | "text": [ 745 | "Ingestion Start: 11%|██████▍ | 174/1564 [2:28:49<6:07:45, 15.87s/it]" 746 | ] 747 | }, 748 | { 749 | "name": "stdout", 750 | "output_type": "stream", 751 | "text": [ 752 | "[ERROR] - The server is overloaded or not ready yet.\n", 753 | "Retry with Recursive Func\n" 754 | ] 755 | }, 756 | { 757 | "name": "stderr", 758 | "output_type": "stream", 759 | "text": [ 760 | "Ingestion Start: 11%|██████▍ | 177/1564 [2:31:18<12:27:23, 32.33s/it]" 761 | ] 762 | }, 763 | { 764 | "name": "stdout", 765 | "output_type": "stream", 766 | "text": [ 767 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 768 | "Retry with Recursive Func\n" 769 | ] 770 | }, 771 | { 772 | "name": "stderr", 773 | "output_type": "stream", 774 | "text": [ 775 | "Ingestion Start: 12%|███████▏ | 195/1564 [2:46:45<8:09:27, 21.45s/it]" 776 | ] 777 | }, 778 | { 779 | "name": "stdout", 780 | "output_type": "stream", 781 | "text": [ 782 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 783 | "Retry with Recursive Func\n" 784 | ] 785 | }, 786 | { 787 | "name": "stderr", 788 | "output_type": "stream", 789 | "text": [ 790 | "Ingestion Start: 13%|███████▎ | 199/1564 [2:57:21<27:32:33, 72.64s/it]" 791 | ] 792 | }, 793 | { 794 | "name": "stdout", 795 | "output_type": "stream", 796 | "text": [ 797 | "[ERROR] - The server is overloaded or not ready yet.\n", 798 | "Retry with Recursive Func\n" 799 | ] 800 | }, 801 | { 802 | "name": "stderr", 803 | "output_type": "stream", 804 | "text": [ 805 | "Ingestion Start: 13%|███████▎ | 202/1564 [2:58:22<14:36:16, 38.60s/it]" 806 | ] 807 | }, 808 | { 809 | "name": "stdout", 810 | "output_type": "stream", 811 | "text": [ 812 | "[ERROR] - The server is overloaded or not ready yet.\n", 813 | "Retry with Recursive Func\n" 814 | ] 815 | }, 816 | { 817 | "name": "stderr", 818 | "output_type": "stream", 819 | "text": [ 820 | "Ingestion Start: 13%|███████▍ | 205/1564 [3:00:05<13:06:18, 34.72s/it]" 821 | ] 822 | }, 823 | { 824 | "name": "stdout", 825 | "output_type": "stream", 826 | "text": [ 827 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 828 | "Retry with Recursive Func\n", 829 | "[ERROR] - The server is overloaded or not ready yet.\n", 830 | "Retry with Recursive Func\n", 831 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 832 | "Retry with Recursive Func\n", 833 | "[ERROR] - The server is overloaded or not ready yet.\n", 834 | "Retry with Recursive Func\n" 835 | ] 836 | }, 837 | { 838 | "name": "stderr", 839 | "output_type": "stream", 840 | "text": [ 841 | "Ingestion Start: 14%|████████ | 218/1564 [3:35:01<6:53:09, 18.42s/it]" 842 | ] 843 | }, 844 | { 845 | "name": "stdout", 846 | "output_type": "stream", 847 | "text": [ 848 | "[ERROR] - The server is overloaded or not ready yet.\n", 849 | "Retry with Recursive Func\n" 850 | ] 851 | }, 852 | { 853 | "name": "stderr", 854 | "output_type": "stream", 855 | "text": [ 856 | "Ingestion Start: 15%|████████▌ | 230/1564 [3:43:20<5:50:06, 15.75s/it]" 857 | ] 858 | }, 859 | { 860 | "name": "stdout", 861 | "output_type": "stream", 862 | "text": [ 863 | "[ERROR] - The server is overloaded or not ready yet.\n", 864 | "Retry with Recursive Func\n" 865 | ] 866 | }, 867 | { 868 | "name": "stderr", 869 | "output_type": "stream", 870 | "text": [ 871 | "Ingestion Start: 16%|█████████▎ | 252/1564 [3:53:26<6:15:50, 17.19s/it]" 872 | ] 873 | }, 874 | { 875 | "name": "stdout", 876 | "output_type": "stream", 877 | "text": [ 878 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 879 | "Retry with Recursive Func\n" 880 | ] 881 | }, 882 | { 883 | "name": "stderr", 884 | "output_type": "stream", 885 | "text": [ 886 | "Ingestion Start: 17%|█████████▋ | 265/1564 [4:09:32<28:04:03, 77.79s/it]" 887 | ] 888 | }, 889 | { 890 | "name": "stdout", 891 | "output_type": "stream", 892 | "text": [ 893 | "[ERROR] - The server is overloaded or not ready yet.\n", 894 | "Retry with Recursive Func\n" 895 | ] 896 | }, 897 | { 898 | "name": "stderr", 899 | "output_type": "stream", 900 | "text": [ 901 | "Ingestion Start: 17%|█████████▊ | 269/1564 [4:17:01<24:25:14, 67.89s/it]" 902 | ] 903 | }, 904 | { 905 | "name": "stdout", 906 | "output_type": "stream", 907 | "text": [ 908 | "[ERROR] - The server is overloaded or not ready yet.\n", 909 | "Retry with Recursive Func\n", 910 | "[ERROR] - The server is overloaded or not ready yet.\n", 911 | "Retry with Recursive Func\n", 912 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 913 | "Retry with Recursive Func\n" 914 | ] 915 | }, 916 | { 917 | "name": "stderr", 918 | "output_type": "stream", 919 | "text": [ 920 | "Ingestion Start: 20%|███████████▎ | 305/1564 [4:38:32<4:02:15, 11.54s/it]" 921 | ] 922 | }, 923 | { 924 | "name": "stdout", 925 | "output_type": "stream", 926 | "text": [ 927 | "[ERROR] - The server is overloaded or not ready yet.\n", 928 | "Retry with Recursive Func\n" 929 | ] 930 | }, 931 | { 932 | "name": "stderr", 933 | "output_type": "stream", 934 | "text": [ 935 | "Ingestion Start: 20%|███████████▌ | 311/1564 [4:41:13<4:17:58, 12.35s/it]" 936 | ] 937 | }, 938 | { 939 | "name": "stdout", 940 | "output_type": "stream", 941 | "text": [ 942 | "[ERROR] - The server is overloaded or not ready yet.\n", 943 | "Retry with Recursive Func\n" 944 | ] 945 | }, 946 | { 947 | "name": "stderr", 948 | "output_type": "stream", 949 | "text": [ 950 | "Ingestion Start: 20%|███████████▍ | 313/1564 [4:47:41<30:43:09, 88.40s/it]" 951 | ] 952 | }, 953 | { 954 | "name": "stdout", 955 | "output_type": "stream", 956 | "text": [ 957 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 958 | "Retry with Recursive Func\n" 959 | ] 960 | }, 961 | { 962 | "name": "stderr", 963 | "output_type": "stream", 964 | "text": [ 965 | "Ingestion Start: 21%|████████████ | 326/1564 [4:59:41<4:37:20, 13.44s/it]" 966 | ] 967 | }, 968 | { 969 | "name": "stdout", 970 | "output_type": "stream", 971 | "text": [ 972 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 973 | "Retry with Recursive Func\n" 974 | ] 975 | }, 976 | { 977 | "name": "stderr", 978 | "output_type": "stream", 979 | "text": [ 980 | "Ingestion Start: 22%|████████████▊ | 345/1564 [5:13:19<3:59:37, 11.79s/it]" 981 | ] 982 | }, 983 | { 984 | "name": "stdout", 985 | "output_type": "stream", 986 | "text": [ 987 | "[ERROR] - The server is overloaded or not ready yet.\n", 988 | "Retry with Recursive Func\n" 989 | ] 990 | }, 991 | { 992 | "name": "stderr", 993 | "output_type": "stream", 994 | "text": [ 995 | "Ingestion Start: 23%|█████████████▍ | 362/1564 [5:17:50<3:48:55, 11.43s/it]" 996 | ] 997 | }, 998 | { 999 | "name": "stdout", 1000 | "output_type": "stream", 1001 | "text": [ 1002 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1003 | "Retry with Recursive Func\n" 1004 | ] 1005 | }, 1006 | { 1007 | "name": "stderr", 1008 | "output_type": "stream", 1009 | "text": [ 1010 | "\r", 1011 | "Ingestion Start: 23%|████████████▉ | 363/1564 [5:28:02<63:54:04, 191.54s/it]" 1012 | ] 1013 | }, 1014 | { 1015 | "name": "stdout", 1016 | "output_type": "stream", 1017 | "text": [ 1018 | "[ERROR] - The server is overloaded or not ready yet.\n", 1019 | "Retry with Recursive Func\n" 1020 | ] 1021 | }, 1022 | { 1023 | "name": "stderr", 1024 | "output_type": "stream", 1025 | "text": [ 1026 | "Ingestion Start: 23%|█████████████▏ | 367/1564 [5:40:14<67:54:21, 204.23s/it]" 1027 | ] 1028 | }, 1029 | { 1030 | "name": "stdout", 1031 | "output_type": "stream", 1032 | "text": [ 1033 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1034 | "Retry with Recursive Func\n" 1035 | ] 1036 | }, 1037 | { 1038 | "name": "stderr", 1039 | "output_type": "stream", 1040 | "text": [ 1041 | "Ingestion Start: 24%|█████████████▌ | 372/1564 [5:51:50<30:17:52, 91.50s/it]" 1042 | ] 1043 | }, 1044 | { 1045 | "name": "stdout", 1046 | "output_type": "stream", 1047 | "text": [ 1048 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1049 | "Retry with Recursive Func\n" 1050 | ] 1051 | }, 1052 | { 1053 | "name": "stderr", 1054 | "output_type": "stream", 1055 | "text": [ 1056 | "Ingestion Start: 25%|██████████████▋ | 395/1564 [6:11:23<5:00:03, 15.40s/it]" 1057 | ] 1058 | }, 1059 | { 1060 | "name": "stdout", 1061 | "output_type": "stream", 1062 | "text": [ 1063 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1064 | "Retry with Recursive Func\n" 1065 | ] 1066 | }, 1067 | { 1068 | "name": "stderr", 1069 | "output_type": "stream", 1070 | "text": [ 1071 | "Ingestion Start: 26%|██████████████▌ | 400/1564 [6:22:11<16:52:25, 52.19s/it]" 1072 | ] 1073 | }, 1074 | { 1075 | "name": "stdout", 1076 | "output_type": "stream", 1077 | "text": [ 1078 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1079 | "Retry with Recursive Func\n" 1080 | ] 1081 | }, 1082 | { 1083 | "name": "stderr", 1084 | "output_type": "stream", 1085 | "text": [ 1086 | "\r", 1087 | "Ingestion Start: 26%|██████████████▎ | 401/1564 [6:32:22<71:00:11, 219.79s/it]" 1088 | ] 1089 | }, 1090 | { 1091 | "name": "stdout", 1092 | "output_type": "stream", 1093 | "text": [ 1094 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1095 | "Retry with Recursive Func\n" 1096 | ] 1097 | }, 1098 | { 1099 | "name": "stderr", 1100 | "output_type": "stream", 1101 | "text": [ 1102 | "Ingestion Start: 28%|████████████████▏ | 438/1564 [6:47:18<2:08:55, 6.87s/it]" 1103 | ] 1104 | }, 1105 | { 1106 | "name": "stdout", 1107 | "output_type": "stream", 1108 | "text": [ 1109 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1110 | "Retry with Recursive Func\n" 1111 | ] 1112 | }, 1113 | { 1114 | "name": "stderr", 1115 | "output_type": "stream", 1116 | "text": [ 1117 | "Ingestion Start: 29%|████████████████▌ | 447/1564 [6:58:12<4:59:52, 16.11s/it]" 1118 | ] 1119 | }, 1120 | { 1121 | "name": "stdout", 1122 | "output_type": "stream", 1123 | "text": [ 1124 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1125 | "Retry with Recursive Func\n" 1126 | ] 1127 | }, 1128 | { 1129 | "name": "stderr", 1130 | "output_type": "stream", 1131 | "text": [ 1132 | "Ingestion Start: 31%|█████████████████▉ | 484/1564 [7:12:20<2:08:16, 7.13s/it]" 1133 | ] 1134 | }, 1135 | { 1136 | "name": "stdout", 1137 | "output_type": "stream", 1138 | "text": [ 1139 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1140 | "Retry with Recursive Func\n" 1141 | ] 1142 | }, 1143 | { 1144 | "name": "stderr", 1145 | "output_type": "stream", 1146 | "text": [ 1147 | "Ingestion Start: 38%|██████████████████████▎ | 601/1564 [7:34:58<2:50:47, 10.64s/it]" 1148 | ] 1149 | }, 1150 | { 1151 | "name": "stdout", 1152 | "output_type": "stream", 1153 | "text": [ 1154 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1155 | "Retry with Recursive Func\n" 1156 | ] 1157 | }, 1158 | { 1159 | "name": "stderr", 1160 | "output_type": "stream", 1161 | "text": [ 1162 | "Ingestion Start: 39%|██████████████████████ | 606/1564 [7:45:28<13:17:59, 49.98s/it]" 1163 | ] 1164 | }, 1165 | { 1166 | "name": "stdout", 1167 | "output_type": "stream", 1168 | "text": [ 1169 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1170 | "Retry with Recursive Func\n" 1171 | ] 1172 | }, 1173 | { 1174 | "name": "stderr", 1175 | "output_type": "stream", 1176 | "text": [ 1177 | "Ingestion Start: 39%|█████████████████████▊ | 609/1564 [7:55:46<28:57:12, 109.14s/it]" 1178 | ] 1179 | }, 1180 | { 1181 | "name": "stdout", 1182 | "output_type": "stream", 1183 | "text": [ 1184 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1185 | "Retry with Recursive Func\n" 1186 | ] 1187 | }, 1188 | { 1189 | "name": "stderr", 1190 | "output_type": "stream", 1191 | "text": [ 1192 | "Ingestion Start: 40%|███████████████████████ | 623/1564 [8:15:31<3:25:04, 13.08s/it]" 1193 | ] 1194 | }, 1195 | { 1196 | "name": "stdout", 1197 | "output_type": "stream", 1198 | "text": [ 1199 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1200 | "Retry with Recursive Func\n" 1201 | ] 1202 | }, 1203 | { 1204 | "name": "stderr", 1205 | "output_type": "stream", 1206 | "text": [ 1207 | "Ingestion Start: 42%|████████████████████████▌ | 662/1564 [8:38:18<1:34:09, 6.26s/it]" 1208 | ] 1209 | }, 1210 | { 1211 | "name": "stdout", 1212 | "output_type": "stream", 1213 | "text": [ 1214 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1215 | "Retry with Recursive Func\n" 1216 | ] 1217 | }, 1218 | { 1219 | "name": "stderr", 1220 | "output_type": "stream", 1221 | "text": [ 1222 | "Ingestion Start: 45%|██████████████████████████▎ | 710/1564 [9:00:09<3:27:47, 14.60s/it]" 1223 | ] 1224 | }, 1225 | { 1226 | "name": "stdout", 1227 | "output_type": "stream", 1228 | "text": [ 1229 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1230 | "Retry with Recursive Func\n" 1231 | ] 1232 | }, 1233 | { 1234 | "name": "stderr", 1235 | "output_type": "stream", 1236 | "text": [ 1237 | "Ingestion Start: 48%|███████████████████████████▋ | 746/1564 [9:41:35<9:45:53, 42.97s/it]" 1238 | ] 1239 | }, 1240 | { 1241 | "name": "stdout", 1242 | "output_type": "stream", 1243 | "text": [ 1244 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1245 | "Retry with Recursive Func\n" 1246 | ] 1247 | }, 1248 | { 1249 | "name": "stderr", 1250 | "output_type": "stream", 1251 | "text": [ 1252 | "Ingestion Start: 48%|███████████████████████████▉ | 753/1564 [9:52:47<8:15:51, 36.69s/it]" 1253 | ] 1254 | }, 1255 | { 1256 | "name": "stdout", 1257 | "output_type": "stream", 1258 | "text": [ 1259 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1260 | "Retry with Recursive Func\n" 1261 | ] 1262 | }, 1263 | { 1264 | "name": "stderr", 1265 | "output_type": "stream", 1266 | "text": [ 1267 | "Ingestion Start: 52%|█████████████████████████████▌ | 811/1564 [10:20:21<2:18:22, 11.03s/it]" 1268 | ] 1269 | }, 1270 | { 1271 | "name": "stdout", 1272 | "output_type": "stream", 1273 | "text": [ 1274 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1275 | "Retry with Recursive Func\n" 1276 | ] 1277 | }, 1278 | { 1279 | "name": "stderr", 1280 | "output_type": "stream", 1281 | "text": [ 1282 | "Ingestion Start: 57%|████████████████████████████████▍ | 889/1564 [10:55:13<1:22:39, 7.35s/it]" 1283 | ] 1284 | }, 1285 | { 1286 | "name": "stdout", 1287 | "output_type": "stream", 1288 | "text": [ 1289 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1290 | "Retry with Recursive Func\n" 1291 | ] 1292 | }, 1293 | { 1294 | "name": "stderr", 1295 | "output_type": "stream", 1296 | "text": [ 1297 | "Ingestion Start: 57%|████████████████████████████████▌ | 895/1564 [11:06:15<7:19:04, 39.38s/it]" 1298 | ] 1299 | }, 1300 | { 1301 | "name": "stdout", 1302 | "output_type": "stream", 1303 | "text": [ 1304 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1305 | "Retry with Recursive Func\n" 1306 | ] 1307 | }, 1308 | { 1309 | "name": "stderr", 1310 | "output_type": "stream", 1311 | "text": [ 1312 | "Ingestion Start: 62%|███████████████████████████████████▏ | 965/1564 [11:32:20<1:40:08, 10.03s/it]" 1313 | ] 1314 | }, 1315 | { 1316 | "name": "stdout", 1317 | "output_type": "stream", 1318 | "text": [ 1319 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1320 | "Retry with Recursive Func\n" 1321 | ] 1322 | }, 1323 | { 1324 | "name": "stderr", 1325 | "output_type": "stream", 1326 | "text": [ 1327 | "Ingestion Start: 62%|███████████████████████████████████▍ | 972/1564 [11:42:57<4:19:43, 26.32s/it]" 1328 | ] 1329 | }, 1330 | { 1331 | "name": "stdout", 1332 | "output_type": "stream", 1333 | "text": [ 1334 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1335 | "Retry with Recursive Func\n" 1336 | ] 1337 | }, 1338 | { 1339 | "name": "stderr", 1340 | "output_type": "stream", 1341 | "text": [ 1342 | "Ingestion Start: 64%|███████████████████████████████████▋ | 998/1564 [12:13:14<12:00:17, 76.36s/it]" 1343 | ] 1344 | }, 1345 | { 1346 | "name": "stdout", 1347 | "output_type": "stream", 1348 | "text": [ 1349 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1350 | "Retry with Recursive Func\n" 1351 | ] 1352 | }, 1353 | { 1354 | "name": "stderr", 1355 | "output_type": "stream", 1356 | "text": [ 1357 | "Ingestion Start: 64%|██████████████████████████████████▌ | 1000/1564 [12:23:35<26:26:48, 168.81s/it]" 1358 | ] 1359 | }, 1360 | { 1361 | "name": "stdout", 1362 | "output_type": "stream", 1363 | "text": [ 1364 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1365 | "Retry with Recursive Func\n" 1366 | ] 1367 | }, 1368 | { 1369 | "name": "stderr", 1370 | "output_type": "stream", 1371 | "text": [ 1372 | "Ingestion Start: 65%|████████████████████████████████████▍ | 1019/1564 [12:36:38<1:47:35, 11.84s/it]" 1373 | ] 1374 | }, 1375 | { 1376 | "name": "stdout", 1377 | "output_type": "stream", 1378 | "text": [ 1379 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1380 | "Retry with Recursive Func\n" 1381 | ] 1382 | }, 1383 | { 1384 | "name": "stderr", 1385 | "output_type": "stream", 1386 | "text": [ 1387 | "Ingestion Start: 65%|████████████████████████████████████▋ | 1024/1564 [12:47:09<7:29:42, 49.97s/it]" 1388 | ] 1389 | }, 1390 | { 1391 | "name": "stdout", 1392 | "output_type": "stream", 1393 | "text": [ 1394 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1395 | "Retry with Recursive Func\n" 1396 | ] 1397 | }, 1398 | { 1399 | "name": "stderr", 1400 | "output_type": "stream", 1401 | "text": [ 1402 | "Ingestion Start: 68%|██████████████████████████████████████▎ | 1071/1564 [13:28:24<2:33:12, 18.65s/it]" 1403 | ] 1404 | }, 1405 | { 1406 | "name": "stdout", 1407 | "output_type": "stream", 1408 | "text": [ 1409 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1410 | "Retry with Recursive Func\n" 1411 | ] 1412 | }, 1413 | { 1414 | "name": "stderr", 1415 | "output_type": "stream", 1416 | "text": [ 1417 | "Ingestion Start: 69%|██████████████████████████████████████▋ | 1079/1564 [13:39:21<3:10:21, 23.55s/it]" 1418 | ] 1419 | }, 1420 | { 1421 | "name": "stdout", 1422 | "output_type": "stream", 1423 | "text": [ 1424 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1425 | "Retry with Recursive Func\n" 1426 | ] 1427 | }, 1428 | { 1429 | "name": "stderr", 1430 | "output_type": "stream", 1431 | "text": [ 1432 | "Ingestion Start: 71%|███████████████████████████████████████▌ | 1104/1564 [13:56:41<1:19:00, 10.31s/it]" 1433 | ] 1434 | }, 1435 | { 1436 | "name": "stdout", 1437 | "output_type": "stream", 1438 | "text": [ 1439 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1440 | "Retry with Recursive Func\n" 1441 | ] 1442 | }, 1443 | { 1444 | "name": "stderr", 1445 | "output_type": "stream", 1446 | "text": [ 1447 | "Ingestion Start: 72%|████████████████████████████████████████▍ | 1128/1564 [14:11:15<1:14:02, 10.19s/it]" 1448 | ] 1449 | }, 1450 | { 1451 | "name": "stdout", 1452 | "output_type": "stream", 1453 | "text": [ 1454 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1455 | "Retry with Recursive Func\n" 1456 | ] 1457 | }, 1458 | { 1459 | "name": "stderr", 1460 | "output_type": "stream", 1461 | "text": [ 1462 | "Ingestion Start: 73%|████████████████████████████████████████▊ | 1139/1564 [14:22:51<1:52:28, 15.88s/it]" 1463 | ] 1464 | }, 1465 | { 1466 | "name": "stdout", 1467 | "output_type": "stream", 1468 | "text": [ 1469 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1470 | "Retry with Recursive Func\n" 1471 | ] 1472 | }, 1473 | { 1474 | "name": "stderr", 1475 | "output_type": "stream", 1476 | "text": [ 1477 | "Ingestion Start: 74%|█████████████████████████████████████████▍ | 1158/1564 [14:36:00<1:24:59, 12.56s/it]" 1478 | ] 1479 | }, 1480 | { 1481 | "name": "stdout", 1482 | "output_type": "stream", 1483 | "text": [ 1484 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1485 | "Retry with Recursive Func\n" 1486 | ] 1487 | }, 1488 | { 1489 | "name": "stderr", 1490 | "output_type": "stream", 1491 | "text": [ 1492 | "Ingestion Start: 78%|█████████████████████████████████████████████▍ | 1226/1564 [14:55:12<40:19, 7.16s/it]" 1493 | ] 1494 | }, 1495 | { 1496 | "name": "stdout", 1497 | "output_type": "stream", 1498 | "text": [ 1499 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1500 | "Retry with Recursive Func\n" 1501 | ] 1502 | }, 1503 | { 1504 | "name": "stderr", 1505 | "output_type": "stream", 1506 | "text": [ 1507 | "Ingestion Start: 81%|██████████████████████████████████████████████▉ | 1265/1564 [15:09:59<38:10, 7.66s/it]" 1508 | ] 1509 | }, 1510 | { 1511 | "name": "stdout", 1512 | "output_type": "stream", 1513 | "text": [ 1514 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1515 | "Retry with Recursive Func\n" 1516 | ] 1517 | }, 1518 | { 1519 | "name": "stderr", 1520 | "output_type": "stream", 1521 | "text": [ 1522 | "Ingestion Start: 81%|█████████████████████████████████████████████▌ | 1274/1564 [15:21:10<1:29:41, 18.56s/it]" 1523 | ] 1524 | }, 1525 | { 1526 | "name": "stdout", 1527 | "output_type": "stream", 1528 | "text": [ 1529 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1530 | "Retry with Recursive Func\n" 1531 | ] 1532 | }, 1533 | { 1534 | "name": "stderr", 1535 | "output_type": "stream", 1536 | "text": [ 1537 | "Ingestion Start: 83%|███████████████████████████████████████████████▉ | 1291/1564 [15:33:34<34:34, 7.60s/it]" 1538 | ] 1539 | }, 1540 | { 1541 | "name": "stdout", 1542 | "output_type": "stream", 1543 | "text": [ 1544 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1545 | "Retry with Recursive Func\n" 1546 | ] 1547 | }, 1548 | { 1549 | "name": "stderr", 1550 | "output_type": "stream", 1551 | "text": [ 1552 | "Ingestion Start: 84%|████████████████████████████████████████████████▍ | 1306/1564 [15:45:26<24:32, 5.71s/it]" 1553 | ] 1554 | }, 1555 | { 1556 | "name": "stdout", 1557 | "output_type": "stream", 1558 | "text": [ 1559 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1560 | "Retry with Recursive Func\n" 1561 | ] 1562 | }, 1563 | { 1564 | "name": "stderr", 1565 | "output_type": "stream", 1566 | "text": [ 1567 | "Ingestion Start: 87%|██████████████████████████████████████████████████▏ | 1354/1564 [16:02:15<32:11, 9.20s/it]" 1568 | ] 1569 | }, 1570 | { 1571 | "name": "stdout", 1572 | "output_type": "stream", 1573 | "text": [ 1574 | "[ERROR] - HTTP code 502 from API (\r\n", 1575 | "502 Bad Gateway\r\n", 1576 | "\r\n", 1577 | "

502 Bad Gateway

\r\n", 1578 | "
cloudflare
\r\n", 1579 | "\r\n", 1580 | "\r\n", 1581 | ")\n", 1582 | "Retry with Recursive Func\n" 1583 | ] 1584 | }, 1585 | { 1586 | "name": "stderr", 1587 | "output_type": "stream", 1588 | "text": [ 1589 | "\r", 1590 | "Ingestion Start: 87%|████████████████████████████████████████████████▌ | 1355/1564 [16:03:45<1:56:09, 33.35s/it]" 1591 | ] 1592 | }, 1593 | { 1594 | "name": "stdout", 1595 | "output_type": "stream", 1596 | "text": [ 1597 | "[ERROR] - HTTP code 502 from API (\r\n", 1598 | "502 Bad Gateway\r\n", 1599 | "\r\n", 1600 | "

502 Bad Gateway

\r\n", 1601 | "
cloudflare
\r\n", 1602 | "\r\n", 1603 | "\r\n", 1604 | ")\n", 1605 | "Retry with Recursive Func\n" 1606 | ] 1607 | }, 1608 | { 1609 | "name": "stderr", 1610 | "output_type": "stream", 1611 | "text": [ 1612 | "Ingestion Start: 87%|██████████████████████████████████████████████████▍ | 1360/1564 [16:04:37<43:57, 12.93s/it]" 1613 | ] 1614 | }, 1615 | { 1616 | "name": "stdout", 1617 | "output_type": "stream", 1618 | "text": [ 1619 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1620 | "Retry with Recursive Func\n" 1621 | ] 1622 | }, 1623 | { 1624 | "name": "stderr", 1625 | "output_type": "stream", 1626 | "text": [ 1627 | "\r", 1628 | "Ingestion Start: 87%|██████████████████████████████████████████████▉ | 1361/1564 [16:14:45<10:46:56, 191.21s/it]" 1629 | ] 1630 | }, 1631 | { 1632 | "name": "stdout", 1633 | "output_type": "stream", 1634 | "text": [ 1635 | "[ERROR] - HTTP code 502 from API (\r\n", 1636 | "502 Bad Gateway\r\n", 1637 | "\r\n", 1638 | "

502 Bad Gateway

\r\n", 1639 | "
cloudflare
\r\n", 1640 | "\r\n", 1641 | "\r\n", 1642 | ")\n", 1643 | "Retry with Recursive Func\n" 1644 | ] 1645 | }, 1646 | { 1647 | "name": "stderr", 1648 | "output_type": "stream", 1649 | "text": [ 1650 | "Ingestion Start: 94%|██████████████████████████████████████████████████████▋ | 1473/1564 [16:30:36<10:48, 7.13s/it]" 1651 | ] 1652 | }, 1653 | { 1654 | "name": "stdout", 1655 | "output_type": "stream", 1656 | "text": [ 1657 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1658 | "Retry with Recursive Func\n" 1659 | ] 1660 | }, 1661 | { 1662 | "name": "stderr", 1663 | "output_type": "stream", 1664 | "text": [ 1665 | "Ingestion Start: 95%|████████████████████████████████████████████████████▉ | 1478/1564 [16:41:08<1:10:51, 49.44s/it]" 1666 | ] 1667 | }, 1668 | { 1669 | "name": "stdout", 1670 | "output_type": "stream", 1671 | "text": [ 1672 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1673 | "Retry with Recursive Func\n" 1674 | ] 1675 | }, 1676 | { 1677 | "name": "stderr", 1678 | "output_type": "stream", 1679 | "text": [ 1680 | "Ingestion Start: 95%|█████████████████████████████████████████████████████▏ | 1486/1564 [16:55:33<1:06:36, 51.23s/it]" 1681 | ] 1682 | }, 1683 | { 1684 | "name": "stdout", 1685 | "output_type": "stream", 1686 | "text": [ 1687 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. 
(read timeout=600)\n", 1688 | "Retry with Recursive Func\n" 1689 | ] 1690 | }, 1691 | { 1692 | "name": "stderr", 1693 | "output_type": "stream", 1694 | "text": [ 1695 | "Ingestion Start: 98%|████████████████████████████████████████████████████████▋ | 1530/1564 [17:19:09<12:27, 21.99s/it]" 1696 | ] 1697 | }, 1698 | { 1699 | "name": "stdout", 1700 | "output_type": "stream", 1701 | "text": [ 1702 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1703 | "Retry with Recursive Func\n" 1704 | ] 1705 | }, 1706 | { 1707 | "name": "stderr", 1708 | "output_type": "stream", 1709 | "text": [ 1710 | "Ingestion Start: 99%|█████████████████████████████████████████████████████████▎| 1547/1564 [17:31:34<02:05, 7.38s/it]" 1711 | ] 1712 | }, 1713 | { 1714 | "name": "stdout", 1715 | "output_type": "stream", 1716 | "text": [ 1717 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n", 1718 | "Retry with Recursive Func\n" 1719 | ] 1720 | }, 1721 | { 1722 | "name": "stderr", 1723 | "output_type": "stream", 1724 | "text": [ 1725 | "Ingestion Start: 100%|██████████████████████████████████████████████████████████| 1564/1564 [17:44:08<00:00, 40.82s/it]\n" 1726 | ] 1727 | } 1728 | ], 1729 | "source": [ 1730 | "# Iter and push into array\n", 1731 | "for comment in tqdm(data[\"full_text\"], desc = \"Ingestion Start\"):\n", 1732 | " result, token = ingest_openai(tweet_comment = comment)\n", 1733 | " final_result_extraction.append(result)\n", 1734 | " final_token_usage.append(token)" 1735 | ] 1736 | }, 1737 | { 1738 | "cell_type": "code", 1739 | "execution_count": 16, 1740 | "id": "290b2686", 1741 | "metadata": {}, 1742 | "outputs": [ 1743 | { 1744 | "data": { 1745 | "text/plain": [ 1746 | "(1564, 1564)" 1747 | ] 1748 | }, 1749 | "execution_count": 16, 1750 | "metadata": {}, 1751 | "output_type": "execute_result" 1752 | } 1753 | ], 1754 | "source": [ 1755 
| "len(final_result_extraction), len(final_token_usage)" 1756 | ] 1757 | }, 1758 | { 1759 | "cell_type": "code", 1760 | "execution_count": 19, 1761 | "id": "06534b4e", 1762 | "metadata": {}, 1763 | "outputs": [], 1764 | "source": [ 1765 | "# Assign result into dataframe\n", 1766 | "data['result extraction'] = final_result_extraction\n", 1767 | "data['token usage'] = final_token_usage" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": 20, 1773 | "id": "14520371", 1774 | "metadata": {}, 1775 | "outputs": [], 1776 | "source": [ 1777 | "# Save into dataframe\n", 1778 | "data.to_csv(\"../dataset/data_twitter_pemilu_2024_enrich [V2].csv\", index = False)" 1779 | ] 1780 | } 1781 | ], 1782 | "metadata": { 1783 | "kernelspec": { 1784 | "display_name": "Python 3 (ipykernel)", 1785 | "language": "python", 1786 | "name": "python3" 1787 | }, 1788 | "language_info": { 1789 | "codemirror_mode": { 1790 | "name": "ipython", 1791 | "version": 3 1792 | }, 1793 | "file_extension": ".py", 1794 | "mimetype": "text/x-python", 1795 | "name": "python", 1796 | "nbconvert_exporter": "python", 1797 | "pygments_lexer": "ipython3", 1798 | "version": "3.9.7" 1799 | } 1800 | }, 1801 | "nbformat": 4, 1802 | "nbformat_minor": 5 1803 | } 1804 | --------------------------------------------------------------------------------