├── cache
└── cache_db.db
├── dataset
├── processed.xlsx
├── Full Dataset Kotor updated 2.0.xlsx
└── README.md
├── pipeline-diagram-stable
├── pipeline.png
└── interface.jpeg
├── src
├── secret.py
└── extract_enrichment.py
├── dags
├── crawler
│ ├── twitter_scrapper.py
│ ├── news_scrapper.py
│ └── final_dataset
│ │ └── twitter_prabowo_subianto.csv
└── preprocess
│ ├── stopwords.txt
│ ├── extraction.py
│ ├── process.py
│ └── enrich.py
├── README.md
├── .gitignore
├── requirements-data-engineer.txt
├── app.py
├── LICENSE
└── notebook
├── enrichment.ipynb
└── enrichment - 2.ipynb
/cache/cache_db.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/cache/cache_db.db
--------------------------------------------------------------------------------
/dataset/processed.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/dataset/processed.xlsx
--------------------------------------------------------------------------------
/pipeline-diagram-stable/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/pipeline-diagram-stable/pipeline.png
--------------------------------------------------------------------------------
/src/secret.py:
--------------------------------------------------------------------------------
"""Load secret credentials (currently the OpenAI API key) from a .env file."""
import os
from dotenv import load_dotenv

# Populate os.environ from a .env file in the working directory (no-op if absent).
load_dotenv()
# None when OPENAI_API_KEY is unset — callers must handle the missing-key case.
OPENAI_KEY = os.getenv("OPENAI_API_KEY", default = None)
--------------------------------------------------------------------------------
/pipeline-diagram-stable/interface.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/pipeline-diagram-stable/interface.jpeg
--------------------------------------------------------------------------------
/dataset/Full Dataset Kotor updated 2.0.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NnA301023/ElectionAspectAnalyzer/HEAD/dataset/Full Dataset Kotor updated 2.0.xlsx
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | ### Disclaimer
2 |
3 | Dataset merupakan hasil pengambilan dari twitter yang tersedia di kaggle dengan notebook scrapping data tertera.
4 | Special regards untuk mas [@andree cy](https://www.kaggle.com/andreecy) dan mas [@Mc Affandi](https://www.kaggle.com/mcaffandi) atas hasil scrapping data nya, sehingga bisa kami manfaatkan untuk kelancaran pengerjaan final project kami.
5 |
6 | - https://www.kaggle.com/code/andreecy/paslon-2024-tweet-data/notebook
--------------------------------------------------------------------------------
/dags/crawler/twitter_scrapper.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import tweepy
3 | import pandas as pd
4 |
5 |
class TwitterScrapper:
    """Scrape tweets matching a keyword through the Twitter (tweepy) API.

    Matching tweets are appended to ``<keyword>.csv`` as they arrive and the
    full result is also returned as a pandas DataFrame.
    """

    def __init__(self, token, secret, api_key, api_secret):
        # OAuth 1.0a user-context authentication.
        auth = tweepy.OAuthHandler(api_key, api_secret)
        auth.set_access_token(token, secret)
        # wait_on_rate_limit=True makes tweepy sleep through rate-limit windows.
        self.api = tweepy.API(auth, wait_on_rate_limit=True)

    def search(self, keyword):
        """Search Indonesian-language tweets for *keyword* and persist them.

        Each tweet is written incrementally to ``<keyword>.csv`` (append mode,
        so repeated runs accumulate) and collected into the returned DataFrame
        with columns waktu/id/username/teks.
        """
        c, i, u, t = [], [], [], []
        # Context manager guarantees the CSV handle is closed — the original
        # never closed it, risking lost buffered rows on exit.
        with open(
            keyword + ".csv", mode="a+",
            newline="", encoding="utf-8"
        ) as output_file:
            csv_file = csv.writer(output_file)

            # BUGFIX: tweepy.Cursos -> tweepy.Cursor (the misspelled name
            # raised AttributeError before a single tweet was fetched).
            for tweet in tweepy.Cursor(
                self.api.search_tweets,
                q=keyword, count=15, lang="id",
                start_time="2023-01-01T00:00:00Z", end_time="2023-11-30T23:59:59Z"
            ).items():
                c.append(tweet.created_at)
                i.append(tweet.id)
                u.append(tweet.user.name)
                t.append(tweet.text.encode("utf-8"))
                # Write row-by-row so partial progress survives interruptions.
                csv_file.writerow(
                    [tweet.created_at, tweet.id, tweet.user.name, tweet.text.encode("utf-8")]
                )

        dictTweets = {"waktu": c, "id": i, "username": u, "teks": t}
        df = pd.DataFrame(dictTweets, columns=["waktu", "id", "username", "teks"])
        # Return the collected tweets (the original built this frame and
        # silently discarded it).
        return df
33 |
34 |
if __name__ == "__main__":
    # Example usage — replace the Ellipsis placeholders with real Twitter
    # credentials (access token, access secret, API key, API secret) and
    # a search keyword before running.
    scrapping = TwitterScrapper(..., ..., ..., ...)
    scrapping.search('...')
--------------------------------------------------------------------------------
/dags/preprocess/stopwords.txt:
--------------------------------------------------------------------------------
1 | kah
2 | oh
3 | sebagai
4 | kami
5 | tanpa
6 | daripada
7 | sambil
8 | sementara
9 | kecuali
10 | sekitar
11 | ke
12 | ia
13 | sampai
14 | ini
15 | bisa
16 | secara
17 | untuk
18 | pula
19 | yakni
20 | dst
21 | demi
22 | walau
23 | sudah
24 | kemana
25 | ok
26 | nanti
27 | saja
28 | bahwa
29 | telah
30 | yang
31 | ada
32 | seterusnya
33 | serta
34 | dahulu
35 | saat
36 | akan
37 | itulah
38 | saya
39 | sehingga
40 | karena
41 | adalah
42 | dua
43 | toh
44 | ya
45 | sesuatu
46 | nggak
47 | tidak
48 | para
49 | kepada
50 | jika
51 | melainkan
52 | anda
53 | atau
54 | dari
55 | pun
56 | pada
57 | dsb
58 | amat
59 | begitu
60 | sebelum
61 | menurut
62 | sebab
63 | seraya
64 | hanya
65 | antara
66 | sebetulnya
67 | seperti
68 | seolah
69 | selain
70 | di
71 | namun
72 | kembali
73 | setiap
74 | ketika
75 | maka
76 | mengapa
77 | dengan
78 | selagi
79 | lagi
80 | anu
81 | agak
82 | supaya
83 | dapat
84 | tapi
85 | masih
86 | tentu
87 | pasti
88 | bagi
89 | seharusnya
90 | tentang
91 | agar
92 | boleh
93 | ingin
94 | guna
95 | tolong
96 | apalagi
97 | utk
98 | kenapa
99 | yaitu
100 | dll
101 | dulunya
102 | itu
103 | dimana
104 | sedangkan
105 | lain
106 | kita
107 | dan
108 | mereka
109 | harus
110 | belum
111 | dia
112 | tetapi
113 | sesudah
114 | mari
115 | setidaknya
116 | oleh
117 | terhadap
118 | hal
119 | apakah
120 | demikian
121 | juga
122 | bagaimanapun
123 | setelah
124 | dalam
125 |
--------------------------------------------------------------------------------
/dags/preprocess/extraction.py:
--------------------------------------------------------------------------------
"""Explode topic lists in the crawled dataset into one row per topic.

Reads final_dataset/clean_data.csv, duplicates each row once per extracted
topic, normalises the timestamp column, attaches placeholder sentiment /
source-validity labels, and writes final_dataset/result.csv.
"""
import pandas as pd
from tqdm import tqdm
from random import choice
pd.set_option("display.max_columns", None)


# NOTE(review): sentiment / validity are assigned uniformly at random below —
# presumably placeholders until a real model is wired in; confirm before use.
list_sentiment = ['positive', 'negative', 'neutral']
list_validity = ['fake', 'real']

data = pd.read_csv("final_dataset/clean_data.csv")

# Flatten: one output row per (source row, extracted topic) pair.
source, keyword, timestamp, author, content, topic = [], [], [], [], [], []
for i, row in tqdm(data.iterrows()):
    _source = row['source']
    _keyword = row['keyword']
    _timestamp = row['timestamp']
    _author = row['author']
    _content = row['clean_content']
    topics = row['topic_extract']
    # Missing topic_extract values come back from pandas as float NaN — skip.
    if isinstance(topics, float):
        continue
    topics = topics.split(", ")

    for _topic in topics:
        source.append(_source)
        keyword.append(_keyword)
        timestamp.append(_timestamp)
        author.append(_author)
        content.append(_content)
        topic.append(_topic)

data_final = pd.DataFrame({
    "source": source,
    "keyword": keyword,
    "timestamp": timestamp,
    "author": author,
    "content": content,
    "topic": topic
})

# Round-trip through a fixed string format to truncate sub-second precision
# and guarantee one uniform datetime dtype across heterogeneous inputs.
data_final['timestamp'] = pd.to_datetime(data_final['timestamp'])
data_final['timestamp'] = data_final['timestamp'].dt.strftime("%Y-%m-%d %H:%M:%S")
data_final['timestamp'] = pd.to_datetime(data_final['timestamp'], format="%Y-%m-%d %H:%M:%S")
# Placeholder labels, sampled uniformly at random per row (see NOTE above).
data_final['sentiment'] = list(map(lambda _: choice(list_sentiment), range(len(data_final))))
data_final['source_validity'] = list(map(lambda _: choice(list_validity), range(len(data_final))))
data_final.to_csv("final_dataset/result.csv", index=False)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElectionAspectAnalyzer
2 | Indonesian-AI Final Project aimed at providing in-depth insights into the 2024 election through social network analysis and sentiment assessment. We employ Social Network Analysis (SNA) and Aspect-Based Sentiment Analysis (ABSA) techniques to understand public interactions and opinions regarding the election.
3 |
4 |
5 |

6 |
7 |
8 | ## TODO's
9 |
10 | - [x] Collecting Dataset
11 | - [x] Find Open Source Model to Enhance Insight from Dataset
12 | - [x] Model Integration
13 | - [x] Dashboard Creation
14 | - [x] Internal Testing
15 | - [ ] Write Proper Project Documentation
16 |
17 | ## How To Use
18 |
19 | ```
20 | # Incoming...
21 | ```
22 |
23 | ## Project Outcome
24 | Gain a comprehensive understanding of the 2024 election in Indonesia through the use of Social Network Analysis (SNA) and Aspect-Based Sentiment Analysis (ABSA) techniques.
25 |
26 | ## Impact Outcome
27 |
28 | 1. **Understanding Public Aspect regards several Election Candidate**
29 | 2. **Assesing Public Sentiments**
30 | 3. **Proper Information using Interactive QA**
31 |
32 | ## Features
33 |
34 | - [x] SNA Graph
35 | - [x] Interactive QA
36 |
37 | ## Technique Used
38 |
39 | - Prompting: One Shot Learning.
40 | - Part-Of-Speech for Aspect Detection.
41 | - Aspect Pair Classification for Sentiment Detection per Sentence & Aspect.
42 | - Named Entity Recognition for Person or Organization Recognize in Tweets Comment.
43 |
44 | ## Tech Stack
45 |
46 | - [OpenAI](https://github.com/openai/openai-python)
47 | - [Langchain](https://python.langchain.com/docs/get_started/introduction)
48 | - [Streamlit](https://streamlit.io/)
49 | - [Streamlit Agraph](https://github.com/ChrisDelClea/streamlit-agraph)
50 |
51 | ## Contributors
52 |
53 | - [Muhammad Alif Ramadhan](https://github.com/NnA301023)
54 | - [Devandra Alandra Wicaksono](https://github.com/DevaraAlandra)
55 | - [Eko Prasetyo](https://github.com/eko-prstyw)
56 | - [Yuliana](https://github.com/yuliana4763)
57 | - [Raphon Galuh C.](https://github.com/RaphonGaluh)
58 |
59 | ## Future Improvements
60 |
61 | Continue development MVP prorotype into mentioned diagram
62 |
63 |

64 |
65 |
--------------------------------------------------------------------------------
/src/extract_enrichment.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 | from typing import Union, List, Tuple
4 |
5 |
def read_file(filename: str) -> Union[pd.DataFrame, None]:
    """Load a .csv or .xlsx file and drop rows containing any NaN.

    Args:
        filename: Path to the dataset; the extension selects the pandas reader.

    Returns:
        A NaN-free DataFrame with a fresh 0..n index, or None when the
        extension is unsupported or reading fails (the error is printed,
        not raised).
    """
    data = None
    try:
        if filename.endswith("csv"):
            data = pd.read_csv(filename)
        if filename.endswith("xlsx"):
            data = pd.read_excel(filename)
        # BUGFIX: guard against unsupported extensions — previously
        # None.dropna() raised AttributeError (silently caught below).
        if data is not None:
            data = data.dropna().reset_index(drop = True)
    except Exception as E:
        print(f"[ERROR] - {E}")
    return data
17 |
def clean_result(result: str) -> List[str]:
    """Strip the leading "label: " prefix and the [, ], " characters,
    then split the remainder on ", " into a list of items."""
    payload = result.split(": ")[-1]
    for junk in ('[', ']', '"'):
        payload = payload.replace(junk, '')
    return payload.split(", ")
20 |
def parse_result(result: str) -> Tuple[List[str], List[str], List[str]]:
    """Parse one LLM enrichment result into (aspects, entities, sentiments).

    *result* is expected to contain two non-empty lines: the first a
    list-like entity string, the second a list of "aspect (sentiment)"
    pairs. The returned aspect and sentiment lists stay index-aligned;
    malformed pairs are skipped.

    NOTE(review): if *result* has fewer than two non-empty lines,
    tokenize[1] raises IndexError, which is NOT caught here — confirm
    upstream always produces both lines.
    """
    aspect, entity, sentiment = [], [], []
    try:
        # Keep only non-empty lines; line 0 = entities, line 1 = aspects.
        tokenize = [res for res in result.split("\n") if res != ""]
        entity_raw = clean_result(tokenize[0])
        if len(entity_raw) >= 1 and entity_raw[0] != "":
            entity.extend(entity_raw)
        aspect_raw = clean_result(tokenize[1])
        if len(aspect_raw) >= 1 and aspect_raw[0] != "":
            for asp_sen in aspect_raw:
                try:
                    # Expected shape: "aspect (sentiment)".
                    asp, sen = asp_sen.split("(")
                    sen = sen.replace(")", "")
                    aspect.append(asp.strip())
                    sentiment.append(sen.strip())
                except ValueError as E:
                    # Entry lacked exactly one "(" separator — skip it.
                    pass
    except AttributeError as E:
        # result was not a string (e.g. NaN from pandas) — log and fall through.
        print(f"[ERROR] - {E}")
        print(result)
    # print(f"[ENTITY]: {entity} - [ASPECT]: {aspect} - [SENTIMENT]: {sentiment}")
    return aspect, entity, sentiment
43 |
def main():
    """CLI entry point: parse --filename, enrich the dataset, save as CSV.

    Reads the enrichment-results file, parses each 'result extraction'
    cell into entity/aspect/sentiment columns (comma-joined strings), and
    writes the augmented frame to dataset/data_clean.csv.
    """

    parser = argparse.ArgumentParser(
        description = "Read and ELT .csv file using pandas"
    )
    parser.add_argument("--filename", help = "Name of .csv file based on extracted dataset", required = True)
    args = parser.parse_args()
    filename = args.filename

    # Output location is fixed regardless of the input filename.
    path = "dataset/data_clean.csv"
    data = read_file(filename = filename)
    # Parallel accumulators; each element is a comma-joined string per row.
    aspects, entities, sentiments = [], [], []
    for result in data['result extraction']:
        aspect, entity, sentiment = parse_result(result = result)
        aspects.append(",".join(aspect))
        entities.append(",".join(entity))
        sentiments.append(",".join(sentiment))

    data["entity"] = entities
    data["aspect"] = aspects
    data["sentiment"] = sentiments

    data.to_csv(path, index = False)
    print(f'[INFO] - Save File into {path}')
68 |
if __name__ == "__main__":
    # Thin CLI wrapper; all logic lives in main().
    """
    Usage:
        python src/extract_enrichment.py --filename dataset/data_twitter_pemilu_2024_enrich.csv
    """
    main()
--------------------------------------------------------------------------------
/dags/preprocess/process.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import pandas as pd
4 | from tqdm import tqdm
5 | from enrich import (
6 | TopicGenerator, HoaxDetection,
7 | AspectDetection, KeywordExtraction
8 | )
9 | tqdm.pandas()
10 | pd.set_option("display.max_columns", None)
11 |
12 |
def clean_text(text, min_length=3):
    """Normalise a raw post for downstream NLP.

    Pipeline: strip URLs, collapse whitespace runs, keep only
    alphanumerics / dots / spaces, then drop tokens shorter than
    *min_length*. On any failure the error is printed and whatever value
    *text* currently holds is returned.
    """
    try:
        # 1) remove http/https/www URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # 2) squeeze repeated whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # 3) keep letters, digits, dots and whitespace only
        text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)
        # 4) drop short tokens
        kept = [token for token in text.split() if len(token) >= min_length]
        text = ' '.join(kept)
    except Exception as err:
        print(f'[ERROR] - {err}')
        print(text)
    return text
23 |
24 |
def integration(folder):
    """Merge every crawler CSV in *folder* into one normalised DataFrame.

    The filename prefix (before the first '_') selects the source schema
    ('news' or 'twitter'); twitter files carry no keyword column, so the
    keyword is reconstructed from the remainder of the filename. The
    result has columns [source, keyword, timestamp, author, content]
    with NaN rows dropped and a fresh index.
    """
    schema_by_source = {
        "news": ["keyword", "publish_date", "publisher", "content"],
        "twitter": ["created_at", "username", "full_text"]
    }
    normalized = ["keyword", "timestamp", "author", "content"]
    frames = []
    for fname in os.listdir(folder):
        source = fname.split("_")[0]
        wanted = schema_by_source[source]
        frame = pd.read_csv(os.path.join(folder, fname), usecols=wanted)
        # Re-select to force the declared column order.
        frame = frame[wanted]
        if len(frame.columns) != len(normalized):
            # Keyword is encoded in the filename: <source>_<keyword words>.csv
            keyword = " ".join(fname.split(".")[0].split("_")[1:]).title()
            frame.insert(0, normalized[0], [keyword] * len(frame))
        frame.columns = normalized
        frame.insert(0, 'source', source)
        frames.append(frame)
    return pd.concat(frames).dropna().reset_index(drop=True)
50 |
51 |
if __name__ == "__main__":
    """
    Usage
    ~/ElectionAspectAnalyzer/dags >>> python process.py
    """

    # Merge all crawler CSVs, then clean the raw content column.
    result_data = integration("../crawler/final_dataset")
    result_data['clean_content'] = result_data['content'].progress_apply(clean_text)

    # Hoax Detection — disabled; re-enable once HoaxDetection is ready.
    # hoax = HoaxDetection()
    # list_hoax = hoax.batch_inference(result_data['clean_content'].tolist())
    # result_data['hoax_extract'] = list_hoax

    # Keyword Extraction — disabled.
    # kw_extract = KeywordExtraction()
    # result_data['keyword_extract'] = result_data['clean_content'].progress_apply(
    #     lambda i: ", ".join(kw_extract.single_inference(i))
    # )

    # Topic generation — one comma-joined topic string per row
    # (extraction.py later splits this back on ", ").
    topic_gen = TopicGenerator()
    result_data['topic_extract'] = result_data['clean_content'].progress_apply(
        lambda i: ", ".join(topic_gen.generate_topic(i))
    )

    # Sentiment Extraction
    # ...

    result_data.to_csv("final_dataset/clean_data.csv", index=False)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # Taxonomy corpus private
163 | dags/preprocess/corpus
164 | dags/preprocess/final_dataset/result.csv
165 | dataset/result.csv
--------------------------------------------------------------------------------
/requirements-data-engineer.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.25.0
2 | aiodns==3.1.1
3 | aiohttp==3.8.6
4 | aiohttp-socks==0.8.4
5 | aiosignal==1.3.1
6 | altair==4.0.0
7 | anyio==3.7.1
8 | astor==0.8.1
9 | asttokens==2.4.0
10 | async-generator==1.10
11 | async-timeout==4.0.3
12 | attrs==21.4.0
13 | backcall==0.2.0
14 | beautifulsoup4==4.11.1
15 | blinker==1.6.3
16 | blis==0.7.11
17 | bs4==0.0.1
18 | cachetools==5.3.2
19 | catalogue==2.0.10
20 | cchardet==2.1.7
21 | certifi==2022.5.18.1
22 | cffi==1.15.0
23 | charset-normalizer==2.0.12
24 | click==8.1.7
25 | cloudpathlib==0.16.0
26 | colorama==0.4.6
27 | comm==0.1.4
28 | confection==0.1.4
29 | contourpy==1.1.1
30 | cssselect==1.2.0
31 | cycler==0.12.1
32 | cymem==2.0.8
33 | dataclasses-json==0.6.1
34 | debugpy==1.8.0
35 | decorator==5.1.1
36 | distlib==0.3.4
37 | distro==1.7.0
38 | dnspython==1.16.0
39 | duckdb==0.8.1
40 | elastic-transport==8.10.0
41 | elasticsearch==8.11.0
42 | entrypoints==0.4
43 | et-xmlfile==1.1.0
44 | exceptiongroup==1.1.3
45 | executing==2.0.0
46 | fake-useragent==1.4.0
47 | feedfinder2==0.0.4
48 | feedparser==6.0.10
49 | filelock==3.7.0
50 | fonttools==4.43.1
51 | frozenlist==1.4.0
52 | fsspec==2023.12.1
53 | future==0.18.3
54 | geographiclib==2.0
55 | geopy==2.4.1
56 | gitdb==4.0.11
57 | GitPython==3.1.40
58 | gnews==0.3.6
59 | googletransx==2.4.2
60 | greenlet==3.0.1
61 | h11==0.13.0
62 | huggingface-hub==0.19.4
63 | idna==3.3
64 | importlib-metadata==4.11.3
65 | importlib-resources==6.1.0
66 | ipykernel==6.25.2
67 | ipython==8.16.1
68 | isodate==0.6.1
69 | jedi==0.19.1
70 | jieba3k==0.35.1
71 | Jinja2==3.1.2
72 | joblib==1.3.2
73 | jsonpatch==1.33
74 | jsonpointer==2.4
75 | jsonschema==4.19.1
76 | jsonschema-specifications==2023.7.1
77 | jupyter_client==8.4.0
78 | jupyter_core==5.4.0
79 | keybert==0.8.3
80 | kiwisolver==1.4.5
81 | langcodes==3.3.0
82 | langsmith==0.0.51
83 | lxml==4.9.3
84 | markdown-it-py==3.0.0
85 | MarkupSafe==2.1.3
86 | marshmallow==3.20.1
87 | matplotlib==3.8.0
88 | matplotlib-inline==0.1.6
89 | mdurl==0.1.2
90 | mozfile==2.1.0
91 | mpmath==1.3.0
92 | multidict==6.0.4
93 | murmurhash==1.0.10
94 | mypy-extensions==1.0.0
95 | nest-asyncio==1.5.8
96 | networkx==3.2
97 | newspaper3k==0.2.8
98 | nltk==3.8.1
99 | numpy==1.26.2
100 | openai==0.27.10
101 | openpyxl==3.1.2
102 | packaging==21.3
103 | pandas==1.3.5
104 | parso==0.8.3
105 | pep517==0.12.0
106 | pickleshare==0.7.5
107 | Pillow==10.1.0
108 | pke @ git+https://github.com/boudinfl/pke.git@69871ffdb720b83df23684fea53ec8776fd87e63
109 | platformdirs==2.5.2
110 | platinfo==0.15.0
111 | preshed==3.0.9
112 | progressbar2==4.0.0
113 | prompt-toolkit==3.0.39
114 | protobuf==3.20.3
115 | psutil==5.9.6
116 | pure-eval==0.2.2
117 | pyarrow==13.0.0
118 | pycares==4.4.0
119 | pycparser==2.21
120 | pydantic==1.10.13
121 | pydeck==0.8.1b0
122 | Pygments==2.16.1
123 | pymongo==3.12.3
124 | Pympler==1.0.1
125 | pyparsing==3.0.9
126 | PySocks==1.7.1
127 | python-dateutil==2.8.2
128 | python-dotenv==0.19.2
129 | python-socks==2.4.3
130 | python-utils==3.3.0
131 | pytz==2022.1
132 | pywin32==306
133 | pywin32-ctypes==0.2.0
134 | PyYAML==6.0.1
135 | pyzmq==25.1.1
136 | rdflib==7.0.0
137 | redo==2.0.3
138 | referencing==0.30.2
139 | regex==2023.10.3
140 | requests==2.27.1
141 | requests-file==1.5.1
142 | rich==13.6.0
143 | rpds-py==0.10.6
144 | safetensors==0.4.1
145 | schedule==1.2.1
146 | scikit-learn==1.3.2
147 | scipy==1.11.3
148 | semver==3.0.2
149 | sentence-transformers==2.2.2
150 | sentencepiece==0.1.99
151 | sgmllib3k==1.0.0
152 | six==1.16.0
153 | smart-open==6.4.0
154 | smmap==5.0.1
155 | sniffio==1.2.0
156 | sortedcontainers==2.4.0
157 | soupsieve==2.3.2.post1
158 | spacy==3.7.2
159 | spacy-legacy==3.0.12
160 | spacy-loggers==1.0.5
161 | SQLAlchemy==1.4.49
162 | srsly==2.4.8
163 | stack-data==0.6.3
164 | streamlit==1.12.0
165 | streamlit-agraph==0.0.45
166 | style==1.1.0
167 | sympy==1.12
168 | tabulate==0.9.0
169 | tenacity==8.2.3
170 | thinc==8.2.1
171 | threadpoolctl==3.2.0
172 | tinysegmenter==0.3
173 | tldextract==5.1.1
174 | tokenizers==0.15.0
175 | toml==0.10.2
176 | tomli==2.0.1
177 | toolz==0.12.0
178 | torch==2.1.1
179 | torchvision==0.16.1
180 | tornado==6.3.3
181 | tqdm==4.66.1
182 | traitlets==5.11.2
183 | transformers==4.35.2
184 | twint==2.1.20
185 | typer==0.9.0
186 | typing-inspect==0.9.0
187 | typing_extensions==4.2.0
188 | tzdata==2023.3
189 | tzlocal==5.2
190 | Unidecode==1.3.7
191 | update==0.0.1
192 | urllib3==1.26.9
193 | validators==0.22.0
194 | virtualenv==20.14.1
195 | wasabi==1.1.2
196 | watchdog==3.0.0
197 | wcwidth==0.2.8
198 | weasel==0.3.4
199 | wsproto==1.1.0
200 | yarl==1.9.2
201 | zipp==3.8.0
202 |
--------------------------------------------------------------------------------
/dags/crawler/news_scrapper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pandas as pd
3 | from tqdm import tqdm
4 | from gnews import GNews
5 | from bs4 import BeautifulSoup
6 | from datetime import timedelta, date
7 | from newspaper import Config, Article
8 | from urllib.parse import unquote, urlparse
9 |
10 |
11 | # Instantiate object
12 | config = Config()
13 | config.browser_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
14 |
def parsing_rss_url(rss_url):
    """Resolve a Google News RSS redirect URL to the article's real URL.

    Fetches *rss_url*, scans its <link> tags, and extracts the target from
    either an "embed" link (first query-string value) or an "amp" link.
    Returns the decoded URL, or None when nothing could be parsed or the
    request failed.
    """
    parse_url = None
    try:
        resp = requests.get(rss_url)
        soup = BeautifulSoup(resp.content, "html.parser")
        links = soup.find_all("link")
        for link in links:
            link = link.get("href")
            # BUGFIX: <link> tags without an href return None — skip them
            # instead of letting `"embed" in None` raise a TypeError that
            # the inner except then mislabels as a parser problem.
            if link is None:
                continue
            # TODO: Implement proper logic.
            try:
                if "embed" in link:
                    parse_url = unquote(urlparse(link).query.split("=")[1])
                    break
                # NOTE(review): no break here — a later "amp" link overwrites
                # an earlier match; confirm this is intentional.
                if "amp" in link:
                    parse_url = unquote(link)
            except Exception as E:
                # BUGFIX: corrected typo in message ("Occurede" -> "Occurred").
                print(f"Error Occurred, Required Improve Link Parser: {links}")
    except Exception as E:
        print(f"Connection Broken with Error: {E}")
    return parse_url
35 |
# NOTE: This solution is generated based on @alearjun comment on Gnews Issue.
def crawling_news(keyword, start_date=date(2023, 1, 1), total_news=1000):
    """Crawl up to *total_news* Google News articles for *keyword*.

    Walks forward from *start_date* in overlapping ~one-week windows,
    resolves each RSS item to its real URL via parsing_rss_url, downloads
    and parses the article, and accumulates parallel metadata lists.

    Returns a 7-tuple of index-aligned lists: urls, titles, article texts,
    authors, publishers, descriptions, published dates.
    """
    list_url = []
    list_title = []
    list_article = []
    list_authors = []
    list_publisher = []
    list_description = []
    list_published_date = []
    n_news = 0
    # Counts skipped/failed items; crawl aborts once it exceeds max_tolerance.
    n_tolerance = 0
    google_news = GNews(language="id", country="ID")
    max_tolerance = 250
    while n_news < total_news:
        # 8-day window advanced by 7 days per pass, so windows overlap 1 day.
        scope_date = start_date + timedelta(days = 8)
        google_news.start_date = (start_date.year, start_date.month, start_date.day)
        google_news.end_date = (scope_date.year, scope_date.month, scope_date.day)
        results = google_news.get_news(keyword)
        for res in results:
            print('[INFO] - Tolerance: ', n_tolerance)
            if n_tolerance > max_tolerance:
                # Sentinel larger than any sane total_news: terminates the
                # outer while-loop as well as this for-loop.
                n_news = 10_000
                break
            print(f'Total News: {n_news}')
            url = res['url']
            url = parsing_rss_url(url)
            if url is None:
                n_tolerance += 1
                continue
            try:
                article = Article(url, config=config)
                article.download()
                article.parse()
            except Exception:
                # NOTE(review): no `continue` here — a download/parse failure
                # usually leaves `article` with empty text (filtered below),
                # but if the constructor itself raised, a stale `article`
                # from a previous iteration could slip through. Confirm.
                n_tolerance += 1
                pass
            if n_news >= total_news:
                break
            # Skip duplicates and empty/unparsed articles.
            if (
                url in list_url or
                article is None or
                len(article.text.strip()) == 0
            ):
                n_tolerance += 1
                continue
            else:
                list_url.append(url)
                list_title.append(res['title'])
                list_article.append(article.text)
                list_authors.append(", ".join(article.authors))
                list_publisher.append(res['publisher']['title'])
                list_description.append(res['description'])
                list_published_date.append(res['published date'])
                n_news += 1
        start_date += timedelta(days = 7)
    return (
        list_url, list_title, list_article,
        list_authors, list_publisher, list_description,
        list_published_date
    )
96 |
if __name__ == "__main__":

    # Define List of keywords — one crawl (and one output CSV) per candidate.
    list_keywords = [
        "Anies Baswedan",
        "Muhaimin Iskandar",
        "Ganjar Pranowo",
        "Mahfud MD",
        "Gibran Rakabuming",
        "Prabowo Subianto"
    ]
    for keyword in tqdm(list_keywords, desc="Crawling Google News..."):
        # Per-keyword accumulators, reset each iteration.
        keywords = []
        urls, titles, contents, authors, publisher, description, publish_date = [], [], [], [], [], [], []
        (
            list_url, list_title, list_article,
            list_authors, list_publisher,
            list_description, list_published_date
        ) = crawling_news(keyword=keyword)
        urls.extend(list_url)
        titles.extend(list_title)
        contents.extend(list_article)
        authors.extend(list_authors)
        publisher.extend(list_publisher)
        description.extend(list_description)
        publish_date.extend(list_published_date)
        # Repeat the keyword once per crawled URL so all columns stay aligned.
        keywords.extend([keyword] * len(list_url))

        # Check dimension — all lengths must match for the DataFrame below.
        print(
            len(urls), len(titles), len(contents), len(authors),
            len(publisher), len(description), len(publish_date)
        )

        # Save into csv
        df = pd.DataFrame({
            "keyword": keywords,
            "url": urls,
            "title": titles,
            "content": contents,
            "author": authors,
            "publisher": publisher,
            "description": description,
            "publish_date": publish_date
        })
        df.to_csv(f"result_news_{keyword.replace(' ', '_')}.csv", index=False)
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import openai
2 | import pandas as pd
3 | import streamlit as st
4 | from src.secret import OPENAI_KEY
5 | from typing import Tuple, List, Dict
6 | from streamlit_agraph import agraph, Node, Edge, Config
7 | openai.api_key = OPENAI_KEY
8 | st.set_page_config(layout = "wide")
9 |
@st.cache
def load_clean_data(dataset: str = "dataset/result.csv") -> pd.DataFrame:
    """Load the enriched dataset, dedupe, and balance rows per candidate.

    Deduplicates on (keyword, author, topic), normalises the 'Mahfud Md'
    spelling, then samples 100 rows per keyword WITH replacement so each
    candidate is equally represented in the graph.
    """
    data = pd.read_csv(dataset)
    data = data.drop_duplicates(subset=['keyword', 'author', 'topic'])
    data['keyword'] = data["keyword"].replace("Mahfud Md", "Mahfud MD")
    # data = data.sample(500).reset_index(drop=True)
    # replace=True lets keywords with fewer than 100 rows be upsampled.
    data = pd.concat([df.sample(100, replace=True) for _, df in data.groupby("keyword")])
    return data
18 |
@st.cache
def create_graph(data_filter: pd.DataFrame, use_sentiment_aspect: bool = False, mapping_sentiment: Dict[str, str] = {
    "positive" : "#B3FFAE", "negative" : "#FF7D7D", "neutral" : "#F8FFDB"
}) -> Tuple[List[str], List[str]]:
    """Build agraph Node/Edge lists linking candidates, authors, and topics.

    Candidates (the 'keyword' column) become large diamond nodes, authors
    mid-size green nodes, and — when use_sentiment_aspect is True — topics
    become small nodes coloured by sentiment via mapping_sentiment.

    NOTE: mapping_sentiment is a mutable default argument; safe only
    because the function never mutates it.

    (Removed ~25 lines of dead commented-out code from an earlier
    entity/aspect-based version of this function.)
    """
    aspect_global = []
    nodes, edges = [], []
    candidates, authors = [], []
    for _, i in data_filter.iterrows():
        candidate = i['keyword']
        author = i['author']
        if author != "" and isinstance(author, str):
            if candidate not in candidates:
                nodes.append(Node(id = candidate, label = candidate, symbolType = "diamond", color = "#FFF6F6", size = 25))
                candidates.append(candidate)
            elif author not in authors:
                # NOTE(review): because of this elif, the author node is NOT
                # created on the same row that introduces a new candidate,
                # though the edge below still references it — confirm intended.
                nodes.append(Node(id = author, label = author, symbolType = "diamond", color = "#A7D397", size = 15))
                authors.append(author)
            # Edge is added for every valid row, even when neither endpoint
            # node was created this iteration.
            edges.append(Edge(source = author, target = candidate))
        if use_sentiment_aspect:
            sentiment = i['sentiment']
            aspect = i['topic']
            if aspect != "" and isinstance(aspect, str) and aspect not in aspect_global:
                nodes.append(Node(id = aspect, label = aspect, size = 10, color = mapping_sentiment.get(sentiment)))
                edges.append(Edge(source = aspect, target = author, label = sentiment))
                aspect_global.append(aspect)
    return nodes, edges
71 |
def prompt_qa(data: pd.DataFrame, query: str) -> str:
    """Compose a Bahasa-Indonesia one-shot QA prompt over the dataframe rows.

    The dataframe is inlined as a list of record dicts, followed by a fixed
    example question/answer pair and the user's `query`.
    """
    records = data.to_dict('records')
    return f"""
    data = {records}

    jawaban pertanyaan berikut berdasarkan informasi diatas yang diolah sesuai reasoning yang masuk akal.
    pertanyaan: Siapa Pemenang pemilu 2024?
    jawaban: Ganjar Pranowo

    pertanyaan: {query}

    dengan format dibawah:
    jawaban:
    """
87 |
88 |
def agent_qa_zero_shot(data: pd.DataFrame, query: str, model_base: str = "gpt-3.5-turbo-16k"):
    """Answer `query` over `data` with a single zero-shot ChatCompletion call.

    Parameters
    ----------
    data : dataframe serialized into the prompt via prompt_qa().
    query : the user's question.
    model_base : OpenAI chat model name.

    Returns
    -------
    (answer_text, total_tokens); on any API failure returns ("", 0) so the
    Streamlit UI degrades gracefully instead of crashing.
    """
    token_usage = 0
    response_extraction = ""
    try:
        response = openai.ChatCompletion.create(
            model = model_base,
            messages = [{"role" : "user", "content" : prompt_qa(data, query)}],
            temperature = 0.5, max_tokens = 512, top_p = 1.0,
            frequency_penalty = 0.0, presence_penalty = 0.0
        )
        response_extraction = response["choices"][0]["message"]["content"]
        token_usage = response["usage"]["total_tokens"]
    except Exception as error:
        # Best-effort: log and fall through to the empty defaults. The old
        # "Retry with Recursive Func" message was misleading — the recursive
        # retry was commented out, so no retry ever happened.
        print(f"[ERROR] - {error}")
    return response_extraction, token_usage
106 |
107 |
def app(data: pd.DataFrame, config: Config):
    """Render the Streamlit dashboard: candidate filter, relation graph, QnA box.

    Parameters
    ----------
    data : enriched tweet dataframe; expected to contain 'keyword' and
        'author' columns (plus 'sentiment'/'topic' for the aspect view).
    config : streamlit-agraph display configuration.
    """

    # Interface section
    st.sidebar.header("ElectionAspectAnalyzer v.0.1")

    # Sidebar section
    candidates = data["keyword"].unique().tolist()
    filter_candidate = st.sidebar.multiselect(
        "Select Candidates:",
        options = candidates,
        default = candidates[:3]
    )
    filter_data = data[data['keyword'].isin(filter_candidate)].reset_index(drop = True)
    use_aspect_sentiment = st.sidebar.checkbox("Use Aspect-Sentiment")

    # Graph section
    with st.spinner("Preprocess Data..."):
        filter_node, filter_edge = create_graph(filter_data, use_sentiment_aspect = use_aspect_sentiment)
        st.success("Total Nodes Loaded: " + str(len(filter_node)))
        return_value = agraph(
            nodes = filter_node,
            edges = filter_edge,
            config = config
        )

    # QnA section
    # NOTE: Reduce token usage OpenAI Cost :)
    query = st.sidebar.text_input(label = "Any Question about Election 2024?")
    if query != "":
        # Sample at most 3 rows per candidate. min() guards groups smaller
        # than 3 rows — df.sample(3, replace=False) raises ValueError on them —
        # and the emptiness check guards pd.concat([]) (raises when the user
        # deselects every candidate). Sampling now only runs when a query is
        # actually asked.
        groups = [df.sample(min(3, len(df)), replace = False) for _, df in filter_data.groupby("keyword")]
        if groups:
            data_sample = pd.concat(groups)
            response, _ = agent_qa_zero_shot(data = data_sample, query = query)
            st.sidebar.success(response)
        else:
            st.sidebar.warning("Select at least one candidate to ask questions.")
140 |
if __name__ == "__main__":
    # Graph rendering configuration for streamlit-agraph.
    config = Config(
        width = 1000, height = 500,
        directed = True, physics = True, hierarchical = False
    )
    # Load the cleaned (and per-keyword resampled) dataset, then start the app.
    data = load_clean_data()
    app(data = data, config = config)
--------------------------------------------------------------------------------
/dags/preprocess/enrich.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | from tqdm import tqdm
4 | from keybert import KeyBERT
5 | from typing import List, Tuple
6 | from nltk.corpus import stopwords
7 | from warnings import filterwarnings
8 | from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
9 | filterwarnings("ignore")
10 |
11 |
class TopicGenerator:
    """Match taxonomy noun terms in text and map them to aspect categories.

    Loads two taxonomy sheets ("General ID" and "Government ID") from an
    Excel corpus; generate_topic() returns the distinct categories whose
    attribute nouns (single words or multi-word phrases) appear in the text.
    """

    def __init__(self, corpus_path: str = "corpus/taxo.xlsx"):
        self.corpus_path = corpus_path
        taxonomy_general = self._load_taxonomy("General ID")
        taxonomy_specific = self._load_taxonomy("Government ID")
        self.attr_noun = pd.concat([taxonomy_specific, taxonomy_general])
        # Precompute the single/phrase term tables and their regexes once.
        # They depend only on the taxonomy, so recomputing them on every
        # generate_topic() call (as the previous version did) was pure overhead.
        single_terms, phrase_terms = self._process_term(self.attr_noun['attr_noun'])
        self._single = self.attr_noun[self.attr_noun['attr_noun'].isin(single_terms)].reset_index(drop=True)
        self._phrase = self.attr_noun[self.attr_noun['attr_noun'].isin(phrase_terms)].reset_index(drop=True)
        self._list_single = self._single['attr_noun'].str.lower().tolist()
        self._list_phrase = self._phrase['attr_noun'].str.lower().tolist()
        self._regex_single = self._apply_regex(self._list_single)
        self._regex_phrase = self._apply_regex(self._list_phrase)

    def _load_taxonomy(self, sheet_name: str) -> pd.DataFrame:
        """Load one taxonomy sheet restricted/normalized to (category, attr_noun)."""
        required_columns = ['category', 'attr_noun']
        normalize_columns = ['category', 'attr_noun']
        return self._process_corpus(sheet_name, required_columns, normalize_columns)

    def _process_corpus(self, sheet_name: str, required_columns: List[str], normalize_columns: List[str]) -> pd.DataFrame:
        """Read the corpus sheet, rename columns and drop NaNs/duplicates."""
        data = pd.read_excel(self.corpus_path, usecols=required_columns, sheet_name=sheet_name)
        data.columns = normalize_columns
        return data.dropna().drop_duplicates().reset_index(drop=True)

    @staticmethod
    def _apply_regex(phrase: List[str]) -> str:
        """Build a word-bounded alternation regex over the escaped terms."""
        return r"\b(?:" + "|".join(map(re.escape, phrase)) + r")\b"

    @staticmethod
    def _process_term(list_term: List[str]) -> Tuple[List[str], List[str]]:
        """Split terms into single words vs multi-word phrases.

        Non-string entries (e.g. NaN floats) raise AttributeError on .split()
        and are skipped; empty strings are skipped too.
        """
        single_term, phrase_term = [], []
        for word in list_term:
            try:
                if word == "":
                    continue
                if len(word.split()) == 1:
                    single_term.append(word)
                else:
                    phrase_term.append(word)
            except AttributeError:
                continue
        return single_term, phrase_term

    def generate_topic(self, content: str) -> List[str]:
        """Return the distinct taxonomy categories mentioned in `content`.

        Phrase matches take precedence: a word already consumed by a phrase
        match is removed from the single-word matches.
        """
        content = content.lower()
        aspect_term, aspect_category = [], []
        # Guard empty term lists: an empty alternation regex (r"\b(?:)\b")
        # would otherwise match at every word boundary.
        phrase_noun = re.findall(self._regex_phrase, content) if self._list_phrase else []
        single_noun = re.findall(self._regex_single, content) if self._list_single else []
        for phrase in phrase_noun:
            for word in phrase.split():
                if word in single_noun:
                    single_noun.remove(word)
            if phrase not in aspect_term:
                aspect_term.append(phrase)
                aspect_category.append(self._phrase['category'][self._list_phrase.index(phrase)])
        for word in single_noun:
            if word not in aspect_term:
                aspect_term.append(word)
                aspect_category.append(self._single['category'][self._list_single.index(word)])

        # Deterministic, order-preserving dedupe (list(set(...)) had arbitrary order).
        return list(dict.fromkeys(aspect_category))
77 |
78 |
class AspectDetection:
    """Aspect-based sentiment classifier for Indonesian text (BERT pair input)."""

    def __init__(self, model="mdhugol/indonesia-bert-sentiment-classification", task = "text-classification"):
        # Load tokenizer and fine-tuned head explicitly so the pipeline reuses them.
        self.pipe = pipeline(
            task,
            model=AutoModelForSequenceClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
        )
        # Raw pipeline labels -> Indonesian sentiment names.
        self.label_index = {'LABEL_0': 'positif', 'LABEL_1': 'netral', 'LABEL_2': 'negatif'}

    def preprocess(self, content, aspect_category):
        """Format the (text, aspect) pair in the layout the model expects."""
        return f'[CLS] {content} [ASP] {aspect_category} [ASP]'

    def single_inference(self, content, aspect_category):
        """Classify the sentiment of `content` toward `aspect_category`.

        Returns a list of mapped sentiment labels (one per pipeline prediction).
        """
        predictions = self.pipe(self.preprocess(content, aspect_category))
        return [self.label_index[pred['label']] for pred in predictions]
94 |
95 |
class HoaxDetection:
    """Hoax/fact classifier for Indonesian news content."""

    def __init__(self, model="khavitidala/xlmroberta-large-fine-tuned-indo-hoax-classification", task = "text-classification"):
        # Truncate to the model's 512-token window so long articles don't error out.
        self.pipe = pipeline(task, model=model, max_length=512, truncation=True)

    def batch_inference(self, contents):
        """Classify every item in `contents`; returns one label per prediction."""
        labels = []
        for text in tqdm(contents):
            predictions = self.pipe(text)
            labels.extend(pred['label'] for pred in predictions)
        return labels
108 |
class KeywordExtraction:
    """KeyBERT unigram keyword extraction with combined Indonesian stopwords."""

    def __init__(self):
        self.extract = KeyBERT(model="indobenchmark/indobert-base-p1")
        # Merge NLTK's Indonesian stopwords with a Twitter-specific list
        # fetched from the IndoTWEEST repository.
        url = 'https://raw.githubusercontent.com/Braincore-id/IndoTWEEST/main/stopwords_twitter.csv'
        twitter_stopwords = pd.read_csv(url, names=['stopword'])['stopword'].unique().tolist()
        combined = set(stopwords.words('indonesian')) | set(twitter_stopwords)
        self.stopwords = list(combined)

    def single_inference(self, content):
        """Return the keyword strings (scores dropped) extracted from `content`."""
        keywords = self.extract.extract_keywords(
            content, keyphrase_ngram_range = (1, 1),
            stop_words=self.stopwords
        )
        return [keyword for keyword, _score in keywords]
126 |
127 |
128 |
if __name__ == "__main__":

    # Smoke test / usage demo of each enrichment component. The pipeline's
    # output columns are summarized below.
    """
    Aspect Category | Aspect Sentiment | Keywords (top 5) | Hoax Classification
    """
    # Define example content
    content = "VIVA Politik Anies Baswedan akan kembali melakukan safari politik seluruh Tanah Air. Akhir Januari 2023 ini Anies akan mengunjungi NTB. Beberapa wilayah Pulau Lombok dan Sumbawa telah disiapkan. Bakal calon presiden yang diusung Partai NasDem Anies Baswedan dijadwalkan akan mengunjungi Nusa Tenggara Barat pada 3031 Januari 2023 mendatang. Agendanya satu hari Pulau Lombok dan satu hari Pulau Sumbawa kata Ketua Panitia yang juga Ketua DPD Partai NasDem Lombok Timur Rumaksi Kantor DPW Partai NasDem NTB Kota Mataram Jumat dikutip dari Antara. mengatakan dalam kunjungannya NTB Anies Baswedan dijadwalkan akan bertemu dengan sejumlah tokoh agama tokoh masyarakat pemuda dan relawan dari lintas agama baik yang ada Pulau Lombok dan Pulau Sumbawa. Selain bertemu dengan tokoh lintas agama dan relawan. Anies Baswedan juga akan mengunjungi Desa Wisata Sade Lombok Tengah. Kemudian juga Pondok Pesantren Yatofa Bodak Lombok Tengah. Ponpes Yatofa Pak Anies akan bersilaturahmi dan melakukan pengajian bersama pimpinan pondok pesantren dan masyarakat ujarnya. Setelah itu dilanjutkan dengan mengunjungi peternak sapi Desa Wanaseba Kabupaten Lombok Timur. Tidak hanya itu bakal calon presiden Partai NasDem itu juga akan melaksanakan shalat berjemaah Masjid Jamik Masbagik Lombok Timur. Setelah dari Masjid Masbagik Pak Anies akan diarak pakai kuda menuju Lapangan Gotong Royong Masbagik. Pak Anies akan mengukuhkan pengurus ranting Partai Nasdem sePulau Lombok. Dari situ melanjutkan perjalanan Lombok Barat untuk silaturahmi dengan tokoh agama lintas agama bersama Bupati Lombok Barat selaku Ketua Dewan Pakar Partai NasDem terang Rumaksi. Kemudian pada Januari Anies Baswedan akan terbang Pulau Sumbawa. Pulau Sumbawa Anies Baswedan akan mengunjungi Kota Bima. Setelah itu akan bergerak menuju Kabupaten Sumbawa. Jadi kegiatan Pulau Sumbawa juga sama dengan yang ada Lombok ucapnya didampingi anggota DPR dapil NTB dari Fraksi Partai NasDem Syamsul Luthfi Ketua DPD NasDem Lombok Tengah Syamsul Hadi Ketua DPD NasDem Lombok Barat Tarmizi dan Ketua DPD NasDem Kota Bima Muthmainnah. Menurut dia panitia daerah siap mengawal kunjungan mantan Gubernur DKI Jakarta tersebut selama mengunjungi NTB bahkan DPW Partai NasDem memastikan bahwa kedatangan bakal calon presiden Anies Baswedan NTB akan berjalan aman dan kondusif. Karena itu pihaknya mempersilakan bagi siapapun khususnya tim relawan untuk melakukan komunikasi dengan DPW Partai NasDem terkait rencana kedatangan Anies Baswedan NTB. Khusus saat pertemuan Ponpes Yatofa akan ada kejutan yang disampaikan. Cuma apa kejutan itu nanti disampaikan saat Pak Anies Baswedan datang NTB katanya. Ant"

    # Example usage of TopicGenerator class
    # NOTE(review): requires corpus/taxo.xlsx to be present — confirm path.
    generator = TopicGenerator()
    topics = generator.generate_topic(content)
    print(topics)
    # >>> ['Community', 'Institution', 'Public Figure', 'Leader', 'Transportation', 'Networking', 'Activities', 'Spokesperson', 'Social Media', 'Product Launch', 'Public Rally', 'Volunteer', 'Public Opinion', 'Election', 'Tourism', 'Geographical', 'Political Parties']

    # Example usage of HoaxDetection class (downloads the HF model on first run)
    hoax_detection = HoaxDetection()
    result = hoax_detection.batch_inference([content])
    print(result)
    # >>> ['Fakta']

    # Example usage of AspectDetection class — one sentiment per detected topic
    aspect_detection = AspectDetection()
    for topic in topics:
        res = aspect_detection.single_inference(content, topic)
        print(f"{topic} - {res}")
    # >>> Institution - ['netral']
    # >>> Networking - ['netral']
    # >>> Public Rally - ['netral']

    # Example usage of KeywordExtraction class
    keyword_extraction = KeywordExtraction()
    result = keyword_extraction.single_inference(content)
    print(result)
    # >>> ['agendanya', 'anies', 'nasdem', 'sumbawa', 'ntb']
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/notebook/enrichment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "```\n",
8 | "Proof on Concept:\n",
9 | "\n",
10 | "Twitter Post (1 baris):\n",
11 | "\n",
12 | " (ideal) \n",
13 | " - Filtrasi keyword (udah ada di dalam dataset)\n",
14 | " - Ekstraksi Person (by PoS atau @)\n",
15 | " - Ekstraksi Aspect (by Noun dari NER / PoS / KBBI)\n",
16 | " - Generate Sentiment (per aspect terdeteksi) -> translate ke bahasa inggris (bert pair cls)\n",
17 | " \n",
18 | " (mvp)\n",
19 | " - utilize openAI at all. :D\n",
20 | "\n",
21 | "Expected Result (tabular format)\n",
22 | "\n",
23 | "source data:\n",
24 | "| name | tweets | re-tweets | ... |\n",
25 | "\n",
26 | "result enrichment:\n",
27 | "| name | tweets | re-tweets | Person / Organization (NER) | Aspect - Sentiment (ABSA) | Topic - (input by user)\n",
28 | "```"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 36,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Load libraries \n",
38 | "import os\n",
39 | "import re\n",
40 | "import time\n",
41 | "import openai \n",
42 | "import pandas as pd \n",
43 | "from tqdm import tqdm\n",
44 | "from typing import Tuple\n",
45 | "from dotenv import load_dotenv\n",
46 | "\n",
47 | "\n",
48 | "load_dotenv()\n",
49 | "pd.set_option(\"display.max_columns\", None)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 11,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# Setting credentials\n",
59 | "OPENAI_KEY = os.getenv(\"OPENAI_API_KEY\", default = None) \n",
60 | "openai.api_key = OPENAI_KEY"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 37,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "\n",
72 | "\n",
85 | "
\n",
86 | " \n",
87 | " \n",
88 | " | \n",
89 | " name | \n",
90 | " text | \n",
91 | " rt | \n",
92 | " id | \n",
93 | "
\n",
94 | " \n",
95 | " \n",
96 | " \n",
97 | " | 0 | \n",
98 | " prabowo | \n",
99 | " Megawati Soekarnoputri, diyakini akan menjadik... | \n",
100 | " 0 | \n",
101 | " 1552261054964461568 | \n",
102 | "
\n",
103 | " \n",
104 | " | 1 | \n",
105 | " prabowo | \n",
106 | " Diremehkan, Citra Pak @prabowo menjadi terting... | \n",
107 | " 3 | \n",
108 | " 1551415694738313216 | \n",
109 | "
\n",
110 | " \n",
111 | " | 2 | \n",
112 | " prabowo | \n",
113 | " Dulu Tuhan disuruh menangin Prabowo atau kagak... | \n",
114 | " 0 | \n",
115 | " 1551415694738313216 | \n",
116 | "
\n",
117 | " \n",
118 | " | 3 | \n",
119 | " prabowo | \n",
120 | " @SantorinisSun Loh miss valak masih menyembah ... | \n",
121 | " 0 | \n",
122 | " 1551415694738313216 | \n",
123 | "
\n",
124 | " \n",
125 | " | 4 | \n",
126 | " prabowo | \n",
127 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
128 | " 39 | \n",
129 | " 1552234605419237376 | \n",
130 | "
\n",
131 | " \n",
132 | "
\n",
133 | "
"
134 | ],
135 | "text/plain": [
136 | " name text rt \\\n",
137 | "0 prabowo Megawati Soekarnoputri, diyakini akan menjadik... 0 \n",
138 | "1 prabowo Diremehkan, Citra Pak @prabowo menjadi terting... 3 \n",
139 | "2 prabowo Dulu Tuhan disuruh menangin Prabowo atau kagak... 0 \n",
140 | "3 prabowo @SantorinisSun Loh miss valak masih menyembah ... 0 \n",
141 | "4 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
142 | "\n",
143 | " id \n",
144 | "0 1552261054964461568 \n",
145 | "1 1551415694738313216 \n",
146 | "2 1551415694738313216 \n",
147 | "3 1551415694738313216 \n",
148 | "4 1552234605419237376 "
149 | ]
150 | },
151 | "execution_count": 37,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "# Load dataset\n",
158 | "data = pd.read_csv(\"../dataset/data_twitter_pemilu_2024.csv\")\n",
159 | "data.head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 38,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | "False 625\n",
171 | "True 160\n",
172 | "Name: count, dtype: int64"
173 | ]
174 | },
175 | "execution_count": 38,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# Data Duplicate checking\n",
182 | "data.duplicated(subset = ['text', 'id', 'rt']).value_counts()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 39,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/html": [
193 | "\n",
194 | "\n",
207 | "
\n",
208 | " \n",
209 | " \n",
210 | " | \n",
211 | " name | \n",
212 | " text | \n",
213 | " rt | \n",
214 | " id | \n",
215 | "
\n",
216 | " \n",
217 | " \n",
218 | " \n",
219 | " | 10 | \n",
220 | " prabowo | \n",
221 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
222 | " 39 | \n",
223 | " 1552234605419237376 | \n",
224 | "
\n",
225 | " \n",
226 | " | 15 | \n",
227 | " prabowo | \n",
228 | " Kapolri Jenderal Listyo Sigit Prabowo mengatak... | \n",
229 | " 1 | \n",
230 | " 1552476373855244289 | \n",
231 | "
\n",
232 | " \n",
233 | " | 21 | \n",
234 | " prabowo | \n",
235 | " Ngopi daring tayang siang ini di Youtube @kemh... | \n",
236 | " 2 | \n",
237 | " 1551476092749447168 | \n",
238 | "
\n",
239 | " \n",
240 | " | 22 | \n",
241 | " prabowo | \n",
242 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
243 | " 39 | \n",
244 | " 1552234605419237376 | \n",
245 | "
\n",
246 | " \n",
247 | " | 23 | \n",
248 | " prabowo | \n",
249 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
250 | " 39 | \n",
251 | " 1552234605419237376 | \n",
252 | "
\n",
253 | " \n",
254 | " | 33 | \n",
255 | " prabowo | \n",
256 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
257 | " 39 | \n",
258 | " 1552234605419237376 | \n",
259 | "
\n",
260 | " \n",
261 | " | 37 | \n",
262 | " prabowo | \n",
263 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
264 | " 39 | \n",
265 | " 1552234605419237376 | \n",
266 | "
\n",
267 | " \n",
268 | " | 42 | \n",
269 | " prabowo | \n",
270 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
271 | " 39 | \n",
272 | " 1552234605419237376 | \n",
273 | "
\n",
274 | " \n",
275 | " | 45 | \n",
276 | " prabowo | \n",
277 | " Catat nih, Pak Prabowo menduduki tempat dipunc... | \n",
278 | " 2 | \n",
279 | " 1551415694880677891 | \n",
280 | "
\n",
281 | " \n",
282 | " | 49 | \n",
283 | " prabowo | \n",
284 | " Yth bapak Presiden republik Indonesia Ir Haji ... | \n",
285 | " 39 | \n",
286 | " 1552234605419237376 | \n",
287 | "
\n",
288 | " \n",
289 | "
\n",
290 | "
"
291 | ],
292 | "text/plain": [
293 | " name text rt \\\n",
294 | "10 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
295 | "15 prabowo Kapolri Jenderal Listyo Sigit Prabowo mengatak... 1 \n",
296 | "21 prabowo Ngopi daring tayang siang ini di Youtube @kemh... 2 \n",
297 | "22 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
298 | "23 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
299 | "33 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
300 | "37 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
301 | "42 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
302 | "45 prabowo Catat nih, Pak Prabowo menduduki tempat dipunc... 2 \n",
303 | "49 prabowo Yth bapak Presiden republik Indonesia Ir Haji ... 39 \n",
304 | "\n",
305 | " id \n",
306 | "10 1552234605419237376 \n",
307 | "15 1552476373855244289 \n",
308 | "21 1551476092749447168 \n",
309 | "22 1552234605419237376 \n",
310 | "23 1552234605419237376 \n",
311 | "33 1552234605419237376 \n",
312 | "37 1552234605419237376 \n",
313 | "42 1552234605419237376 \n",
314 | "45 1551415694880677891 \n",
315 | "49 1552234605419237376 "
316 | ]
317 | },
318 | "execution_count": 39,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "# Overview duplicated data\n",
325 | "data[data.duplicated(subset = ['text', 'id', 'rt'])].head(10)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 40,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "# Duplicate data filtering\n",
335 | "data = data.drop_duplicates(subset = ['text', 'id', 'rt'])"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 41,
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "data": {
345 | "text/plain": [
346 | "False 625\n",
347 | "Name: count, dtype: int64"
348 | ]
349 | },
350 | "execution_count": 41,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "# Data Duplicate checking - validation\n",
357 | "data.duplicated(subset = ['text', 'id', 'rt']).value_counts()"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 42,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "# Define prompt and ingestion script\n",
367 | "def prompt_enrichment(tweet_comment: str) -> str:\n",
368 | " prompt = \\\n",
369 | " f\"\"\"\n",
370 | " Ekstraksi informasi yang dibutuhkan berdasarkan komentar twitter dibawah, dengan response cukup sesuai yang di definisikan tanpa penjelasan tambahan.\n",
371 | "\n",
372 | " komentar_twitter: \"{tweet_comment}\"\n",
373 | "\n",
374 | " Untuk response cukup isi dengan format dibawah.\n",
375 | " named_entity_recognition: [Jawaban anda: cakupan NER sesuai label \"PERSON\" atau \"ORGANIZATION\" saja]\n",
376 | " aspect_sentiment: [Identifikasi verb / noun-phrase hasil dari part-of-speech di dalam komentar, disertai dengan nilai sentiment masing-masing aspect dengan format ]\n",
377 | " \"\"\"\n",
378 | " return prompt\n",
379 | "\n",
380 | "def ingest_openai(tweet_comment: str, model_base: str = \"gpt-3.5-turbo\") -> Tuple[str, int]: \n",
381 | "    token_usage = 0\n",
382 | "    response_extraction = \"\"\n",
383 | "    try:\n",
384 | "        response = openai.ChatCompletion.create(\n",
385 | "            model = model_base, \n",
386 | "            messages = [{\"role\" : \"user\", \"content\" : prompt_enrichment(tweet_comment)}], \n",
387 | "            temperature = 0.1, max_tokens = 512, top_p = 1.0, \n",
388 | "            frequency_penalty = 0.0, presence_penalty = 0.0\n",
389 | "        )\n",
390 | "        response_extraction = response[\"choices\"][0][\"message\"][\"content\"]\n",
391 | "        token_usage = response[\"usage\"][\"total_tokens\"]\n",
392 | "    except Exception as E:\n",
393 | "        print(f\"[ERROR] - {E}\")\n",
394 | "        print(\"Retry with Recursive Func\")\n",
395 | "        time.sleep(5)\n",
396 | "        return ingest_openai(tweet_comment = tweet_comment, model_base = model_base)  # propagate retry result; keep caller's model\n",
397 | "    return response_extraction, token_usage"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 45,
403 | "metadata": {},
404 | "outputs": [
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "[COMMENT]\n",
410 | "Puan tak masalah bahkan Ganjar jadi salah satu bacapres. Waktunya Puan Maharani\n",
411 | "[RESULT - Token Usage: 216]\n",
412 | "named_entity_recognition: [Puan, Ganjar, Puan Maharani]\n",
413 | "aspect_sentiment: [Puan (positive), Ganjar (positive), bacapres (positive), Waktunya Puan Maharani (neutral)]\n"
414 | ]
415 | }
416 | ],
417 | "source": [
418 | "# Test ingestion\n",
419 | "comment = data['text'].sample(1).values[0]\n",
420 | "extraction, token_usage = ingest_openai(tweet_comment = comment)\n",
421 | "print(f\"[COMMENT]\\n{comment}\\n[RESULT - Token Usage: {token_usage}]\\n{extraction}\")"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 46,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "# Apply on entire dataset\n",
431 | "final_result_extraction, final_token_usage = [], []"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 48,
437 | "metadata": {},
438 | "outputs": [
439 | {
440 | "name": "stderr",
441 | "output_type": "stream",
442 | "text": [
443 | "Ingestion Start: 13%|█▎ | 84/625 [10:42<46:56, 5.21s/it] "
444 | ]
445 | },
446 | {
447 | "name": "stdout",
448 | "output_type": "stream",
449 | "text": [
450 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
451 | "Retry with Recursive Func\n"
452 | ]
453 | },
454 | {
455 | "name": "stderr",
456 | "output_type": "stream",
457 | "text": [
458 | "Ingestion Start: 25%|██▌ | 158/625 [28:13<33:08, 4.26s/it] "
459 | ]
460 | },
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "[ERROR] - The server is overloaded or not ready yet.\n",
466 | "Retry with Recursive Func\n"
467 | ]
468 | },
469 | {
470 | "name": "stderr",
471 | "output_type": "stream",
472 | "text": [
473 | "Ingestion Start: 45%|████▌ | 284/625 [40:18<22:14, 3.91s/it] "
474 | ]
475 | },
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
481 | "Retry with Recursive Func\n"
482 | ]
483 | },
484 | {
485 | "name": "stderr",
486 | "output_type": "stream",
487 | "text": [
488 | "Ingestion Start: 61%|██████ | 379/625 [59:05<35:12, 8.59s/it] "
489 | ]
490 | },
491 | {
492 | "name": "stdout",
493 | "output_type": "stream",
494 | "text": [
495 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
496 | "Retry with Recursive Func\n"
497 | ]
498 | },
499 | {
500 | "name": "stderr",
501 | "output_type": "stream",
502 | "text": [
503 | "Ingestion Start: 61%|██████ | 380/625 [1:09:17<12:53:12, 189.36s/it]"
504 | ]
505 | },
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "[ERROR] - HTTP code 502 from API (\n",
511 | "502 Bad Gateway\n",
512 | "\n",
513 | "502 Bad Gateway
\n",
514 | "
cloudflare\n",
515 | "\n",
516 | "\n",
517 | ")\n",
518 | "Retry with Recursive Func\n"
519 | ]
520 | },
521 | {
522 | "name": "stderr",
523 | "output_type": "stream",
524 | "text": [
525 | "Ingestion Start: 100%|██████████| 625/625 [1:38:27<00:00, 9.45s/it] \n"
526 | ]
527 | }
528 | ],
529 | "source": [
530 | "# Iter and push into array\n",
531 | "for comment in tqdm(data[\"text\"], desc = \"Ingestion Start\"):\n",
532 | " result, token = ingest_openai(tweet_comment = comment)\n",
533 | " final_result_extraction.append(result)\n",
534 | " final_token_usage.append(token)"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 49,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "# Assign result into dataframe\n",
544 | "data['result extraction'] = final_result_extraction\n",
545 | "data['token usage'] = final_token_usage"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 50,
551 | "metadata": {},
552 | "outputs": [],
553 | "source": [
554 | "# Save into dataframe\n",
555 | "data.to_csv(\"../dataset/data_twitter_pemilu_2024_enrich.csv\", index = False)"
556 | ]
557 | }
558 | ],
559 | "metadata": {
560 | "kernelspec": {
561 | "display_name": "Python 3 (ipykernel)",
562 | "language": "python",
563 | "name": "python3"
564 | },
565 | "language_info": {
566 | "codemirror_mode": {
567 | "name": "ipython",
568 | "version": 3
569 | },
570 | "file_extension": ".py",
571 | "mimetype": "text/x-python",
572 | "name": "python",
573 | "nbconvert_exporter": "python",
574 | "pygments_lexer": "ipython3",
575 | "version": "3.9.7"
576 | }
577 | },
578 | "nbformat": 4,
579 | "nbformat_minor": 2
580 | }
581 |
--------------------------------------------------------------------------------
/dags/crawler/final_dataset/twitter_prabowo_subianto.csv:
--------------------------------------------------------------------------------
1 | created_at,id_str,full_text,quote_count,reply_count,retweet_count,favorite_count,lang,user_id_str,conversation_id_str,username,tweet_url
2 | Wed Nov 29 23:55:15 +0000 2023,1730012580595528120,"yang menegaskan agar anggota KORPRI Kemhan dapat terus berkontribusi, berprestasi dan berinovasi dalam mewujudkan Indonesia yang lebih baik dan maju. #Prabowo #PrabowoSubianto #MenhanPrabowo #Herindra #WamenhanRI #Kemhan #KemhanRI #HUTKorpri2023",0,0,0,5,in,714719021816541184,1730012575465889999,Kemhan_RI,https://twitter.com/Kemhan_RI/status/1730012580595528120
3 | Wed Nov 29 23:55:14 +0000 2023,1730012575465889999,"Momen Wamenhan, M. Herindra @herindra87, memimpin Upacara HUT ke-52 KORPRI, di Lapangan Bhinneka Tunggal Ika, Kemhan Jakarta (29/11). Wamenhan M. Herindra membacakan Amanat Menteri Pertahanan Prabowo Subianto @prabowo, https://t.co/UN6DBPUbKI",0,1,5,27,in,714719021816541184,1730012575465889999,Kemhan_RI,https://twitter.com/Kemhan_RI/status/1730012575465889999
4 | Wed Nov 29 23:55:10 +0000 2023,1730012558126682481,"Akhir pekan ini capres dan cawapres nomor urut dua, Prabowo Subianto dan Gibran Rakabuming Raka dikabarkan akan mulai berkampanye di sejumlah daerah. https://t.co/VpjCI5Ntgc",0,0,0,1,in,154102750,1730012558126682481,Beritasatu,https://twitter.com/Beritasatu/status/1730012558126682481
5 | Wed Nov 29 23:50:41 +0000 2023,1730011427820454264,"Presiden Jokowi adalah seorang presiden, mentor , Guru , tauladan dan Bpk yg sangat baik rendah hati . Beliau bisa mengajarkan seorang Prabowo Subianto menjadi seorang yang humble . Jauh dari sosok Prabowo yg sebelum 2019 . Terima Kasih Pak @jokowi 🙏🏼✌🏼🫰🏼 https://t.co/d0TkIoEKqa",0,0,0,0,in,1598344201409699841,1730011427820454264,TitaMar80993092,https://twitter.com/TitaMar80993092/status/1730011427820454264
6 | Wed Nov 29 23:49:10 +0000 2023,1730011046113632550,@barubikinlol Ganjar Pranowo? Prabowo Subianto? Anis Baswedano? O semua ini saya rubah.,0,2,1,42,in,593490220,1729852949877137761,Goodwindology,https://twitter.com/Goodwindology/status/1730011046113632550
7 | Wed Nov 29 23:46:44 +0000 2023,1730010434306977842,Sri Mulyani mengungkap isi pertemuan antara Presiden Joko Widodo (Jokowi) dan Menteri Pertahanan Prabowo Subianto di Istana Bogor pada Selasa (28/11). https://t.co/mXhR6BJNjj,2,11,7,41,in,69183155,1730010434306977842,detikcom,https://twitter.com/detikcom/status/1730010434306977842
8 | Wed Nov 29 23:35:08 +0000 2023,1730007517206577450,"@DedynurPalakka @jokowi Cuma mengingatkan di surat suara pilpres tidak ada foto Jokowi. Yang ada Prabowo Subianto, Anis Baswedan dan Ganjar Pranowo, yang salah satunya menggantikan Jokowi. Setelah Oktober 2024 Jokowi pensiun dari presiden dan pulang ke Solo, menempati rumah hadiah negara'.",0,0,0,0,in,1589160626101764096,1729771749959426239,surosohariyamt3,https://twitter.com/surosohariyamt3/status/1730007517206577450
9 | Wed Nov 29 23:30:38 +0000 2023,1730006384488648796,Jaringan sudah terstruktur dengan baik yaa #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/T7db460FfE,0,0,0,1,in,1615964192313606144,1730006384488648796,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006384488648796
10 | Wed Nov 29 23:30:22 +0000 2023,1730006315685347793,Keterlibatan Pratikno dalam operasi pengkondisian NU dan merah #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/kPcqJBWJrJ,0,0,0,1,in,1615964192313606144,1730006315685347793,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006315685347793
11 | Wed Nov 29 23:29:12 +0000 2023,1730006025024258141,Ternyata sudah terstruktur ya Polri #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/lOEd2h27Bm,0,0,0,1,in,1615964192313606144,1730006025024258141,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006025024258141
12 | Wed Nov 29 23:29:32 +0000 2023,1730006106762789310,Polri emang beda #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/bYaW3FdryC,0,0,0,1,in,1615964192313606144,1730006106762789310,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006106762789310
13 | Wed Nov 29 23:29:58 +0000 2023,1730006216083112261,Kompaknya Kapolri dan wakapolri sokong Gibran meresahkan tubuh Kepolisian #PolriTidakNetral #Polri #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/cPdedKFENn,0,0,0,1,in,1615964192313606144,1730006216083112261,sejatidindaa,https://twitter.com/sejatidindaa/status/1730006216083112261
14 | Wed Nov 29 23:27:47 +0000 2023,1730005665496887795,Udah jelas salah malah mau menyangkal #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/fSdPOKN4zF,0,0,0,1,in,1615919266729193473,1730005665496887795,infovalidd,https://twitter.com/infovalidd/status/1730005665496887795
15 | Wed Nov 29 23:26:15 +0000 2023,1730005279465693301,"Baliho Besar Gagasan Kecil, wkwkwkw #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/OxGqczxYGY",0,0,0,1,in,1615919266729193473,1730005279465693301,infovalidd,https://twitter.com/infovalidd/status/1730005279465693301
16 | Wed Nov 29 23:24:21 +0000 2023,1730004804267573740,"Rayyanza Diajak Raffi Ahmad Ketemu Prabowo Subianto, Netizen: Apakah Gibran Akan Digantikan Cipung? https://t.co/dHzUgzY6JH",0,0,0,0,in,1349806712,1730004804267573740,nurasmi69,https://twitter.com/nurasmi69/status/1730004804267573740
17 | Wed Nov 29 23:21:24 +0000 2023,1730004061460791336,Banyak Netizen yang menilai Prabowo - GIbran maju capres cawapres tapi minim gagasan #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/XUk1m7XuKZ,0,0,0,1,in,1615919266729193473,1730004061460791336,infovalidd,https://twitter.com/infovalidd/status/1730004061460791336
18 | Wed Nov 29 23:20:41 +0000 2023,1730003878131970490,"wkwkwk wajar sering tidak nyambung umurnya udah tua, tidak cocok dipilih #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/CmzS22Lzhj",0,0,0,2,in,1615919266729193473,1730003878131970490,infovalidd,https://twitter.com/infovalidd/status/1730003878131970490
19 | Wed Nov 29 23:20:03 +0000 2023,1730003719331397700,Rakyat Paham mana yang tulus mana rakus #PrabowoArogan #prabowosubianto #PrabowoGibranTakutDebat #PrabowoBukanPanutan #PrabowoGibran https://t.co/ftlmcYBnPO,0,0,0,1,in,1615919266729193473,1730003719331397700,infovalidd,https://twitter.com/infovalidd/status/1730003719331397700
20 | Wed Nov 29 22:53:16 +0000 2023,1729996981492486216,@zy_zy_its_me @adri_7i Prabowo Subianto BUKAN AKI AKI ? SUDAH AKI AKI JADI IDOLAMU 🤮,1,4,0,1,in,1629815240790773760,1729868053201252726,1973Suharjito,https://twitter.com/1973Suharjito/status/1729996981492486216
21 | Wed Nov 29 22:11:42 +0000 2023,1729986518704590915,"Sekjen Partai Gerindra Ahmad Muzani menegaskan, proses pencapresan Prabowo Subianto tidak dilakukan secara instan. Ia mengatakan, Prabowo merupakan sosok pemimpin yang matang dan kaya pengalaman. < #Prabowo #Gerindra https://t.co/2kaOoujsmS",5,51,1,10,in,23343960,1729986518704590915,kompascom,https://twitter.com/kompascom/status/1729986518704590915
22 | Wed Nov 29 22:09:06 +0000 2023,1729985865697681425,"Ketua TKN Prabowo Subianto-Gibran Rakabuming Raka, Rosan Perkasa Roeslani memastikan Prabowo dan Presiden Jokowi tidak berbicara mengenai politik saat bertemu di Istana Kepresidenan Bogor, Jawa Barat, Selasa (28/11/2023) kemarin. < #Prabowo #Jokowi https://t.co/vkKjTHvBXX",0,3,0,0,in,23343960,1729985865697681425,kompascom,https://twitter.com/kompascom/status/1729985865697681425
23 | Wed Nov 29 22:07:56 +0000 2023,1729985571014213740,"Juru bicara TKN pasangan capres dan cawapres nomor urut 2, Prabowo Subianto-Gibran Rakabuming, Dedek Prayudi mengatakan, narasi politik “gemoy” dipakai untuk menggaet milenial dan gen Z. < #Prabowo #Gibran #PrabowoGibran #Pemilu2024KCM #JernihMemilih https://t.co/wAUyuhpJTt",0,5,0,0,in,23343960,1729985571014213740,kompascom,https://twitter.com/kompascom/status/1729985571014213740
24 | Wed Nov 29 22:07:09 +0000 2023,1729985372501971034,"Prabowo Subianto TELAH MELAKUKAN PELANGGARAN HAM BERAT ...dan dipecat (dipaksa pensiun dini) dari TNI (SK DKP)..!! SBY , sebagai anggota DKP ikut tanda tangan ...tapi sekarang PLIN PLAN ,ingin cari kekuasaan, ..SBY malah mendukung Prabowo... BOTOL dan KOPLAK...!! https://t.co/jtFhqNCUDY",0,0,1,0,in,1583805020910551042,1729985372501971034,AsharIbnu,https://twitter.com/AsharIbnu/status/1729985372501971034
25 | Wed Nov 29 22:05:34 +0000 2023,1729984977943818417,Prabowo Subianto Diprediksi Menang Pilpres 2024 Gara-gara Fuji: Ada Uti Makin Komplit https://t.co/UIliVnrR9S,30,277,27,182,in,41730943,1729984977943818417,VIVAcoid,https://twitter.com/VIVAcoid/status/1729984977943818417
26 | Wed Nov 29 21:42:44 +0000 2023,1729979231499608236,Pasangan Prabowo Subianto-Gibran Rakabuming Raka akan memaksimalkan kampanye di akhir pekan. #Polhuk #AdadiKompas https://t.co/j374nlirXp,0,0,0,0,in,771030588,1729979231499608236,KompasData,https://twitter.com/KompasData/status/1729979231499608236
27 | Wed Nov 29 21:40:04 +0000 2023,1729978560428093455,"Sukses Meremajakan C-130H, Kinerja GMF AeroAsia Diapresiasi Prabowo Subianto: Tujuh Hercules Juga Melakukan Upgrade & nbsp https://t.co/8I543qlbTY",0,0,0,0,in,1219947729155047424,1729978560428093455,zonajakarta1,https://twitter.com/zonajakarta1/status/1729978560428093455
28 | Wed Nov 29 21:38:11 +0000 2023,1729978086437818734,Pasangan Prabowo Subianto-Gibran Rakabuming Raka akan memaksimalkan kampanye di akhir pekan. #Polhuk #AdadiKompas https://t.co/eD7IqkVBud,0,0,0,2,in,255866913,1729978086437818734,hariankompas,https://twitter.com/hariankompas/status/1729978086437818734
29 | Wed Nov 29 21:35:17 +0000 2023,1729977355622584723,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/J3W7YDi2jM,0,0,0,0,in,763658966098403328,1729977355622584723,ayangmn,https://twitter.com/ayangmn/status/1729977355622584723
30 | Wed Nov 29 21:35:10 +0000 2023,1729977323552915581,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/928w3yizOD",0,0,0,0,in,763658966098403328,1729977323552915581,ayangmn,https://twitter.com/ayangmn/status/1729977323552915581
31 | Wed Nov 29 21:34:42 +0000 2023,1729977208851374383,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/L5bq40pVwi,0,0,0,0,in,763658966098403328,1729977208851374383,ayangmn,https://twitter.com/ayangmn/status/1729977208851374383
32 | Wed Nov 29 21:34:20 +0000 2023,1729977117906227415,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/4RBX35MWwu",0,0,0,0,in,763658966098403328,1729977117906227415,ayangmn,https://twitter.com/ayangmn/status/1729977117906227415
33 | Wed Nov 29 21:34:05 +0000 2023,1729977053167157685,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/5og08arFkH,0,0,0,0,in,763658966098403328,1729977053167157685,ayangmn,https://twitter.com/ayangmn/status/1729977053167157685
34 | Wed Nov 29 21:32:57 +0000 2023,1729976767685996827,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/ZJD1p1tasx,0,0,0,0,in,763658966098403328,1729976767685996827,ayangmn,https://twitter.com/ayangmn/status/1729976767685996827
35 | Wed Nov 29 21:32:47 +0000 2023,1729976724996464694,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/QbDp7wutfx,0,0,0,0,in,763658966098403328,1729976724996464694,ayangmn,https://twitter.com/ayangmn/status/1729976724996464694
36 | Wed Nov 29 21:32:44 +0000 2023,1729976715173388738,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/dbLCKBYkUb",0,0,0,0,in,763658966098403328,1729976715173388738,ayangmn,https://twitter.com/ayangmn/status/1729976715173388738
37 | Wed Nov 29 21:29:36 +0000 2023,1729975924882567660,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/whn7yl7HeD,0,0,0,0,in,2877262597,1729975924882567660,raninuran_,https://twitter.com/raninuran_/status/1729975924882567660
38 | Wed Nov 29 21:29:30 +0000 2023,1729975901331640521,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/DlS7pBmure",0,0,0,0,in,2877262597,1729975901331640521,raninuran_,https://twitter.com/raninuran_/status/1729975901331640521
39 | Wed Nov 29 21:29:13 +0000 2023,1729975829101465689,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/hWnJuf1xBU,0,0,0,0,in,2877262597,1729975829101465689,raninuran_,https://twitter.com/raninuran_/status/1729975829101465689
40 | Wed Nov 29 21:29:04 +0000 2023,1729975791218475425,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/x69jnJ2GQq",0,0,0,0,in,2877262597,1729975791218475425,raninuran_,https://twitter.com/raninuran_/status/1729975791218475425
41 | Wed Nov 29 21:28:47 +0000 2023,1729975718418026634,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/a3T4lDEARE,0,0,0,0,in,2877262597,1729975718418026634,raninuran_,https://twitter.com/raninuran_/status/1729975718418026634
42 | Wed Nov 29 21:27:48 +0000 2023,1729975471394480622,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/MSrJt19igo,0,0,0,0,in,2877262597,1729975471394480622,raninuran_,https://twitter.com/raninuran_/status/1729975471394480622
43 | Wed Nov 29 21:27:46 +0000 2023,1729975463463014868,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/Ju5Ka0X6gq,0,0,0,0,in,2877262597,1729975463463014868,raninuran_,https://twitter.com/raninuran_/status/1729975463463014868
44 | Wed Nov 29 21:27:44 +0000 2023,1729975453476323519,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/OmPIHbfoBW",0,0,0,0,in,2877262597,1729975453476323519,raninuran_,https://twitter.com/raninuran_/status/1729975453476323519
45 | Wed Nov 29 21:21:10 +0000 2023,1729973802518606068,"Tegas, Susi Pudjiastuti Tolak Bergabung dengan Partai Gerindra Besutan Prabowo Subianto - Wartakota https://t.co/Z2hfsCFnSV #Prabowo #BangkitBersama",0,0,0,0,in,465423257,1729973802518606068,haelamarie,https://twitter.com/haelamarie/status/1729973802518606068
46 | Wed Nov 29 21:15:52 +0000 2023,1729972468264427713,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/em1QAuxlst,0,0,0,0,in,326270868,1729972468264427713,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972468264427713
47 | Wed Nov 29 21:15:45 +0000 2023,1729972437155193078,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/JhrpYp0pGM",0,0,0,0,in,326270868,1729972437155193078,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972437155193078
48 | Wed Nov 29 21:15:21 +0000 2023,1729972340547805288,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/Z04fZUkMRy,0,0,0,0,in,326270868,1729972340547805288,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972340547805288
49 | Wed Nov 29 21:15:13 +0000 2023,1729972305118507294,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/rEnxXouR8v",0,0,0,0,in,326270868,1729972305118507294,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972305118507294
50 | Wed Nov 29 21:14:52 +0000 2023,1729972217587638641,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/qSqbCyhW9Q,0,0,0,0,in,326270868,1729972217587638641,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729972217587638641
51 | Wed Nov 29 21:13:57 +0000 2023,1729971984715710964,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/4KLg9qWX46,0,0,0,0,in,326270868,1729971984715710964,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971984715710964
52 | Wed Nov 29 21:13:53 +0000 2023,1729971970941518260,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/3JoFiStBbe,0,0,0,0,in,326270868,1729971970941518260,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971970941518260
53 | Wed Nov 29 21:13:50 +0000 2023,1729971957632995579,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/WwZNJ5gsnv",0,0,0,0,in,326270868,1729971957632995579,Sjoekuuuh,https://twitter.com/Sjoekuuuh/status/1729971957632995579
54 | Wed Nov 29 21:02:25 +0000 2023,1729969082626380119,@hasyimmah Dron sodron si GEMOY (GEMBROT & LETOY) alias PRABOWO SUBIANTO tdk akn Menang slm msh ada org Jawa asli yg nyalon Presiden . Kecuali dlm keadaan darurat.,0,1,0,0,in,1713944739043811328,1729742804077944956,SuciptoHad53960,https://twitter.com/SuciptoHad53960/status/1729969082626380119
55 | Wed Nov 29 20:18:12 +0000 2023,1729957955208921492,Wahlkampf in Indonesien gestartet - Wer folgt Präsident Jokowi? 3 Kandidaten: Umfrage: ehem. General u Verteidigungsminister Prabowo Subianto (72): 40% Ganjar Pranowo (Gouverneur Zentraljava): 28% Anies Baswedan: 24% https://t.co/O6fbEAnWvE,0,0,0,1,de,1238447220,1729957955208921492,Milatrud11,https://twitter.com/Milatrud11/status/1729957955208921492
56 | Wed Nov 29 19:21:19 +0000 2023,1729943639810568341,Pak Prabowo Subianto Rangkul SDM Putra Bangsa yang berprestasi Kelas Dunia .. Salam Prabowo Subianto Presiden RI 2024 .. #PrabowoGibranJuara2024 #rk08 #rkrijuara #JabarRumahPrabowo #JabarTetapPrabowo #JabarSolidPrabowo Sembilan Partai All-in @prabowo @gibran_tweet https://t.co/0tCxs6Opyc,1,0,3,1,in,1531971112929615872,1729943639810568341,rkrijuara,https://twitter.com/rkrijuara/status/1729943639810568341
57 | Wed Nov 29 19:05:40 +0000 2023,1729939700620915150,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/6CHcTQxIgk",0,0,0,0,in,1493150581602467840,1729939700620915150,kariimm_4,https://twitter.com/kariimm_4/status/1729939700620915150
58 | Wed Nov 29 18:55:03 +0000 2023,1729937028811518181,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/b12hOnKPDt",0,0,0,0,in,69874051,1729937028811518181,setiawann_24,https://twitter.com/setiawann_24/status/1729937028811518181
59 | Wed Nov 29 18:51:09 +0000 2023,1729936049642836017,Napro 08 siap bergerak menangkan Prabowo Subianto - ANTARA https://t.co/xO33Gojl73 #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,4579831158,1729936049642836017,ChairudinN6548,https://twitter.com/ChairudinN6548/status/1729936049642836017
60 | Wed Nov 29 18:48:10 +0000 2023,1729935297381855658,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/hCaldL1uM4",0,0,0,0,in,69891770,1729935297381855658,nadianurulazmii,https://twitter.com/nadianurulazmii/status/1729935297381855658
61 | Wed Nov 29 18:43:44 +0000 2023,1729934182078673278,AHY pimpin Deklarasi Capres koalisi Indonesia Maju Prabowo Subianto. pqc AgusYudhoyono PDemokrat DemokratS14P https://t.co/MYlUbIRSkk,0,0,0,0,in,1378473318,1729934182078673278,samueleto78,https://twitter.com/samueleto78/status/1729934182078673278
62 | Wed Nov 29 18:43:41 +0000 2023,1729934171240591417,"AHY Deklarasikan Prabowo Subianto Sebagai Capres Hari Ini, semangat perubahan tetap diperjuangkan. zkk PDemokrat AgusYudhoyono DemokratS14P https://t.co/ewT4rQmGSd",0,0,0,0,in,1378473318,1729934171240591417,samueleto78,https://twitter.com/samueleto78/status/1729934171240591417
63 | Wed Nov 29 18:43:18 +0000 2023,1729934074528329947,Mars Partai Demokrat bergema di kediaman bacapres Prabowo Subianto. Serdadu AHY S14P all out di pilpres dan pileg 2024! 💪. im` PDemokrat AgusYudhoyono DemokratS14P https://t.co/n5MneCU1qv,0,0,0,0,in,1378473318,1729934074528329947,samueleto78,https://twitter.com/samueleto78/status/1729934074528329947
64 | Wed Nov 29 18:43:12 +0000 2023,1729934047214997582,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/U3kUFLr2f0",0,0,0,0,in,1378473318,1729934047214997582,samueleto78,https://twitter.com/samueleto78/status/1729934047214997582
65 | Wed Nov 29 18:42:53 +0000 2023,1729933968617959710,Mas AHY sudah berpamitan kepada mba Puan soal keputusan dukung Prabowo Subianto.. \t^ PDemokrat AgusYudhoyono DemokratS14P https://t.co/J28OunKEBS,0,0,0,0,in,1378473318,1729933968617959710,samueleto78,https://twitter.com/samueleto78/status/1729933968617959710
66 | Wed Nov 29 18:14:56 +0000 2023,1729926933146956016,@theo12_ini PRABOWO SUBIANTO,0,1,0,1,pl,1702935093701201920,1729925957979664726,BeyondTheJoker,https://twitter.com/BeyondTheJoker/status/1729926933146956016
67 | Wed Nov 29 18:41:56 +0000 2023,1729933727973961946,Partai Demokrat resmi mendukung Prabowo Subianto sebagai calon presiden 2024. Ketum Demokrat AHY mengeluarkan instruksi untuk para kader Demokrat.. k_d PDemokrat AgusYudhoyono DemokratS14P https://t.co/SyZdBJPhv4,0,0,0,0,in,1378473318,1729933727973961946,samueleto78,https://twitter.com/samueleto78/status/1729933727973961946
68 | Wed Nov 29 18:41:51 +0000 2023,1729933710869590122,"Ketum Partai Demokrat Agus Harimurti Yudhoyono (AHY) meminta para kadernya untuk memenangkan bacapres Koalisi Indonesia Maju (KIM), Prabowo Subianto.. z]w AgusYudhoyono PDemokrat DemokratS14P https://t.co/NXlBC0AbSB",0,0,0,0,in,1378473318,1729933710869590122,samueleto78,https://twitter.com/samueleto78/status/1729933710869590122
69 | Wed Nov 29 18:28:44 +0000 2023,1729930406072611170,"Capres no.urut 2 Prabowo Subianto dicecar habis²an oleh National Corruption Watch"" 😢😢 Apa gak bahaya tah!! https://t.co/P2B0q3dczd""",9,14,164,403,in,1186243531037270017,1729930406072611170,msobri99,https://twitter.com/msobri99/status/1729930406072611170
70 | Wed Nov 29 18:24:07 +0000 2023,1729929245953290351,"Serikat Pekerja Nusantara Dukung Prabowo Subianto di Pilpres 2024, Siap Ikut Memenangkan https://t.co/NxMoX2ziNj",0,4,1,2,in,41730943,1729929245953290351,VIVAcoid,https://twitter.com/VIVAcoid/status/1729929245953290351
71 | Wed Nov 29 18:41:58 +0000 2023,1729933738048655587,AHY menginstruksikan kpd seluruh pimpinan DPD & DPC Demokrat di 38 provinsi utk menyusun strategi bersama memenangkan Prabowo Subianto dlm Pilpres 2024.. pxx AgusYudhoyono PDemokrat DemokratS14P https://t.co/dq22FLYUtI,0,0,0,0,in,1378473318,1729933738048655587,samueleto78,https://twitter.com/samueleto78/status/1729933738048655587
72 | Wed Nov 29 18:14:08 +0000 2023,1729926735792345406,@siregar_elang Apa yang diharapkan dari Prabowo Subianto... TNI sudah PECAT dia artinya sudah tak dibutuhkan... lantas ada yang ingin dia jadi panglima tertinggi di Indonesia...olala... sudah tidak ada lagi kah rakyat di negeri sebesar ini...????,0,0,0,0,in,1656204151590883328,1729310357582479617,CMediio63787,https://twitter.com/CMediio63787/status/1729926735792345406
73 | Wed Nov 29 17:36:58 +0000 2023,1729917379025760741,@dsvwaikdjns @tvOneNews Kejujuran Pak Prabowo Subianto adalah landasan kuat yang membedakannya sebagai pemimpin yang dapat diandalkan dan dapat dipercaya.,0,0,0,0,in,2388963246,1729652211435860474,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729917379025760741
74 | Wed Nov 29 17:33:33 +0000 2023,1729916521714192423,Sri Mulyani: Pinjaman Luar Negeri (UTANG) Kementerian Prabowo Tembus Rp385 Triliun. https://t.co/wBuIumeO4F Menteri Keuangan Sri Mulyani mengungkapkan HASIL RAPAT bersama Menhan Prabowo Subianto terkait BELANJA Alat Utama Sistem Pertahanan (ALUTSISTA) dari PINJAMAN LUAR NEGERI. https://t.co/8rEVFD8qhM,0,1,0,0,in,4517167514,1729916521714192423,sirajapadoha,https://twitter.com/sirajapadoha/status/1729916521714192423
75 | Wed Nov 29 17:33:21 +0000 2023,1729916468585005182,"@hddfsysgdcoghu @habiburokhman @prabowo @gibran_tweet Dalam memilih pemimpin, kejujuran adalah kualitas yang tak ternilai, dan saya yakin bahwa Pak Prabowo Subianto adalah sosok yang memenuhi kriteria tersebut.",0,0,0,0,in,707639560843468800,1729705301106913318,dsvwaikdjns,https://twitter.com/dsvwaikdjns/status/1729916468585005182
76 | Wed Nov 29 17:31:08 +0000 2023,1729915914420941148,"@hddfsysgdcoghu @ajengcute16__ Pilihannya untuk Pak Prabowo Subianto adalah pilihan untuk memiliki pemimpin yang jujur dan tulus, yang diharapkan membawa integritas dan kejujuran dalam kepemimpinan.",0,1,0,0,in,707639560843468800,1729821879903375505,dsvwaikdjns,https://twitter.com/dsvwaikdjns/status/1729915914420941148
77 | Wed Nov 29 17:29:49 +0000 2023,1729915580780835287,Mana nih yang belum gabung ke barisan Pak @prabowo ? #prabowosubianto #GibranRakabuming #prabowopresiden #PrabowoGibran2024 #indonesia #indonesian #viral #fyp #politik #ganjarpranowo #aniesbaswedan #pinterpolitik https://t.co/QTuDH4qkv1,3,21,19,47,in,1579126996537290752,1729915580780835287,mypresidentid,https://twitter.com/mypresidentid/status/1729915580780835287
78 | Wed Nov 29 17:29:10 +0000 2023,1729915416355705123,@ruyasagit900 @RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya semakin yakin bahwa memilih Pak Prabowo Subianto adalah langkah cerdas untuk membawa Indonesia ke tingkat kejayaan yang baru dan lebih baik.,0,0,0,0,in,2170027031,1729652171434541424,kingslandsjay,https://twitter.com/kingslandsjay/status/1729915416355705123
79 | Wed Nov 29 17:28:11 +0000 2023,1729915171257380927,"Prabowo Cinta Rakyat Prabowo Subianto Sumringah Dapat Dukungang Langsung oleh Dua Presiden, Termotivasi Menang di Pilpres - https://t.co/2aLVkgOAqp https://t.co/iVMbNlgMe8 #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy",0,0,8,0,in,814012104768307201,1729915171257380927,RondangKomariah,https://twitter.com/RondangKomariah/status/1729915171257380927
80 | Wed Nov 29 17:23:44 +0000 2023,1729914052191907910,@Gojokaisen86 @tvOneNews Saya semakin yakin bahwa Pak Prabowo Subianto adalah sosok yang memiliki kepekaan terhadap berbagai isu sosial dan ekonomi yang dihadapi oleh masyarakat Indonesia.,0,0,0,0,in,4153801512,1729652211435860474,mirashahabudin,https://twitter.com/mirashahabudin/status/1729914052191907910
81 | Wed Nov 29 17:22:18 +0000 2023,1729913691678871690,"@Gojokaisen86 @Chaves1305 Setiap kali saya mendalami visi dan misi Pak Prabowo Subianto, semakin kuat keyakinan saya bahwa beliau adalah pemimpin yang memiliki pandangan jauh ke depan.",1,0,0,0,in,4153801512,1729796437863518249,mirashahabudin,https://twitter.com/mirashahabudin/status/1729913691678871690
82 | Wed Nov 29 17:21:16 +0000 2023,1729913427743928498,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... - ANTARA https://t.co/5eUxBb88yl #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,797615281455525888,1729913427743928498,AzzrielAzaryahu,https://twitter.com/AzzrielAzaryahu/status/1729913427743928498
83 | Wed Nov 29 17:21:15 +0000 2023,1729913427127366076,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... - ANTARA https://t.co/WkscP4NgZU #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,4579831158,1729913427127366076,ChairudinN6548,https://twitter.com/ChairudinN6548/status/1729913427127366076
84 | Wed Nov 29 17:21:15 +0000 2023,1729913425281871905,Budiman Sudjatmiko nyatakan dukungan pada Prabowo Subianto ... - ANTARA https://t.co/XhI7hOxE6U #BersamaIndonesiaMaju #PrabowoGemoy #KodeKita08Gemoy,0,0,10,0,in,795445014864203777,1729913425281871905,AzhareAkiba,https://twitter.com/AzhareAkiba/status/1729913425281871905
85 | Wed Nov 29 17:20:54 +0000 2023,1729913338363310372,"@Gojokaisen86 @ajengcute16__ Semakin saya mendalami visi dan misi Pak Prabowo Subianto, semakin yakin bahwa beliau memiliki komitmen yang kuat untuk membawa Indonesia ke arah yang lebih baik.",0,0,0,0,in,4153801512,1729821879903375505,mirashahabudin,https://twitter.com/mirashahabudin/status/1729913338363310372
86 | Wed Nov 29 17:17:37 +0000 2023,1729912510743245236,@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Semoga dukungan kita terhadap Pak Prabowo Subianto dapat memberikan inspirasi bagi masyarakat untuk tetap bersatu dan menjaga persatuan dalam perbedaan.,0,0,0,0,in,4153801512,1729652171434541424,mirashahabudin,https://twitter.com/mirashahabudin/status/1729912510743245236
87 | Wed Nov 29 17:10:05 +0000 2023,1729910615807005061,@DaengWahidin2 Biarkan rakyat memilih pilihannya yang jelas aku pilih Prabowo Subianto aja oke,0,0,0,0,in,1675018752772149249,1729364216082231802,KangAnom377663,https://twitter.com/KangAnom377663/status/1729910615807005061
88 | Wed Nov 29 17:08:35 +0000 2023,1729910239070449982,"@tvOneNews Dalam mendukung Pak Prabowo Subianto, saya mengajak semua pihak untuk menghormati perbedaan pendapat dan membangun dialog yang konstruktif.",0,0,0,0,in,2170027031,1729652211435860474,kingslandsjay,https://twitter.com/kingslandsjay/status/1729910239070449982
89 | Wed Nov 29 17:08:06 +0000 2023,1729910116856820046,"Tiba-tiba Hentikan Langkah Prabowo Subianto, Aksi Ajudan Ganteng Ini Bikin Heboh - https://t.co/BwJCocZuOd https://t.co/UETxOatZAb #Prabowo #PrabowoUnggul",0,0,0,0,in,1110932693426954240,1729910116856820046,fathw25,https://twitter.com/fathw25/status/1729910116856820046
90 | Wed Nov 29 17:06:10 +0000 2023,1729909631127056760,"Menhan Prabowo Subianto Hadiri Pelantikan KSAD Maruli Simanjuntak: Jakarta – Menteri Pertahanan Prabowo Subianto menghadiri pelantikan Kepala Staf Angkatan Darat (KASAD) di Istana Negara, Jakarta, Rabu (29/11). Presiden Joko Widodo… https://t.co/mHUItX9YYF via @kabar_tangsel",0,0,0,0,in,1115363858,1729909631127056760,tangselupdates,https://twitter.com/tangselupdates/status/1729909631127056760
91 | Wed Nov 29 17:06:07 +0000 2023,1729909617612984495,@ajengcute16__ Dukungan saya untuk Pak Prabowo Subianto adalah hasil dari keyakinan saya akan visi dan komitmen beliau untuk membawa perubahan positif di Indonesia.,0,7,0,0,in,2170027031,1729821879903375505,kingslandsjay,https://twitter.com/kingslandsjay/status/1729909617612984495
92 | Wed Nov 29 16:56:57 +0000 2023,1729907310036972000,Semua akan all in pak Prabowo Subianto pada waktunya ☺ #Allinprabowo #PrabowoGibranIstimewa,0,0,0,0,in,780718575271055360,1729907310036972000,hddfsysgdcoghu,https://twitter.com/hddfsysgdcoghu/status/1729907310036972000
93 | Wed Nov 29 16:53:00 +0000 2023,1729906315638796643,"@ajengcute16__ Semakin banyak anak muda yang memilih Pak Prabowo Subianto sebagai calon presidennya, semakin terasa semangat perubahan yang diinginkan generasi muda untuk masa depan Indonesia.",0,5,0,2,in,780718575271055360,1729821879903375505,hddfsysgdcoghu,https://twitter.com/hddfsysgdcoghu/status/1729906315638796643
94 | Wed Nov 29 16:49:30 +0000 2023,1729905433685692855,"Dari APBN Gak Cukup, Menhan Prabowo Utang ke Luar Negri Rp. 385 Triliun //.. Sri Mulyani: Pinjaman Luar Negeri Kementerian Prabowo Tembus Rp385 Triliun Menkeu Sri Mulyani buka suara soal pinjaman luar negeri Kementerian Pertahanan yang dipimpin Prabowo Subianto. https://t.co/mvbn0p9QpA",0,2,2,1,in,1319684366591639552,1729905433685692855,DahonoB,https://twitter.com/DahonoB/status/1729905433685692855
95 | Wed Nov 29 16:49:25 +0000 2023,1729905414412890420,"Bersama Indonesia Maju, @prabowo - @gibran_tweet satu putaran 💛 #airlanggahartarto #prabowosubianto #gibranrakabumingraka #golkarprabowo #gerindra #DUAsejolipaketkomplet https://t.co/yoo20na3Vm",0,0,0,0,in,1412673973129646084,1729905414412890420,g_politik2024,https://twitter.com/g_politik2024/status/1729905414412890420
96 | Wed Nov 29 16:46:09 +0000 2023,1729904592899707166,@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya yakin bahwa mendukung Pak Prabowo Subianto adalah langkah yang tepat untuk menciptakan perubahan positif yang kita inginkan dalam perjalanan menuju masa depan Indonesia yang lebih baik.,0,1,0,1,in,983772276222029825,1729652171434541424,ruyasagit900,https://twitter.com/ruyasagit900/status/1729904592899707166
97 | Wed Nov 29 16:44:35 +0000 2023,1729904198043758664,@tvOneNews Saya yakin bahwa Pak Prabowo Subianto adalah pemimpin yang memiliki pandangan jauh ke depan dan mampu menghadirkan solusi inovatif untuk masalah-masalah yang kompleks,0,2,0,1,in,983772276222029825,1729652211435860474,ruyasagit900,https://twitter.com/ruyasagit900/status/1729904198043758664
98 | Wed Nov 29 16:39:19 +0000 2023,1729902870659764680,"@RcyberProj0 @prabowo @gibran_tweet @Projo_Pusat Saya semakin percaya bahwa Pak Prabowo Subianto adalah pemimpin yang memiliki integritas tinggi, dan kesetiaan beliau terhadap nilai-nilai moral adalah nilai tambah yang luar biasa.",0,3,0,1,in,2388963246,1729652171434541424,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729902870659764680
99 | Wed Nov 29 16:38:35 +0000 2023,1729902689310679408,Semangat capres 2024 KIM Prabowo Subianto tidak pernah padam untuk Indonesia 🔥 @prabowo @Gibran_rakabuming @Gerindra Gerindrajateng #gerindrajateng #kim #gerindrajawatengah #gerindrakotasemarang https://t.co/b5ZkYvxWJn,0,0,0,0,in,1610207637513318404,1729902689310679408,gerindrakotasmg,https://twitter.com/gerindrakotasmg/status/1729902689310679408
100 | Wed Nov 29 16:36:39 +0000 2023,1729902201982849523,"@Leonita_Lestari PDI-P no. 3 , Ganjar Mahfud no. 3, PS kalah 3 x. Tanda tanda alam semesta mendukung Prabowo Subianto melakukan hattrick kekalahan",0,0,0,0,in,1491438677674717186,1729745613947060471,Krisna0902022,https://twitter.com/Krisna0902022/status/1729902201982849523
101 | Wed Nov 29 16:34:14 +0000 2023,1729901591921332734,"@ajengcute16__ Semakin saya memahami visi dan misi Pak Prabowo Subianto, semakin yakin bahwa beliau memiliki rencana konkret untuk memajukan Indonesia ke arah yang lebih baik.",0,8,0,2,in,2388963246,1729821879903375505,Gojokaisen86,https://twitter.com/Gojokaisen86/status/1729901591921332734
102 | Wed Nov 29 21:21:27 +0000 2023,1729973875046482407,"Deklarasi Calon Presiden Koalisi Indonesia Maju, PRABOWO SUBIANTO. taz AgusYudhoyono PDemokrat DemokratS14P https://t.co/ykGOW4vGsS",0,0,0,0,in,2757822406,1729973875046482407,deriskyafriza,https://twitter.com/deriskyafriza/status/1729973875046482407
103 | Wed Nov 29 20:31:37 +0000 2023,1729961333096108096,"@ChrisJ_2211 @Gus_Raharjo @PDI_Perjuangan @adearmando61 2019 Prabowo Subianto dapat award kebohongan terlebay dari PSI, 2024 PSI akan dapat penghargaan dari Rakyat Partai paling Menggibrani....!!!",0,1,0,1,in,1717967580336721920,1729454331379040575,Melly4505483263,https://twitter.com/Melly4505483263/status/1729961333096108096
104 | Wed Nov 29 19:29:55 +0000 2023,1729945805107425495,Pemimpin zalim sudah merusak demokrasi yang masih seumur jagung #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka https://t.co/wtjPo0mYQx,0,0,0,0,in,1710266831326621697,1729945805107425495,inumalica,https://twitter.com/inumalica/status/1729945805107425495
105 | Wed Nov 29 18:08:21 +0000 2023,1729925278133350703,Kekecewaan mahasiswa atas ketidakpahaman Jokowi terhadap demokrasi dan konstitusi #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/4MI3MSN70a,0,0,0,229,in,1710266831326621697,1729925278133350703,inumalica,https://twitter.com/inumalica/status/1729925278133350703
106 | Wed Nov 29 18:04:59 +0000 2023,1729924432813289928,Jokowi dan kelompok yang telah merusak demokrasi harus taat konstitusi #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/L25Zlx2PCj,0,0,0,0,in,1710266831326621697,1729924432813289928,inumalica,https://twitter.com/inumalica/status/1729924432813289928
107 | Wed Nov 29 17:59:42 +0000 2023,1729923100798808155,Aksi mahasiswa yang peduli terhadap demokrasi dan benci terhadap nepotisme #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/MzIDdKKptU,0,0,0,0,in,1710266831326621697,1729923100798808155,inumalica,https://twitter.com/inumalica/status/1729923100798808155
108 | Wed Nov 29 17:50:01 +0000 2023,1729920665439105138,Gerakan mahasiswa untuk mencegah demokrasi semakin hancur #gibran #prabowosubianto #neoorba #anies #dinastipolitikjokowi #jokowi #kamimuak #cawapresboneka || Yati Sukabumi Daesang Sebong Radja Nainggolan ROTY Niki KEREN BHT Sehun Tandanya|| https://t.co/F8hzMXQT8U,0,0,0,0,in,1710258220605022208,1729920665439105138,f4jar_mjaya,https://twitter.com/f4jar_mjaya/status/1729920665439105138
109 |
--------------------------------------------------------------------------------
/notebook/enrichment - 2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "a6b8a7b4",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# Load libraries \n",
11 | "import os\n",
12 | "import re\n",
13 | "import time\n",
14 | "import openai \n",
15 | "import pandas as pd \n",
16 | "from tqdm import tqdm\n",
17 | "from typing import Tuple\n",
18 | "from dotenv import load_dotenv\n",
19 | "\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "pd.set_option(\"display.max_columns\", None)"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "id": "7f7ccfa1",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Setting credentials\n",
33 | "OPENAI_KEY = os.getenv(\"OPENAI_API_KEY\", default = None) \n",
34 | "openai.api_key = OPENAI_KEY"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 5,
40 | "id": "e205895d",
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/html": [
46 | "\n",
47 | "\n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " | \n",
64 | " created_at | \n",
65 | " id_str | \n",
66 | " full_text | \n",
67 | " quote_count | \n",
68 | " reply_count | \n",
69 | " retweet_count | \n",
70 | " favorite_count | \n",
71 | " lang | \n",
72 | " user_id_str | \n",
73 | " conversation_id_str | \n",
74 | " username | \n",
75 | " tweet_url | \n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | " \n",
80 | " | 0 | \n",
81 | " Fri Nov 17 23:58:21 +0000 2023 | \n",
82 | " 1.725665e+18 | \n",
83 | " @gogo74070675957 @iina_surbakti @gibran_tweet ... | \n",
84 | " 0.0 | \n",
85 | " 0.0 | \n",
86 | " 0.0 | \n",
87 | " 1.0 | \n",
88 | " in | \n",
89 | " 1.213867e+18 | \n",
90 | " 1.725499e+18 | \n",
91 | " ArjunaOnthesky | \n",
92 | " https://twitter.com/ArjunaOnthesky/status/1725... | \n",
93 | "
\n",
94 | " \n",
95 | " | 1 | \n",
96 | " Fri Nov 17 23:58:19 +0000 2023 | \n",
97 | " 1.725665e+18 | \n",
98 | " @vendie7 Btw sy msh ingat omelanmu ttg prof MD... | \n",
99 | " NaN | \n",
100 | " NaN | \n",
101 | " NaN | \n",
102 | " NaN | \n",
103 | " NaN | \n",
104 | " NaN | \n",
105 | " NaN | \n",
106 | " NaN | \n",
107 | " NaN | \n",
108 | "
\n",
109 | " \n",
110 | " | 2 | \n",
111 | " Fri Nov 17 23:57:49 +0000 2023 | \n",
112 | " 1.725665e+18 | \n",
113 | " ðŸ”´âšªï¸ PEMILU terutama PILPRES adalah SATU... | \n",
114 | " 1.0 | \n",
115 | " 7.0 | \n",
116 | " 12.0 | \n",
117 | " 29.0 | \n",
118 | " in | \n",
119 | " 1.378303e+18 | \n",
120 | " 1.725665e+18 | \n",
121 | " _BungHerwin | \n",
122 | " https://twitter.com/_BungHerwin/status/1725664... | \n",
123 | "
\n",
124 | " \n",
125 | " | 3 | \n",
126 | " Fri Nov 17 23:57:34 +0000 2023 | \n",
127 | " 1.725665e+18 | \n",
128 | " @ekagumilars Indonesia aman & damai tanpa ... | \n",
129 | " 0.0 | \n",
130 | " 0.0 | \n",
131 | " 0.0 | \n",
132 | " 0.0 | \n",
133 | " in | \n",
134 | " 2.537213e+09 | \n",
135 | " 1.725384e+18 | \n",
136 | " irfandjay | \n",
137 | " https://twitter.com/irfandjay/status/172566450... | \n",
138 | "
\n",
139 | " \n",
140 | " | 4 | \n",
141 | " Fri Nov 17 23:57:31 +0000 2023 | \n",
142 | " 1.725664e+18 | \n",
143 | " Pilpres kali ini | \n",
144 | " NaN | \n",
145 | " NaN | \n",
146 | " NaN | \n",
147 | " NaN | \n",
148 | " NaN | \n",
149 | " NaN | \n",
150 | " NaN | \n",
151 | " NaN | \n",
152 | " NaN | \n",
153 | "
\n",
154 | " \n",
155 | "
\n",
156 | "
"
157 | ],
158 | "text/plain": [
159 | " created_at id_str \\\n",
160 | "0 Fri Nov 17 23:58:21 +0000 2023 1.725665e+18 \n",
161 | "1 Fri Nov 17 23:58:19 +0000 2023 1.725665e+18 \n",
162 | "2 Fri Nov 17 23:57:49 +0000 2023 1.725665e+18 \n",
163 | "3 Fri Nov 17 23:57:34 +0000 2023 1.725665e+18 \n",
164 | "4 Fri Nov 17 23:57:31 +0000 2023 1.725664e+18 \n",
165 | "\n",
166 | " full_text quote_count \\\n",
167 | "0 @gogo74070675957 @iina_surbakti @gibran_tweet ... 0.0 \n",
168 | "1 @vendie7 Btw sy msh ingat omelanmu ttg prof MD... NaN \n",
169 | "2 ðŸ”´âšªï¸ PEMILU terutama PILPRES adalah SATU... 1.0 \n",
170 | "3 @ekagumilars Indonesia aman & damai tanpa ... 0.0 \n",
171 | "4 Pilpres kali ini NaN \n",
172 | "\n",
173 | " reply_count retweet_count favorite_count lang user_id_str \\\n",
174 | "0 0.0 0.0 1.0 in 1.213867e+18 \n",
175 | "1 NaN NaN NaN NaN NaN \n",
176 | "2 7.0 12.0 29.0 in 1.378303e+18 \n",
177 | "3 0.0 0.0 0.0 in 2.537213e+09 \n",
178 | "4 NaN NaN NaN NaN NaN \n",
179 | "\n",
180 | " conversation_id_str username \\\n",
181 | "0 1.725499e+18 ArjunaOnthesky \n",
182 | "1 NaN NaN \n",
183 | "2 1.725665e+18 _BungHerwin \n",
184 | "3 1.725384e+18 irfandjay \n",
185 | "4 NaN NaN \n",
186 | "\n",
187 | " tweet_url \n",
188 | "0 https://twitter.com/ArjunaOnthesky/status/1725... \n",
189 | "1 NaN \n",
190 | "2 https://twitter.com/_BungHerwin/status/1725664... \n",
191 | "3 https://twitter.com/irfandjay/status/172566450... \n",
192 | "4 NaN "
193 | ]
194 | },
195 | "execution_count": 5,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "# Load dataset\n",
202 | "data = pd.read_excel(\"../dataset/Full Dataset Kotor updated 2.0.xlsx\")\n",
203 | "data.head()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 6,
209 | "id": "4065ce80",
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | "False 1564\n",
216 | "True 172\n",
217 | "dtype: int64"
218 | ]
219 | },
220 | "execution_count": 6,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "# Data Duplicate checking\n",
227 | "data.duplicated(subset = ['full_text', 'id_str', 'retweet_count']).value_counts()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 7,
233 | "id": "bc07851b",
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/html": [
239 | "\n",
240 | "\n",
253 | "
\n",
254 | " \n",
255 | " \n",
256 | " | \n",
257 | " created_at | \n",
258 | " id_str | \n",
259 | " full_text | \n",
260 | " quote_count | \n",
261 | " reply_count | \n",
262 | " retweet_count | \n",
263 | " favorite_count | \n",
264 | " lang | \n",
265 | " user_id_str | \n",
266 | " conversation_id_str | \n",
267 | " username | \n",
268 | " tweet_url | \n",
269 | "
\n",
270 | " \n",
271 | " \n",
272 | " \n",
273 | " | 151 | \n",
274 | " Fri Nov 17 21:00:46 +0000 2023 | \n",
275 | " 1.725620e+18 | \n",
276 | " @NenkMonica @gibran_tweet Salahnya dmn dateng ... | \n",
277 | " NaN | \n",
278 | " NaN | \n",
279 | " NaN | \n",
280 | " NaN | \n",
281 | " NaN | \n",
282 | " NaN | \n",
283 | " NaN | \n",
284 | " NaN | \n",
285 | " NaN | \n",
286 | "
\n",
287 | " \n",
288 | " | 152 | \n",
289 | " Fri Nov 17 20:49:46 +0000 2023 | \n",
290 | " 1.725617e+18 | \n",
291 | " Kreatif balon #GanjarMahfud2024 anti di Bongka... | \n",
292 | " 0.0 | \n",
293 | " 3.0 | \n",
294 | " 7.0 | \n",
295 | " 24.0 | \n",
296 | " in | \n",
297 | " 7.945097e+17 | \n",
298 | " 1.725617e+18 | \n",
299 | " Jayabay19479190 | \n",
300 | " https://twitter.com/Jayabay19479190/status/172... | \n",
301 | "
\n",
302 | " \n",
303 | " | 352 | \n",
304 | " Fri Nov 17 16:24:41 +0000 2023 | \n",
305 | " 1.725551e+18 | \n",
306 | " @DPP_PKB @ninikwafiroh Maju bersama AMIN memen... | \n",
307 | " 0.0 | \n",
308 | " 0.0 | \n",
309 | " 0.0 | \n",
310 | " 0.0 | \n",
311 | " in | \n",
312 | " 1.618235e+18 | \n",
313 | " 1.725482e+18 | \n",
314 | " IndahRahaleb | \n",
315 | " https://twitter.com/IndahRahaleb/status/172555... | \n",
316 | "
\n",
317 | " \n",
318 | " | 353 | \n",
319 | " Fri Nov 17 16:23:57 +0000 2023 | \n",
320 | " 1.725550e+18 | \n",
321 | " @triwul82 Mahfud MD mengingatkan bahwa tanggun... | \n",
322 | " NaN | \n",
323 | " NaN | \n",
324 | " NaN | \n",
325 | " NaN | \n",
326 | " NaN | \n",
327 | " NaN | \n",
328 | " NaN | \n",
329 | " NaN | \n",
330 | " NaN | \n",
331 | "
\n",
332 | " \n",
333 | " | 529 | \n",
334 | " Fri Nov 17 23:56:55 +0000 2023 | \n",
335 | " 1.725664e+18 | \n",
336 | " @Mdy_Asmara1701 Para Kabinda di banyak daerah ... | \n",
337 | " NaN | \n",
338 | " NaN | \n",
339 | " NaN | \n",
340 | " NaN | \n",
341 | " NaN | \n",
342 | " NaN | \n",
343 | " NaN | \n",
344 | " NaN | \n",
345 | " NaN | \n",
346 | "
\n",
347 | " \n",
348 | " | 586 | \n",
349 | " Fri Nov 17 23:58:21 +0000 2023 | \n",
350 | " 1.725665e+18 | \n",
351 | " @gogo74070675957 @iina_surbakti @gibran_tweet ... | \n",
352 | " 0.0 | \n",
353 | " 0.0 | \n",
354 | " 0.0 | \n",
355 | " 1.0 | \n",
356 | " in | \n",
357 | " 1.213867e+18 | \n",
358 | " 1.725499e+18 | \n",
359 | " ArjunaOnthesky | \n",
360 | " https://twitter.com/ArjunaOnthesky/status/1725... | \n",
361 | "
\n",
362 | " \n",
363 | " | 587 | \n",
364 | " Fri Nov 17 23:56:33 +0000 2023 | \n",
365 | " 1.725664e+18 | \n",
366 | " Ini Daftar Nama dan Struktur Lengkap TKN Prabo... | \n",
367 | " 0.0 | \n",
368 | " 0.0 | \n",
369 | " 0.0 | \n",
370 | " 0.0 | \n",
371 | " in | \n",
372 | " 1.110933e+18 | \n",
373 | " 1.725664e+18 | \n",
374 | " fathw25 | \n",
375 | " https://twitter.com/fathw25/status/17256642523... | \n",
376 | "
\n",
377 | " \n",
378 | " | 588 | \n",
379 | " Fri Nov 17 23:53:50 +0000 2023 | \n",
380 | " 1.725664e+18 | \n",
381 | " Kecuali Jokowi jadi ketum Golkar dan Gibran ja... | \n",
382 | " 0.0 | \n",
383 | " 0.0 | \n",
384 | " 0.0 | \n",
385 | " 0.0 | \n",
386 | " in | \n",
387 | " 8.486730e+07 | \n",
388 | " 1.725664e+18 | \n",
389 | " tualang | \n",
390 | " https://twitter.com/tualang/status/17256635676... | \n",
391 | "
\n",
392 | " \n",
393 | " | 589 | \n",
394 | " Fri Nov 17 23:35:18 +0000 2023 | \n",
395 | " 1.725659e+18 | \n",
396 | " Temukan dan dapatkan Kaos Baju prabowo gemoy -... | \n",
397 | " 0.0 | \n",
398 | " 0.0 | \n",
399 | " 0.0 | \n",
400 | " 0.0 | \n",
401 | " in | \n",
402 | " 2.993486e+09 | \n",
403 | " 1.725659e+18 | \n",
404 | " aris_jenang | \n",
405 | " https://twitter.com/aris_jenang/status/1725658... | \n",
406 | "
\n",
407 | " \n",
408 | " | 590 | \n",
409 | " Fri Nov 17 23:35:18 +0000 2023 | \n",
410 | " 1.725659e+18 | \n",
411 | " Masyarakat Menilai Gibran Tidak Mempunyai Kapa... | \n",
412 | " 0.0 | \n",
413 | " 0.0 | \n",
414 | " 0.0 | \n",
415 | " 1.0 | \n",
416 | " in | \n",
417 | " 1.618289e+18 | \n",
418 | " 1.725659e+18 | \n",
419 | " Liza16144812 | \n",
420 | " https://twitter.com/Liza16144812/status/172565... | \n",
421 | "
\n",
422 | " \n",
423 | "
\n",
424 | "
"
425 | ],
426 | "text/plain": [
427 | " created_at id_str \\\n",
428 | "151 Fri Nov 17 21:00:46 +0000 2023 1.725620e+18 \n",
429 | "152 Fri Nov 17 20:49:46 +0000 2023 1.725617e+18 \n",
430 | "352 Fri Nov 17 16:24:41 +0000 2023 1.725551e+18 \n",
431 | "353 Fri Nov 17 16:23:57 +0000 2023 1.725550e+18 \n",
432 | "529 Fri Nov 17 23:56:55 +0000 2023 1.725664e+18 \n",
433 | "586 Fri Nov 17 23:58:21 +0000 2023 1.725665e+18 \n",
434 | "587 Fri Nov 17 23:56:33 +0000 2023 1.725664e+18 \n",
435 | "588 Fri Nov 17 23:53:50 +0000 2023 1.725664e+18 \n",
436 | "589 Fri Nov 17 23:35:18 +0000 2023 1.725659e+18 \n",
437 | "590 Fri Nov 17 23:35:18 +0000 2023 1.725659e+18 \n",
438 | "\n",
439 | " full_text quote_count \\\n",
440 | "151 @NenkMonica @gibran_tweet Salahnya dmn dateng ... NaN \n",
441 | "152 Kreatif balon #GanjarMahfud2024 anti di Bongka... 0.0 \n",
442 | "352 @DPP_PKB @ninikwafiroh Maju bersama AMIN memen... 0.0 \n",
443 | "353 @triwul82 Mahfud MD mengingatkan bahwa tanggun... NaN \n",
444 | "529 @Mdy_Asmara1701 Para Kabinda di banyak daerah ... NaN \n",
445 | "586 @gogo74070675957 @iina_surbakti @gibran_tweet ... 0.0 \n",
446 | "587 Ini Daftar Nama dan Struktur Lengkap TKN Prabo... 0.0 \n",
447 | "588 Kecuali Jokowi jadi ketum Golkar dan Gibran ja... 0.0 \n",
448 | "589 Temukan dan dapatkan Kaos Baju prabowo gemoy -... 0.0 \n",
449 | "590 Masyarakat Menilai Gibran Tidak Mempunyai Kapa... 0.0 \n",
450 | "\n",
451 | " reply_count retweet_count favorite_count lang user_id_str \\\n",
452 | "151 NaN NaN NaN NaN NaN \n",
453 | "152 3.0 7.0 24.0 in 7.945097e+17 \n",
454 | "352 0.0 0.0 0.0 in 1.618235e+18 \n",
455 | "353 NaN NaN NaN NaN NaN \n",
456 | "529 NaN NaN NaN NaN NaN \n",
457 | "586 0.0 0.0 1.0 in 1.213867e+18 \n",
458 | "587 0.0 0.0 0.0 in 1.110933e+18 \n",
459 | "588 0.0 0.0 0.0 in 8.486730e+07 \n",
460 | "589 0.0 0.0 0.0 in 2.993486e+09 \n",
461 | "590 0.0 0.0 1.0 in 1.618289e+18 \n",
462 | "\n",
463 | " conversation_id_str username \\\n",
464 | "151 NaN NaN \n",
465 | "152 1.725617e+18 Jayabay19479190 \n",
466 | "352 1.725482e+18 IndahRahaleb \n",
467 | "353 NaN NaN \n",
468 | "529 NaN NaN \n",
469 | "586 1.725499e+18 ArjunaOnthesky \n",
470 | "587 1.725664e+18 fathw25 \n",
471 | "588 1.725664e+18 tualang \n",
472 | "589 1.725659e+18 aris_jenang \n",
473 | "590 1.725659e+18 Liza16144812 \n",
474 | "\n",
475 | " tweet_url \n",
476 | "151 NaN \n",
477 | "152 https://twitter.com/Jayabay19479190/status/172... \n",
478 | "352 https://twitter.com/IndahRahaleb/status/172555... \n",
479 | "353 NaN \n",
480 | "529 NaN \n",
481 | "586 https://twitter.com/ArjunaOnthesky/status/1725... \n",
482 | "587 https://twitter.com/fathw25/status/17256642523... \n",
483 | "588 https://twitter.com/tualang/status/17256635676... \n",
484 | "589 https://twitter.com/aris_jenang/status/1725658... \n",
485 | "590 https://twitter.com/Liza16144812/status/172565... "
486 | ]
487 | },
488 | "execution_count": 7,
489 | "metadata": {},
490 | "output_type": "execute_result"
491 | }
492 | ],
493 | "source": [
494 | "# Overview duplicated data\n",
495 | "data[data.duplicated(subset = ['full_text', 'id_str', 'retweet_count'])].head(10)"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 8,
501 | "id": "f1d94386",
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "# Duplicate data filtering\n",
506 | "data = data.drop_duplicates(subset = ['full_text', 'id_str', 'retweet_count'])"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 9,
512 | "id": "f5e82215",
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "data": {
517 | "text/plain": [
518 | "False 1564\n",
519 | "dtype: int64"
520 | ]
521 | },
522 | "execution_count": 9,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 | "# Data Duplicate checking - validation\n",
529 | "data.duplicated(subset = ['full_text', 'id_str', 'retweet_count']).value_counts()"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 10,
535 | "id": "b8e3d826",
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
# Prompt template for the enrichment step
def prompt_enrichment(tweet_comment: str) -> str:
    """Build the (Indonesian) extraction prompt for a single tweet.

    The prompt asks the model to return only two fields:
    named_entity_recognition (PERSON / ORGANIZATION labels only) and
    aspect_sentiment (POS-derived verb / noun phrases with a sentiment each).
    """
    # Return the f-string directly instead of binding it to a temporary first.
    return f"""
    Ekstraksi informasi yang dibutuhkan berdasarkan komentar twitter dibawah, dengan response cukup sesuai yang di definisikan tanpa penjelasan tambahan.

    komentar_twitter: "{tweet_comment}"

    Untuk response cukup isi dengan format dibawah.
    named_entity_recognition: [Jawaban anda: cakupan NER sesuai label "PERSON" atau "ORGANIZATION" saja]
    aspect_sentiment: [Identifikasi verb / noun-phrase hasil dari part-of-speech di dalam komentar, disertai dengan nilai sentiment masing-masing aspect dengan format ]
    """
552 | "\n",
def ingest_openai(tweet_comment: str, model_base: str = "gpt-3.5-turbo") -> Tuple[str, int]:
    """Run the enrichment prompt for one tweet through the OpenAI chat API.

    Parameters
    ----------
    tweet_comment : raw tweet text to enrich.
    model_base    : chat model name passed to the API.

    Returns
    -------
    (extraction_text, total_tokens); ("", 0) if every attempt fails.

    Bug fixed: the original recursed on error but discarded the recursive
    call's return value (and dropped `model_base`), so after any API error
    the caller always received ("", 0). Replaced with a bounded retry loop
    that returns the successful result and reuses the same model.
    """
    for attempt in range(3):  # bounded retries instead of unbounded recursion
        try:
            response = openai.ChatCompletion.create(
                model = model_base,
                messages = [{"role" : "user", "content" : prompt_enrichment(tweet_comment)}],
                temperature = 0.1, max_tokens = 512, top_p = 1.0,
                frequency_penalty = 0.0, presence_penalty = 0.0
            )
            # Successful call: return extraction text and token usage together.
            return response["choices"][0]["message"]["content"], response["usage"]["total_tokens"]
        except Exception as E:
            print(f"[ERROR] - {E}")
            print(f"Retrying ({attempt + 1}/3) after pause")
            time.sleep(5)  # back off briefly before retrying (rate limits / timeouts)
    # All attempts failed: keep the original fallback contract.
    return "", 0
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 13,
576 | "id": "6447e99d",
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "name": "stdout",
581 | "output_type": "stream",
582 | "text": [
583 | "Total Rows: 1564\n"
584 | ]
585 | }
586 | ],
587 | "source": [
588 | "# Check total rows\n",
589 | "print(f\"Total Rows: {data.shape[0]}\")"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 11,
595 | "id": "e8a8dc81",
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "name": "stdout",
600 | "output_type": "stream",
601 | "text": [
602 | "[COMMENT]\n",
603 | "Komisi Pemilihan Umum (KPU) RI telah menetapkan nomor urut terhadap tiga calon presiden dan wakil presiden di Pilpres 2024. #BersamaIndonesiaMaju #PrabowoGibranIstimewa #PrabowoGemoy Prabowo Subianto\n",
604 | "[RESULT - Token Usage: 279]\n",
605 | "named_entity_recognition: [\"Komisi Pemilihan Umum (KPU) RI\", \"Prabowo Subianto\"]\n",
606 | "aspect_sentiment: [\"menetapkan nomor urut (positive)\", \"tiga calon presiden dan wakil presiden (neutral)\"]\n"
607 | ]
608 | }
609 | ],
610 | "source": [
611 | "# Test ingestion\n",
612 | "comment = data['full_text'].sample(1).values[0]\n",
613 | "extraction, token_usage = ingest_openai(tweet_comment = comment)\n",
614 | "print(f\"[COMMENT]\\n{comment}\\n[RESULT - Token Usage: {token_usage}]\\n{extraction}\")"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 14,
620 | "id": "36092a07",
621 | "metadata": {},
622 | "outputs": [],
623 | "source": [
624 | "# Apply on entire dataset\n",
625 | "final_result_extraction, final_token_usage = [], []"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 15,
631 | "id": "7072d5e2",
632 | "metadata": {},
633 | "outputs": [
634 | {
635 | "name": "stderr",
636 | "output_type": "stream",
637 | "text": [
638 | "Ingestion Start: 1%|▋ | 19/1564 [07:40<6:43:15, 15.66s/it]"
639 | ]
640 | },
641 | {
642 | "name": "stdout",
643 | "output_type": "stream",
644 | "text": [
645 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
646 | "Retry with Recursive Func\n"
647 | ]
648 | },
649 | {
650 | "name": "stderr",
651 | "output_type": "stream",
652 | "text": [
653 | "Ingestion Start: 2%|█▍ | 36/1564 [29:06<18:26:39, 43.46s/it]"
654 | ]
655 | },
656 | {
657 | "name": "stdout",
658 | "output_type": "stream",
659 | "text": [
660 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
661 | "Retry with Recursive Func\n"
662 | ]
663 | },
664 | {
665 | "name": "stderr",
666 | "output_type": "stream",
667 | "text": [
668 | "Ingestion Start: 4%|██▍ | 63/1564 [44:38<6:37:56, 15.91s/it]"
669 | ]
670 | },
671 | {
672 | "name": "stdout",
673 | "output_type": "stream",
674 | "text": [
675 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
676 | "Retry with Recursive Func\n"
677 | ]
678 | },
679 | {
680 | "name": "stderr",
681 | "output_type": "stream",
682 | "text": [
683 | "Ingestion Start: 5%|██▉ | 76/1564 [57:51<6:08:41, 14.87s/it]"
684 | ]
685 | },
686 | {
687 | "name": "stdout",
688 | "output_type": "stream",
689 | "text": [
690 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
691 | "Retry with Recursive Func\n"
692 | ]
693 | },
694 | {
695 | "name": "stderr",
696 | "output_type": "stream",
697 | "text": [
698 | "Ingestion Start: 7%|███▊ | 102/1564 [1:19:19<3:43:36, 9.18s/it]"
699 | ]
700 | },
701 | {
702 | "name": "stdout",
703 | "output_type": "stream",
704 | "text": [
705 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
706 | "Retry with Recursive Func\n",
707 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
708 | "Retry with Recursive Func\n"
709 | ]
710 | },
711 | {
712 | "name": "stderr",
713 | "output_type": "stream",
714 | "text": [
715 | "Ingestion Start: 8%|████▋ | 126/1564 [1:51:21<4:05:45, 10.25s/it]"
716 | ]
717 | },
718 | {
719 | "name": "stdout",
720 | "output_type": "stream",
721 | "text": [
722 | "[ERROR] - The server is overloaded or not ready yet.\n",
723 | "Retry with Recursive Func\n"
724 | ]
725 | },
726 | {
727 | "name": "stderr",
728 | "output_type": "stream",
729 | "text": [
730 | "Ingestion Start: 10%|█████▌ | 149/1564 [2:08:17<4:53:27, 12.44s/it]"
731 | ]
732 | },
733 | {
734 | "name": "stdout",
735 | "output_type": "stream",
736 | "text": [
737 | "[ERROR] - The server is overloaded or not ready yet.\n",
738 | "Retry with Recursive Func\n"
739 | ]
740 | },
741 | {
742 | "name": "stderr",
743 | "output_type": "stream",
744 | "text": [
745 | "Ingestion Start: 11%|██████▍ | 174/1564 [2:28:49<6:07:45, 15.87s/it]"
746 | ]
747 | },
748 | {
749 | "name": "stdout",
750 | "output_type": "stream",
751 | "text": [
752 | "[ERROR] - The server is overloaded or not ready yet.\n",
753 | "Retry with Recursive Func\n"
754 | ]
755 | },
756 | {
757 | "name": "stderr",
758 | "output_type": "stream",
759 | "text": [
760 | "Ingestion Start: 11%|██████▍ | 177/1564 [2:31:18<12:27:23, 32.33s/it]"
761 | ]
762 | },
763 | {
764 | "name": "stdout",
765 | "output_type": "stream",
766 | "text": [
767 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
768 | "Retry with Recursive Func\n"
769 | ]
770 | },
771 | {
772 | "name": "stderr",
773 | "output_type": "stream",
774 | "text": [
775 | "Ingestion Start: 12%|███████▏ | 195/1564 [2:46:45<8:09:27, 21.45s/it]"
776 | ]
777 | },
778 | {
779 | "name": "stdout",
780 | "output_type": "stream",
781 | "text": [
782 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
783 | "Retry with Recursive Func\n"
784 | ]
785 | },
786 | {
787 | "name": "stderr",
788 | "output_type": "stream",
789 | "text": [
790 | "Ingestion Start: 13%|███████▎ | 199/1564 [2:57:21<27:32:33, 72.64s/it]"
791 | ]
792 | },
793 | {
794 | "name": "stdout",
795 | "output_type": "stream",
796 | "text": [
797 | "[ERROR] - The server is overloaded or not ready yet.\n",
798 | "Retry with Recursive Func\n"
799 | ]
800 | },
801 | {
802 | "name": "stderr",
803 | "output_type": "stream",
804 | "text": [
805 | "Ingestion Start: 13%|███████▎ | 202/1564 [2:58:22<14:36:16, 38.60s/it]"
806 | ]
807 | },
808 | {
809 | "name": "stdout",
810 | "output_type": "stream",
811 | "text": [
812 | "[ERROR] - The server is overloaded or not ready yet.\n",
813 | "Retry with Recursive Func\n"
814 | ]
815 | },
816 | {
817 | "name": "stderr",
818 | "output_type": "stream",
819 | "text": [
820 | "Ingestion Start: 13%|███████▍ | 205/1564 [3:00:05<13:06:18, 34.72s/it]"
821 | ]
822 | },
823 | {
824 | "name": "stdout",
825 | "output_type": "stream",
826 | "text": [
827 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
828 | "Retry with Recursive Func\n",
829 | "[ERROR] - The server is overloaded or not ready yet.\n",
830 | "Retry with Recursive Func\n",
831 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
832 | "Retry with Recursive Func\n",
833 | "[ERROR] - The server is overloaded or not ready yet.\n",
834 | "Retry with Recursive Func\n"
835 | ]
836 | },
837 | {
838 | "name": "stderr",
839 | "output_type": "stream",
840 | "text": [
841 | "Ingestion Start: 14%|████████ | 218/1564 [3:35:01<6:53:09, 18.42s/it]"
842 | ]
843 | },
844 | {
845 | "name": "stdout",
846 | "output_type": "stream",
847 | "text": [
848 | "[ERROR] - The server is overloaded or not ready yet.\n",
849 | "Retry with Recursive Func\n"
850 | ]
851 | },
852 | {
853 | "name": "stderr",
854 | "output_type": "stream",
855 | "text": [
856 | "Ingestion Start: 15%|████████▌ | 230/1564 [3:43:20<5:50:06, 15.75s/it]"
857 | ]
858 | },
859 | {
860 | "name": "stdout",
861 | "output_type": "stream",
862 | "text": [
863 | "[ERROR] - The server is overloaded or not ready yet.\n",
864 | "Retry with Recursive Func\n"
865 | ]
866 | },
867 | {
868 | "name": "stderr",
869 | "output_type": "stream",
870 | "text": [
871 | "Ingestion Start: 16%|█████████▎ | 252/1564 [3:53:26<6:15:50, 17.19s/it]"
872 | ]
873 | },
874 | {
875 | "name": "stdout",
876 | "output_type": "stream",
877 | "text": [
878 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
879 | "Retry with Recursive Func\n"
880 | ]
881 | },
882 | {
883 | "name": "stderr",
884 | "output_type": "stream",
885 | "text": [
886 | "Ingestion Start: 17%|█████████▋ | 265/1564 [4:09:32<28:04:03, 77.79s/it]"
887 | ]
888 | },
889 | {
890 | "name": "stdout",
891 | "output_type": "stream",
892 | "text": [
893 | "[ERROR] - The server is overloaded or not ready yet.\n",
894 | "Retry with Recursive Func\n"
895 | ]
896 | },
897 | {
898 | "name": "stderr",
899 | "output_type": "stream",
900 | "text": [
901 | "Ingestion Start: 17%|█████████▊ | 269/1564 [4:17:01<24:25:14, 67.89s/it]"
902 | ]
903 | },
904 | {
905 | "name": "stdout",
906 | "output_type": "stream",
907 | "text": [
908 | "[ERROR] - The server is overloaded or not ready yet.\n",
909 | "Retry with Recursive Func\n",
910 | "[ERROR] - The server is overloaded or not ready yet.\n",
911 | "Retry with Recursive Func\n",
912 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
913 | "Retry with Recursive Func\n"
914 | ]
915 | },
916 | {
917 | "name": "stderr",
918 | "output_type": "stream",
919 | "text": [
920 | "Ingestion Start: 20%|███████████▎ | 305/1564 [4:38:32<4:02:15, 11.54s/it]"
921 | ]
922 | },
923 | {
924 | "name": "stdout",
925 | "output_type": "stream",
926 | "text": [
927 | "[ERROR] - The server is overloaded or not ready yet.\n",
928 | "Retry with Recursive Func\n"
929 | ]
930 | },
931 | {
932 | "name": "stderr",
933 | "output_type": "stream",
934 | "text": [
935 | "Ingestion Start: 20%|███████████▌ | 311/1564 [4:41:13<4:17:58, 12.35s/it]"
936 | ]
937 | },
938 | {
939 | "name": "stdout",
940 | "output_type": "stream",
941 | "text": [
942 | "[ERROR] - The server is overloaded or not ready yet.\n",
943 | "Retry with Recursive Func\n"
944 | ]
945 | },
946 | {
947 | "name": "stderr",
948 | "output_type": "stream",
949 | "text": [
950 | "Ingestion Start: 20%|███████████▍ | 313/1564 [4:47:41<30:43:09, 88.40s/it]"
951 | ]
952 | },
953 | {
954 | "name": "stdout",
955 | "output_type": "stream",
956 | "text": [
957 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
958 | "Retry with Recursive Func\n"
959 | ]
960 | },
961 | {
962 | "name": "stderr",
963 | "output_type": "stream",
964 | "text": [
965 | "Ingestion Start: 21%|████████████ | 326/1564 [4:59:41<4:37:20, 13.44s/it]"
966 | ]
967 | },
968 | {
969 | "name": "stdout",
970 | "output_type": "stream",
971 | "text": [
972 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
973 | "Retry with Recursive Func\n"
974 | ]
975 | },
976 | {
977 | "name": "stderr",
978 | "output_type": "stream",
979 | "text": [
980 | "Ingestion Start: 22%|████████████▊ | 345/1564 [5:13:19<3:59:37, 11.79s/it]"
981 | ]
982 | },
983 | {
984 | "name": "stdout",
985 | "output_type": "stream",
986 | "text": [
987 | "[ERROR] - The server is overloaded or not ready yet.\n",
988 | "Retry with Recursive Func\n"
989 | ]
990 | },
991 | {
992 | "name": "stderr",
993 | "output_type": "stream",
994 | "text": [
995 | "Ingestion Start: 23%|█████████████▍ | 362/1564 [5:17:50<3:48:55, 11.43s/it]"
996 | ]
997 | },
998 | {
999 | "name": "stdout",
1000 | "output_type": "stream",
1001 | "text": [
1002 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1003 | "Retry with Recursive Func\n"
1004 | ]
1005 | },
1006 | {
1007 | "name": "stderr",
1008 | "output_type": "stream",
1009 | "text": [
1010 | "\r",
1011 | "Ingestion Start: 23%|████████████▉ | 363/1564 [5:28:02<63:54:04, 191.54s/it]"
1012 | ]
1013 | },
1014 | {
1015 | "name": "stdout",
1016 | "output_type": "stream",
1017 | "text": [
1018 | "[ERROR] - The server is overloaded or not ready yet.\n",
1019 | "Retry with Recursive Func\n"
1020 | ]
1021 | },
1022 | {
1023 | "name": "stderr",
1024 | "output_type": "stream",
1025 | "text": [
1026 | "Ingestion Start: 23%|█████████████▏ | 367/1564 [5:40:14<67:54:21, 204.23s/it]"
1027 | ]
1028 | },
1029 | {
1030 | "name": "stdout",
1031 | "output_type": "stream",
1032 | "text": [
1033 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1034 | "Retry with Recursive Func\n"
1035 | ]
1036 | },
1037 | {
1038 | "name": "stderr",
1039 | "output_type": "stream",
1040 | "text": [
1041 | "Ingestion Start: 24%|█████████████▌ | 372/1564 [5:51:50<30:17:52, 91.50s/it]"
1042 | ]
1043 | },
1044 | {
1045 | "name": "stdout",
1046 | "output_type": "stream",
1047 | "text": [
1048 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1049 | "Retry with Recursive Func\n"
1050 | ]
1051 | },
1052 | {
1053 | "name": "stderr",
1054 | "output_type": "stream",
1055 | "text": [
1056 | "Ingestion Start: 25%|██████████████▋ | 395/1564 [6:11:23<5:00:03, 15.40s/it]"
1057 | ]
1058 | },
1059 | {
1060 | "name": "stdout",
1061 | "output_type": "stream",
1062 | "text": [
1063 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1064 | "Retry with Recursive Func\n"
1065 | ]
1066 | },
1067 | {
1068 | "name": "stderr",
1069 | "output_type": "stream",
1070 | "text": [
1071 | "Ingestion Start: 26%|██████████████▌ | 400/1564 [6:22:11<16:52:25, 52.19s/it]"
1072 | ]
1073 | },
1074 | {
1075 | "name": "stdout",
1076 | "output_type": "stream",
1077 | "text": [
1078 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1079 | "Retry with Recursive Func\n"
1080 | ]
1081 | },
1082 | {
1083 | "name": "stderr",
1084 | "output_type": "stream",
1085 | "text": [
1086 | "\r",
1087 | "Ingestion Start: 26%|██████████████▎ | 401/1564 [6:32:22<71:00:11, 219.79s/it]"
1088 | ]
1089 | },
1090 | {
1091 | "name": "stdout",
1092 | "output_type": "stream",
1093 | "text": [
1094 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1095 | "Retry with Recursive Func\n"
1096 | ]
1097 | },
1098 | {
1099 | "name": "stderr",
1100 | "output_type": "stream",
1101 | "text": [
1102 | "Ingestion Start: 28%|████████████████▏ | 438/1564 [6:47:18<2:08:55, 6.87s/it]"
1103 | ]
1104 | },
1105 | {
1106 | "name": "stdout",
1107 | "output_type": "stream",
1108 | "text": [
1109 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1110 | "Retry with Recursive Func\n"
1111 | ]
1112 | },
1113 | {
1114 | "name": "stderr",
1115 | "output_type": "stream",
1116 | "text": [
1117 | "Ingestion Start: 29%|████████████████▌ | 447/1564 [6:58:12<4:59:52, 16.11s/it]"
1118 | ]
1119 | },
1120 | {
1121 | "name": "stdout",
1122 | "output_type": "stream",
1123 | "text": [
1124 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1125 | "Retry with Recursive Func\n"
1126 | ]
1127 | },
1128 | {
1129 | "name": "stderr",
1130 | "output_type": "stream",
1131 | "text": [
1132 | "Ingestion Start: 31%|█████████████████▉ | 484/1564 [7:12:20<2:08:16, 7.13s/it]"
1133 | ]
1134 | },
1135 | {
1136 | "name": "stdout",
1137 | "output_type": "stream",
1138 | "text": [
1139 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1140 | "Retry with Recursive Func\n"
1141 | ]
1142 | },
1143 | {
1144 | "name": "stderr",
1145 | "output_type": "stream",
1146 | "text": [
1147 | "Ingestion Start: 38%|██████████████████████▎ | 601/1564 [7:34:58<2:50:47, 10.64s/it]"
1148 | ]
1149 | },
1150 | {
1151 | "name": "stdout",
1152 | "output_type": "stream",
1153 | "text": [
1154 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1155 | "Retry with Recursive Func\n"
1156 | ]
1157 | },
1158 | {
1159 | "name": "stderr",
1160 | "output_type": "stream",
1161 | "text": [
1162 | "Ingestion Start: 39%|██████████████████████ | 606/1564 [7:45:28<13:17:59, 49.98s/it]"
1163 | ]
1164 | },
1165 | {
1166 | "name": "stdout",
1167 | "output_type": "stream",
1168 | "text": [
1169 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1170 | "Retry with Recursive Func\n"
1171 | ]
1172 | },
1173 | {
1174 | "name": "stderr",
1175 | "output_type": "stream",
1176 | "text": [
1177 | "Ingestion Start: 39%|█████████████████████▊ | 609/1564 [7:55:46<28:57:12, 109.14s/it]"
1178 | ]
1179 | },
1180 | {
1181 | "name": "stdout",
1182 | "output_type": "stream",
1183 | "text": [
1184 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1185 | "Retry with Recursive Func\n"
1186 | ]
1187 | },
1188 | {
1189 | "name": "stderr",
1190 | "output_type": "stream",
1191 | "text": [
1192 | "Ingestion Start: 40%|███████████████████████ | 623/1564 [8:15:31<3:25:04, 13.08s/it]"
1193 | ]
1194 | },
1195 | {
1196 | "name": "stdout",
1197 | "output_type": "stream",
1198 | "text": [
1199 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1200 | "Retry with Recursive Func\n"
1201 | ]
1202 | },
1203 | {
1204 | "name": "stderr",
1205 | "output_type": "stream",
1206 | "text": [
1207 | "Ingestion Start: 42%|████████████████████████▌ | 662/1564 [8:38:18<1:34:09, 6.26s/it]"
1208 | ]
1209 | },
1210 | {
1211 | "name": "stdout",
1212 | "output_type": "stream",
1213 | "text": [
1214 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1215 | "Retry with Recursive Func\n"
1216 | ]
1217 | },
1218 | {
1219 | "name": "stderr",
1220 | "output_type": "stream",
1221 | "text": [
1222 | "Ingestion Start: 45%|██████████████████████████▎ | 710/1564 [9:00:09<3:27:47, 14.60s/it]"
1223 | ]
1224 | },
1225 | {
1226 | "name": "stdout",
1227 | "output_type": "stream",
1228 | "text": [
1229 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1230 | "Retry with Recursive Func\n"
1231 | ]
1232 | },
1233 | {
1234 | "name": "stderr",
1235 | "output_type": "stream",
1236 | "text": [
1237 | "Ingestion Start: 48%|███████████████████████████▋ | 746/1564 [9:41:35<9:45:53, 42.97s/it]"
1238 | ]
1239 | },
1240 | {
1241 | "name": "stdout",
1242 | "output_type": "stream",
1243 | "text": [
1244 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1245 | "Retry with Recursive Func\n"
1246 | ]
1247 | },
1248 | {
1249 | "name": "stderr",
1250 | "output_type": "stream",
1251 | "text": [
1252 | "Ingestion Start: 48%|███████████████████████████▉ | 753/1564 [9:52:47<8:15:51, 36.69s/it]"
1253 | ]
1254 | },
1255 | {
1256 | "name": "stdout",
1257 | "output_type": "stream",
1258 | "text": [
1259 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1260 | "Retry with Recursive Func\n"
1261 | ]
1262 | },
1263 | {
1264 | "name": "stderr",
1265 | "output_type": "stream",
1266 | "text": [
1267 | "Ingestion Start: 52%|█████████████████████████████▌ | 811/1564 [10:20:21<2:18:22, 11.03s/it]"
1268 | ]
1269 | },
1270 | {
1271 | "name": "stdout",
1272 | "output_type": "stream",
1273 | "text": [
1274 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1275 | "Retry with Recursive Func\n"
1276 | ]
1277 | },
1278 | {
1279 | "name": "stderr",
1280 | "output_type": "stream",
1281 | "text": [
1282 | "Ingestion Start: 57%|████████████████████████████████▍ | 889/1564 [10:55:13<1:22:39, 7.35s/it]"
1283 | ]
1284 | },
1285 | {
1286 | "name": "stdout",
1287 | "output_type": "stream",
1288 | "text": [
1289 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1290 | "Retry with Recursive Func\n"
1291 | ]
1292 | },
1293 | {
1294 | "name": "stderr",
1295 | "output_type": "stream",
1296 | "text": [
1297 | "Ingestion Start: 57%|████████████████████████████████▌ | 895/1564 [11:06:15<7:19:04, 39.38s/it]"
1298 | ]
1299 | },
1300 | {
1301 | "name": "stdout",
1302 | "output_type": "stream",
1303 | "text": [
1304 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1305 | "Retry with Recursive Func\n"
1306 | ]
1307 | },
1308 | {
1309 | "name": "stderr",
1310 | "output_type": "stream",
1311 | "text": [
1312 | "Ingestion Start: 62%|███████████████████████████████████▏ | 965/1564 [11:32:20<1:40:08, 10.03s/it]"
1313 | ]
1314 | },
1315 | {
1316 | "name": "stdout",
1317 | "output_type": "stream",
1318 | "text": [
1319 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1320 | "Retry with Recursive Func\n"
1321 | ]
1322 | },
1323 | {
1324 | "name": "stderr",
1325 | "output_type": "stream",
1326 | "text": [
1327 | "Ingestion Start: 62%|███████████████████████████████████▍ | 972/1564 [11:42:57<4:19:43, 26.32s/it]"
1328 | ]
1329 | },
1330 | {
1331 | "name": "stdout",
1332 | "output_type": "stream",
1333 | "text": [
1334 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1335 | "Retry with Recursive Func\n"
1336 | ]
1337 | },
1338 | {
1339 | "name": "stderr",
1340 | "output_type": "stream",
1341 | "text": [
1342 | "Ingestion Start: 64%|███████████████████████████████████▋ | 998/1564 [12:13:14<12:00:17, 76.36s/it]"
1343 | ]
1344 | },
1345 | {
1346 | "name": "stdout",
1347 | "output_type": "stream",
1348 | "text": [
1349 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1350 | "Retry with Recursive Func\n"
1351 | ]
1352 | },
1353 | {
1354 | "name": "stderr",
1355 | "output_type": "stream",
1356 | "text": [
1357 | "Ingestion Start: 64%|██████████████████████████████████▌ | 1000/1564 [12:23:35<26:26:48, 168.81s/it]"
1358 | ]
1359 | },
1360 | {
1361 | "name": "stdout",
1362 | "output_type": "stream",
1363 | "text": [
1364 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1365 | "Retry with Recursive Func\n"
1366 | ]
1367 | },
1368 | {
1369 | "name": "stderr",
1370 | "output_type": "stream",
1371 | "text": [
1372 | "Ingestion Start: 65%|████████████████████████████████████▍ | 1019/1564 [12:36:38<1:47:35, 11.84s/it]"
1373 | ]
1374 | },
1375 | {
1376 | "name": "stdout",
1377 | "output_type": "stream",
1378 | "text": [
1379 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1380 | "Retry with Recursive Func\n"
1381 | ]
1382 | },
1383 | {
1384 | "name": "stderr",
1385 | "output_type": "stream",
1386 | "text": [
1387 | "Ingestion Start: 65%|████████████████████████████████████▋ | 1024/1564 [12:47:09<7:29:42, 49.97s/it]"
1388 | ]
1389 | },
1390 | {
1391 | "name": "stdout",
1392 | "output_type": "stream",
1393 | "text": [
1394 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1395 | "Retry with Recursive Func\n"
1396 | ]
1397 | },
1398 | {
1399 | "name": "stderr",
1400 | "output_type": "stream",
1401 | "text": [
1402 | "Ingestion Start: 68%|██████████████████████████████████████▎ | 1071/1564 [13:28:24<2:33:12, 18.65s/it]"
1403 | ]
1404 | },
1405 | {
1406 | "name": "stdout",
1407 | "output_type": "stream",
1408 | "text": [
1409 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1410 | "Retry with Recursive Func\n"
1411 | ]
1412 | },
1413 | {
1414 | "name": "stderr",
1415 | "output_type": "stream",
1416 | "text": [
1417 | "Ingestion Start: 69%|██████████████████████████████████████▋ | 1079/1564 [13:39:21<3:10:21, 23.55s/it]"
1418 | ]
1419 | },
1420 | {
1421 | "name": "stdout",
1422 | "output_type": "stream",
1423 | "text": [
1424 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1425 | "Retry with Recursive Func\n"
1426 | ]
1427 | },
1428 | {
1429 | "name": "stderr",
1430 | "output_type": "stream",
1431 | "text": [
1432 | "Ingestion Start: 71%|███████████████████████████████████████▌ | 1104/1564 [13:56:41<1:19:00, 10.31s/it]"
1433 | ]
1434 | },
1435 | {
1436 | "name": "stdout",
1437 | "output_type": "stream",
1438 | "text": [
1439 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1440 | "Retry with Recursive Func\n"
1441 | ]
1442 | },
1443 | {
1444 | "name": "stderr",
1445 | "output_type": "stream",
1446 | "text": [
1447 | "Ingestion Start: 72%|████████████████████████████████████████▍ | 1128/1564 [14:11:15<1:14:02, 10.19s/it]"
1448 | ]
1449 | },
1450 | {
1451 | "name": "stdout",
1452 | "output_type": "stream",
1453 | "text": [
1454 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1455 | "Retry with Recursive Func\n"
1456 | ]
1457 | },
1458 | {
1459 | "name": "stderr",
1460 | "output_type": "stream",
1461 | "text": [
1462 | "Ingestion Start: 73%|████████████████████████████████████████▊ | 1139/1564 [14:22:51<1:52:28, 15.88s/it]"
1463 | ]
1464 | },
1465 | {
1466 | "name": "stdout",
1467 | "output_type": "stream",
1468 | "text": [
1469 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1470 | "Retry with Recursive Func\n"
1471 | ]
1472 | },
1473 | {
1474 | "name": "stderr",
1475 | "output_type": "stream",
1476 | "text": [
1477 | "Ingestion Start: 74%|█████████████████████████████████████████▍ | 1158/1564 [14:36:00<1:24:59, 12.56s/it]"
1478 | ]
1479 | },
1480 | {
1481 | "name": "stdout",
1482 | "output_type": "stream",
1483 | "text": [
1484 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1485 | "Retry with Recursive Func\n"
1486 | ]
1487 | },
1488 | {
1489 | "name": "stderr",
1490 | "output_type": "stream",
1491 | "text": [
1492 | "Ingestion Start: 78%|█████████████████████████████████████████████▍ | 1226/1564 [14:55:12<40:19, 7.16s/it]"
1493 | ]
1494 | },
1495 | {
1496 | "name": "stdout",
1497 | "output_type": "stream",
1498 | "text": [
1499 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1500 | "Retry with Recursive Func\n"
1501 | ]
1502 | },
1503 | {
1504 | "name": "stderr",
1505 | "output_type": "stream",
1506 | "text": [
1507 | "Ingestion Start: 81%|██████████████████████████████████████████████▉ | 1265/1564 [15:09:59<38:10, 7.66s/it]"
1508 | ]
1509 | },
1510 | {
1511 | "name": "stdout",
1512 | "output_type": "stream",
1513 | "text": [
1514 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1515 | "Retry with Recursive Func\n"
1516 | ]
1517 | },
1518 | {
1519 | "name": "stderr",
1520 | "output_type": "stream",
1521 | "text": [
1522 | "Ingestion Start: 81%|█████████████████████████████████████████████▌ | 1274/1564 [15:21:10<1:29:41, 18.56s/it]"
1523 | ]
1524 | },
1525 | {
1526 | "name": "stdout",
1527 | "output_type": "stream",
1528 | "text": [
1529 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1530 | "Retry with Recursive Func\n"
1531 | ]
1532 | },
1533 | {
1534 | "name": "stderr",
1535 | "output_type": "stream",
1536 | "text": [
1537 | "Ingestion Start: 83%|███████████████████████████████████████████████▉ | 1291/1564 [15:33:34<34:34, 7.60s/it]"
1538 | ]
1539 | },
1540 | {
1541 | "name": "stdout",
1542 | "output_type": "stream",
1543 | "text": [
1544 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1545 | "Retry with Recursive Func\n"
1546 | ]
1547 | },
1548 | {
1549 | "name": "stderr",
1550 | "output_type": "stream",
1551 | "text": [
1552 | "Ingestion Start: 84%|████████████████████████████████████████████████▍ | 1306/1564 [15:45:26<24:32, 5.71s/it]"
1553 | ]
1554 | },
1555 | {
1556 | "name": "stdout",
1557 | "output_type": "stream",
1558 | "text": [
1559 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1560 | "Retry with Recursive Func\n"
1561 | ]
1562 | },
1563 | {
1564 | "name": "stderr",
1565 | "output_type": "stream",
1566 | "text": [
1567 | "Ingestion Start: 87%|██████████████████████████████████████████████████▏ | 1354/1564 [16:02:15<32:11, 9.20s/it]"
1568 | ]
1569 | },
1570 | {
1571 | "name": "stdout",
1572 | "output_type": "stream",
1573 | "text": [
1574 | "[ERROR] - HTTP code 502 from API (\r\n",
1575 | "502 Bad Gateway\r\n",
1576 | "\r\n",
1577 | "502 Bad Gateway
\r\n",
1578 | "
cloudflare\r\n",
1579 | "\r\n",
1580 | "\r\n",
1581 | ")\n",
1582 | "Retry with Recursive Func\n"
1583 | ]
1584 | },
1585 | {
1586 | "name": "stderr",
1587 | "output_type": "stream",
1588 | "text": [
1589 | "\r",
1590 | "Ingestion Start: 87%|████████████████████████████████████████████████▌ | 1355/1564 [16:03:45<1:56:09, 33.35s/it]"
1591 | ]
1592 | },
1593 | {
1594 | "name": "stdout",
1595 | "output_type": "stream",
1596 | "text": [
1597 | "[ERROR] - HTTP code 502 from API (\r\n",
1598 | "502 Bad Gateway\r\n",
1599 | "\r\n",
1600 | "502 Bad Gateway
\r\n",
1601 | "
cloudflare\r\n",
1602 | "\r\n",
1603 | "\r\n",
1604 | ")\n",
1605 | "Retry with Recursive Func\n"
1606 | ]
1607 | },
1608 | {
1609 | "name": "stderr",
1610 | "output_type": "stream",
1611 | "text": [
1612 | "Ingestion Start: 87%|██████████████████████████████████████████████████▍ | 1360/1564 [16:04:37<43:57, 12.93s/it]"
1613 | ]
1614 | },
1615 | {
1616 | "name": "stdout",
1617 | "output_type": "stream",
1618 | "text": [
1619 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1620 | "Retry with Recursive Func\n"
1621 | ]
1622 | },
1623 | {
1624 | "name": "stderr",
1625 | "output_type": "stream",
1626 | "text": [
1627 | "\r",
1628 | "Ingestion Start: 87%|██████████████████████████████████████████████▉ | 1361/1564 [16:14:45<10:46:56, 191.21s/it]"
1629 | ]
1630 | },
1631 | {
1632 | "name": "stdout",
1633 | "output_type": "stream",
1634 | "text": [
1635 | "[ERROR] - HTTP code 502 from API (\r\n",
1636 | "502 Bad Gateway\r\n",
1637 | "\r\n",
1638 | "502 Bad Gateway
\r\n",
1639 | "
cloudflare\r\n",
1640 | "\r\n",
1641 | "\r\n",
1642 | ")\n",
1643 | "Retry with Recursive Func\n"
1644 | ]
1645 | },
1646 | {
1647 | "name": "stderr",
1648 | "output_type": "stream",
1649 | "text": [
1650 | "Ingestion Start: 94%|██████████████████████████████████████████████████████▋ | 1473/1564 [16:30:36<10:48, 7.13s/it]"
1651 | ]
1652 | },
1653 | {
1654 | "name": "stdout",
1655 | "output_type": "stream",
1656 | "text": [
1657 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1658 | "Retry with Recursive Func\n"
1659 | ]
1660 | },
1661 | {
1662 | "name": "stderr",
1663 | "output_type": "stream",
1664 | "text": [
1665 | "Ingestion Start: 95%|████████████████████████████████████████████████████▉ | 1478/1564 [16:41:08<1:10:51, 49.44s/it]"
1666 | ]
1667 | },
1668 | {
1669 | "name": "stdout",
1670 | "output_type": "stream",
1671 | "text": [
1672 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1673 | "Retry with Recursive Func\n"
1674 | ]
1675 | },
1676 | {
1677 | "name": "stderr",
1678 | "output_type": "stream",
1679 | "text": [
1680 | "Ingestion Start: 95%|█████████████████████████████████████████████████████▏ | 1486/1564 [16:55:33<1:06:36, 51.23s/it]"
1681 | ]
1682 | },
1683 | {
1684 | "name": "stdout",
1685 | "output_type": "stream",
1686 | "text": [
1687 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1688 | "Retry with Recursive Func\n"
1689 | ]
1690 | },
1691 | {
1692 | "name": "stderr",
1693 | "output_type": "stream",
1694 | "text": [
1695 | "Ingestion Start: 98%|████████████████████████████████████████████████████████▋ | 1530/1564 [17:19:09<12:27, 21.99s/it]"
1696 | ]
1697 | },
1698 | {
1699 | "name": "stdout",
1700 | "output_type": "stream",
1701 | "text": [
1702 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1703 | "Retry with Recursive Func\n"
1704 | ]
1705 | },
1706 | {
1707 | "name": "stderr",
1708 | "output_type": "stream",
1709 | "text": [
1710 | "Ingestion Start: 99%|█████████████████████████████████████████████████████████▎| 1547/1564 [17:31:34<02:05, 7.38s/it]"
1711 | ]
1712 | },
1713 | {
1714 | "name": "stdout",
1715 | "output_type": "stream",
1716 | "text": [
1717 | "[ERROR] - Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)\n",
1718 | "Retry with Recursive Func\n"
1719 | ]
1720 | },
1721 | {
1722 | "name": "stderr",
1723 | "output_type": "stream",
1724 | "text": [
1725 | "Ingestion Start: 100%|██████████████████████████████████████████████████████████| 1564/1564 [17:44:08<00:00, 40.82s/it]\n"
1726 | ]
1727 | }
1728 | ],
1729 | "source": [
1730 |     "# Iterate over comments and append each extraction result into the arrays\n",
1731 | "for comment in tqdm(data[\"full_text\"], desc = \"Ingestion Start\"):\n",
1732 | " result, token = ingest_openai(tweet_comment = comment)\n",
1733 | " final_result_extraction.append(result)\n",
1734 | " final_token_usage.append(token)"
1735 | ]
1736 | },
1737 | {
1738 | "cell_type": "code",
1739 | "execution_count": 16,
1740 | "id": "290b2686",
1741 | "metadata": {},
1742 | "outputs": [
1743 | {
1744 | "data": {
1745 | "text/plain": [
1746 | "(1564, 1564)"
1747 | ]
1748 | },
1749 | "execution_count": 16,
1750 | "metadata": {},
1751 | "output_type": "execute_result"
1752 | }
1753 | ],
1754 | "source": [
1755 | "len(final_result_extraction), len(final_token_usage)"
1756 | ]
1757 | },
1758 | {
1759 | "cell_type": "code",
1760 | "execution_count": 19,
1761 | "id": "06534b4e",
1762 | "metadata": {},
1763 | "outputs": [],
1764 | "source": [
1765 | "# Assign result into dataframe\n",
1766 | "data['result extraction'] = final_result_extraction\n",
1767 | "data['token usage'] = final_token_usage"
1768 | ]
1769 | },
1770 | {
1771 | "cell_type": "code",
1772 | "execution_count": 20,
1773 | "id": "14520371",
1774 | "metadata": {},
1775 | "outputs": [],
1776 | "source": [
1777 |     "# Save the enriched dataframe to a CSV file\n",
1778 | "data.to_csv(\"../dataset/data_twitter_pemilu_2024_enrich [V2].csv\", index = False)"
1779 | ]
1780 | }
1781 | ],
1782 | "metadata": {
1783 | "kernelspec": {
1784 | "display_name": "Python 3 (ipykernel)",
1785 | "language": "python",
1786 | "name": "python3"
1787 | },
1788 | "language_info": {
1789 | "codemirror_mode": {
1790 | "name": "ipython",
1791 | "version": 3
1792 | },
1793 | "file_extension": ".py",
1794 | "mimetype": "text/x-python",
1795 | "name": "python",
1796 | "nbconvert_exporter": "python",
1797 | "pygments_lexer": "ipython3",
1798 | "version": "3.9.7"
1799 | }
1800 | },
1801 | "nbformat": 4,
1802 | "nbformat_minor": 5
1803 | }
1804 |
--------------------------------------------------------------------------------