├── example.env ├── images ├── stargazer_workflow.png ├── gpt4all_topics_pie_chart.png ├── gpt4all_topics_wordcloud.png ├── cockroach_topics_pie_chart.png ├── cockroach_topics_wordcloud.png ├── gpt4all_countries_pie_chart.png ├── langchain_topics_pie_chart.png ├── langchain_topics_wordcloud.png ├── cockroach_countries_pie_chart.png ├── langchain_countries_pie_chart.png ├── gpt4all_starred_repos_pie_chart.png ├── cockroach_starred_repos_pie_chart.png └── langchain_starred_repos_pie_chart.png ├── requirements.txt ├── results └── sample_insights.csv ├── functions ├── github_stargazers.py ├── string_to_dataframe.py ├── webpage_text_extractor.py ├── chatgpt.py ├── chatgpt_batch.py └── github_user_details.py ├── .gitignore ├── visualize_results.py ├── stargazers.py ├── README.md └── LICENSE /example.env: -------------------------------------------------------------------------------- 1 | REPO_URL="https://github.com/georgia-tech-db/evadb" 2 | GITHUB_API="github_pat_..." 3 | OPENAI_KEY="sk-..." 4 | -------------------------------------------------------------------------------- /images/stargazer_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/stargazer_workflow.png -------------------------------------------------------------------------------- /images/gpt4all_topics_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/gpt4all_topics_pie_chart.png -------------------------------------------------------------------------------- /images/gpt4all_topics_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/gpt4all_topics_wordcloud.png -------------------------------------------------------------------------------- /images/cockroach_topics_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/cockroach_topics_pie_chart.png -------------------------------------------------------------------------------- /images/cockroach_topics_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/cockroach_topics_wordcloud.png -------------------------------------------------------------------------------- /images/gpt4all_countries_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/gpt4all_countries_pie_chart.png -------------------------------------------------------------------------------- /images/langchain_topics_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/langchain_topics_pie_chart.png -------------------------------------------------------------------------------- /images/langchain_topics_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/langchain_topics_wordcloud.png -------------------------------------------------------------------------------- 
/images/cockroach_countries_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/cockroach_countries_pie_chart.png -------------------------------------------------------------------------------- /images/langchain_countries_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/langchain_countries_pie_chart.png -------------------------------------------------------------------------------- /images/gpt4all_starred_repos_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/gpt4all_starred_repos_pie_chart.png -------------------------------------------------------------------------------- /images/cockroach_starred_repos_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/cockroach_starred_repos_pie_chart.png -------------------------------------------------------------------------------- /images/langchain_starred_repos_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/stargazers-reloaded/HEAD/images/langchain_starred_repos_pie_chart.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | evadb 2 | PyGithub 3 | selenium 4 | easyocr 5 | tqdm 6 | tiktoken 7 | openai 8 | python-dotenv==1.0.0 9 | matplotlib 10 | wordcloud -------------------------------------------------------------------------------- /results/sample_insights.csv: -------------------------------------------------------------------------------- 1 | langchain_stargazerinsightsgpt4._row_id,langchain_stargazerinsightsgpt4.name,langchain_stargazerinsightsgpt4.country,langchain_stargazerinsightsgpt4.city,langchain_stargazerinsightsgpt4.email,langchain_stargazerinsightsgpt4.occupation,langchain_stargazerinsightsgpt4.programming_languages,langchain_stargazerinsightsgpt4.social_media,langchain_stargazerinsightsgpt4.response 2 | 1,Alaettin Ayar,N/A,Gdansk,N/A,N/A,Python,N/A,"Web development, Machine Learning" 3 | 2,John Shahawy,N/A,Jacksonville,N/A,N/A,"Java, Python",N/A,N/A 4 | 3,David Song,United States,California,N/A,N/A,"JavaScript, Java",davidtsong.com,Web development 5 | 4,Martin Salo,United Kingdom,London,N/A,Co-Founder & CTO at yummyshop,Python,N/A,Machine Learning 6 | 5,Samantha Whitmore,N/A,N/A,N/A,N/A,Python,N/A,"Machine Learning, Web development" 7 | 6,achuthasubhash,India,Guntur,subhashachutha@gmail.com,N/A,Python,"https//www.linkedin.com/in/achutha-subhash2b29a167, https//www.twitter.com/AchuthaSubhash",Machine Learning 8 | 7,Agrover1 12,N/A,N/A,agroverl12@gmail.com,N/A,Python,N/A,Machine Learning 9 | 8,gemasphi,N/A,N/A,N/A,N/A,"Python, C++, JavaScript",N/A,Machine Learning 10 | 9,Robin Singh,N/A,N/A,N/A,N/A,"Python, JavaScript",N/A,Web development 11 | 10,ivmncs,United States,Brooklyn,N/A,N/A,"Python, Rust",N/A,Machine Learning -------------------------------------------------------------------------------- /functions/github_stargazers.py: -------------------------------------------------------------------------------- 1 | import 
pandas as pd 2 | from tqdm import tqdm 3 | from github import Github 4 | 5 | from evadb.catalog.catalog_type import NdArrayType, ColumnType 6 | from evadb.functions.abstract.abstract_function import AbstractFunction 7 | from evadb.functions.decorators.decorators import forward, setup 8 | from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe 9 | 10 | 11 | class GithubStargazers(AbstractFunction): 12 | """ 13 | Arguments: 14 | None 15 | 16 | Input Signatures: 17 | repo_url (str) : The URL of the GitHub repository to scrape stargazers from. 18 | github_token (str) : GitHub personal access token for authentication. 19 | 20 | Output Signatures: 21 | stargazers (str) : A list of GitHub usernames who have starred the repository. 22 | 23 | Example Usage: 24 | You can use this function to scrape stargazers of a GitHub repository as follows: 25 | 26 | repo_url = "https://github.com/owner/repo" 27 | github_token = "your_github_token" (Personal Access Token) 28 | cursor.function("GithubStargazers", repo_url, github_token) 29 | """ 30 | 31 | @property 32 | def name(self) -> str: 33 | return "GithubStargazers" 34 | 35 | @setup(cacheable=False, function_type="web-scraping") 36 | def setup(self) -> None: 37 | # Any setup or initialization can be done here if needed 38 | pass 39 | 40 | @forward( 41 | input_signatures=[ 42 | PandasDataframe( 43 | columns=["repo_url", "github_token"], 44 | column_types=[ColumnType.TEXT, ColumnType.TEXT], 45 | column_shapes=[(1,), (1,)], 46 | ) 47 | ], 48 | output_signatures=[ 49 | PandasDataframe( 50 | columns=["stargazers"], 51 | column_types=[NdArrayType.STR], 52 | column_shapes=[(None,)], 53 | ) 54 | ], 55 | ) 56 | def forward(self, input_df): 57 | # Ensure the URL is provided 58 | if input_df.empty or input_df.iloc[0, 0] is None: 59 | raise ValueError("Repository URL must be provided.") 60 | 61 | # Extract inputs from the DataFrame 62 | repo_url = input_df.iloc[0, 0] 63 | github_token = input_df.iloc[0, 1] 64 | 65 | # Initialize GitHub API client 66 | if github_token: 67 | github = Github(github_token) 68 | else: 69 | github = Github() 70 | 71 | try: 72 | # Parse the repository URL to extract owner and repo name 73 | parts = repo_url.strip("/").split("/") 74 | owner = parts[-2] 75 | repo_name = parts[-1] 76 | 77 | # Get the repository and its stargazers 78 | repository = github.get_repo(f"{owner}/{repo_name}") 79 | stargazers = [] 80 | stargazers = [stargazer.login for stargazer in repository.get_stargazers()[:1000]] 81 | 82 | except Exception as e: 83 | print(f"Error: {str(e)}") 84 | 85 | df = pd.DataFrame({"github_username": stargazers}) 86 | 87 | return df 88 | -------------------------------------------------------------------------------- /functions/string_to_dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from evadb.catalog.catalog_type import ColumnType 4 | from evadb.functions.abstract.abstract_function import AbstractFunction 5 | from evadb.functions.decorators.decorators import forward, setup 6 | from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe 7 | 8 | class StringToDataframe(AbstractFunction): 9 | """ 10 | Arguments: 11 | None 12 | 13 | Input Signatures: 14 | input_string (str) : A string containing structured data. 15 | 16 | Output Signatures: 17 | output_dataframe (DataFrame) : A DataFrame containing structured data.
18 | 19 | Example Usage: 20 | You can use this function to convert a structured string into a DataFrame. 21 | 22 | input_string = "Name: John\nAge: 30\nCountry: USA" 23 | """ 24 | 25 | @property 26 | def name(self) -> str: 27 | return "StringToDataframe" 28 | 29 | @setup(cacheable=False) 30 | def setup(self) -> None: 31 | # Any setup or initialization can be done here if needed 32 | pass 33 | 34 | @forward( 35 | input_signatures=[ 36 | PandasDataframe( 37 | columns=["extracted_text"], 38 | column_types=[ColumnType.TEXT], 39 | column_shapes=[(None,)], 40 | ) 41 | ], 42 | output_signatures=[ 43 | PandasDataframe( 44 | columns=["name", "country", "city", "email", 45 | "occupation", 46 | "programming_languages", 47 | "topics_of_interest", 48 | "social_media"], 49 | column_types=[ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT], 50 | column_shapes=[(None,), (None,), (None,), (None,), (None,), (None,), (None,), (None,)], 51 | ) 52 | ], 53 | ) 54 | def forward(self, input_df): 55 | # Ensure input is provided 56 | if input_df.empty or input_df.iloc[0] is None: 57 | raise ValueError("Input string must be provided.") 58 | 59 | # Initialize lists for columns 60 | keys_list = [] 61 | values_list = [] 62 | 63 | # Iterate over rows of the input DataFrame 64 | for _, row in input_df.iterrows(): 65 | response = row["response"] 66 | 67 | # Split the input string into lines 68 | lines = response.strip().split("\n") 69 | 70 | # Initialize lists for columns in this row 71 | keys = [] 72 | values = [] 73 | 74 | # Parse the lines and extract key-value pairs 75 | for line in lines: 76 | parts = line.split(":") 77 | if len(parts) == 2: 78 | key = parts[0].strip() 79 | value = parts[1].strip() 80 | keys.append(key) 81 | values.append(value) 82 | if len(parts) > 2: 83 | key = parts[0].strip() 84 | # rejoin the remaining parts with ":" so values such as URLs keep their colons 85 | value = [parts[i].strip() for i in range(1, len(parts))] 86 | value = ":".join(value) 87 | keys.append(key) 88 | values.append(value) 89 | 90 | keys_list.append(keys) 91 | values_list.append(values) 92 | 93 | # Create a DataFrame from the parsed data 94 | output_dataframe = pd.DataFrame(values_list, columns=keys_list[0]) 95 | 96 | return output_dataframe -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | *workspace 162 | 163 | *venv/ 164 | evadb_data 165 | **/*.json 166 | **/*.csv 167 | **/*.png -------------------------------------------------------------------------------- /functions/webpage_text_extractor.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.firefox.options import Options as FirefoxOptions 3 | from selenium.webdriver.common.by import By 4 | import concurrent.futures 5 | import pandas as pd 6 | import time 7 | from evadb.catalog.catalog_type import ColumnType 8 | from evadb.functions.abstract.abstract_function import AbstractFunction 9 | from evadb.functions.decorators.decorators import forward, setup 10 | from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe 11 | 12 | import easyocr 13 | from tqdm import tqdm 14 | 15 | 16 | reader = easyocr.Reader(["en"], gpu=True) 17 | 18 | 19 | def scrape_user_page(url): 20 | try: 21 | options = FirefoxOptions() 22 | options.add_argument("--headless") 23 | 24 | driver = webdriver.Firefox(options=options) 25 | 26 | driver.set_window_size(1920, 1080) 27 | # Open the GitHub user page 28 | driver.get(f"https://github.com/{url}") 29 | # driver.execute_script("document.body.style.zoom='120%'") 30 | 31 | # Capture the user profile section 32 | user_info_blocks = [] 33 | try: 34 | user_info_blocks.append(driver.find_element(By.CLASS_NAME, "h-card")) 35 | except: 36 | pass 37 | info_ids = ["user-profile-frame", "user-private-profile-frame"] 38 | for info_id in info_ids: 39 | try: 40 | user_info_blocks.append(driver.find_element(By.ID, info_id)) 41 | except: 42 | pass 43 | 44 | extracted_text = "" 45 | for info_block in user_info_blocks: 46 | screenshot = info_block.screenshot_as_png 47 | # with torch.cuda.device(gpu_id): 48 | result = reader.readtext(screenshot, detail=0) 49 | for i in result: 50 | extracted_text += i + " " 51 | 52 | return extracted_text 53 | 54 | except Exception as e: 55 | print(f"Error for {url}: {str(e)}") 56 | return str(e) 57 | finally: 58 | driver.quit() 59 | 60 | 61 | # Define a function to extract text from a set of URLs 62 | def extract_text_from_url(url): 63 | try: 64 | # Scrape user page using Selenium and EasyOCR 65 | extracted_text = scrape_user_page(url) 66 | except Exception as e: 67 | error_msg = f"Error extracting text from {url}: {str(e)}" 68 | print(error_msg) 69 | return error_msg 70 | 71 | return extracted_text 72 | 73 | 74 | class WebPageTextExtractor(AbstractFunction): 75 | """ 76 | Arguments: 77 | None 78 | 79 | Input Signatures: 80 | urls (list) : A list of URLs from which to extract text. 81 | 82 | Output Signatures: 83 | extracted_text (list) : A list of text extracted from the provided URLs. 
84 | 85 | Example Usage: 86 | You can use this function to extract text from a list of URLs like this: 87 | 88 | urls = ["https://example.com/page1", "https://example.com/page2"] 89 | """ 90 | 91 | @property 92 | def name(self) -> str: 93 | return "WebPageTextExtractor" 94 | 95 | @setup(cacheable=False, function_type="web-scraping") 96 | def setup(self) -> None: 97 | # Any setup or initialization can be done here if needed 98 | pass 99 | 100 | @forward( 101 | input_signatures=[ 102 | PandasDataframe( 103 | columns=["urls"], 104 | column_types=[ColumnType.TEXT], 105 | column_shapes=[(None,)], 106 | ) 107 | ], 108 | output_signatures=[ 109 | PandasDataframe( 110 | columns=["extracted_text"], 111 | column_types=[ColumnType.TEXT], 112 | column_shapes=[(None,)], 113 | ) 114 | ], 115 | ) 116 | def forward(self, input_df): 117 | # Ensure URLs are provided 118 | if input_df.empty or input_df.iloc[0] is None: 119 | raise ValueError("URLs must be provided.") 120 | 121 | print(input_df) 122 | 123 | # Extract URLs from the DataFrame 124 | urls = input_df["github_username"] 125 | 126 | # Use ThreadPoolExecutor for concurrent processing 127 | num_workers = 1 128 | # Note: CUDA errors in EasyOCR with more than 1 worker 129 | ## profiling 130 | # 1 worker: 218.00s 131 | # 4 workers: 147.44s 132 | # 8 workers: 134.55s 133 | # 12 workers: 149.89s 134 | 135 | num_urls = len(urls) 136 | 137 | print(f"Extracting text from {num_urls} URLs using {num_workers} workers") 138 | 139 | start = time.time() 140 | extracted_text_lists = [] 141 | # Use ThreadPoolExecutor for concurrent processing 142 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: 143 | # Submit tasks to extract text from each URL 144 | extracted_text_lists = list( 145 | tqdm(executor.map(extract_text_from_url, urls), total=num_urls) 146 | ) 147 | 148 | # Create a DataFrame from the extracted text 149 | extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists}) 150 | end = time.time() 151 | print("time taken: {:.2f}s".format(end - start)) 152 | return extracted_text_df 153 | -------------------------------------------------------------------------------- /visualize_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from dotenv import load_dotenv 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | plt.rcParams['font.family'] = 'sans-serif' 8 | plt.rcParams['font.sans-serif'] = 'DejaVu Sans Mono' 9 | 10 | from wordcloud import WordCloud 11 | 12 | 13 | if not load_dotenv(): 14 | print( 15 | "Could not load .env file or it is empty. Please check if it exists and is readable." 
16 | ) 17 | exit(1) 18 | 19 | # REPO DETAILS 20 | repo_url = os.environ.get('REPO_URL') 21 | # Parse the repository URL to extract owner and repo name 22 | parts = repo_url.strip("/").split("/") 23 | repo_name = parts[-1] 24 | 25 | 26 | def plot_pie_chart(data, title, output_path): 27 | 28 | fig, ax = plt.subplots(figsize=(12, 8)) 29 | 30 | # explode first slice if it is not Other 31 | explode = [0] * len(data) 32 | if data.index.tolist()[0] != "Other": 33 | explode[0] = 0.1 34 | 35 | colors = ["#005F73", "#AE2012", "#EE9B00", "#94D2BD"] 36 | colors_dict = {"Web development": colors[0], "Machine Learning": colors[1], "Databases": colors[2], "Other": colors[3]} 37 | colors = [colors_dict[x[0]] for x in data.index.tolist()] 38 | wedges, texts, autotexts = ax.pie(data, explode=explode, colors=colors, autopct=lambda pct: "{:.1f}%".format(pct), 39 | textprops=dict(color="w"), shadow=True, startangle=90) 40 | 41 | labels = [x[0] for x in data.index.tolist()] 42 | 43 | # if "Other is in the list, it will be placed at the end of the pie chart 44 | if "Other" in labels: 45 | order_indices = [labels.index(x) for x in labels if x != "Other"] 46 | other_index = labels.index("Other") 47 | order_indices.append(other_index) 48 | else: 49 | order_indices = list(range(len(labels))) 50 | 51 | ax.legend([wedges[x] for x in order_indices], [labels[x] for x in order_indices], 52 | loc="center left", 53 | fancybox=True, shadow=True, 54 | bbox_to_anchor=(1, 0, 0.5, 1), prop={"size": 18}) 55 | 56 | # if any of the percentages is less than 10%, change the font size to 10 57 | textsize = 20 58 | for autotext in autotexts: 59 | if float(autotext.get_text().strip("%")) < 10: 60 | textsize = 15 61 | break 62 | # if the percentage is less than 4%, don't show the percentage 63 | for autotext in autotexts: 64 | if float(autotext.get_text().strip("%")) < 4: 65 | autotext.set_visible(False) 66 | 67 | plt.setp(autotexts, size=textsize, weight="bold") 68 | 69 | ax.set_title(title, fontsize=20, weight="bold") 70 | 71 | # move title to the right 72 | ax.title.set_position([0.725, 0.5]) 73 | ax.axis('equal') 74 | plt.tight_layout() 75 | fig.savefig(output_path, bbox_inches='tight') 76 | 77 | 78 | def clean_topics_list(topics_list): 79 | # each row is a string of topics with various formats 80 | # extract the strings "Machine Learning", "Databases", and "Web Development" into a list 81 | # return the list 82 | output = [] 83 | for row in topics_list: 84 | output_row = [] 85 | if pd.isna(row): 86 | output_row.append("Other") 87 | output.append(output_row) 88 | continue 89 | if "Machine Learning".lower() in row.lower(): 90 | output_row.append("Machine Learning") 91 | if "Databases".lower() in row.lower(): 92 | output_row.append("Databases") 93 | if "Web development".lower() in row.lower(): 94 | output_row.append("Web development") 95 | if len(output_row) == 0: 96 | output_row.append("Other") 97 | output.append(output_row) 98 | 99 | return output 100 | 101 | 102 | if __name__ == '__main__': 103 | 104 | output_dir = "images" 105 | os.makedirs(output_dir, exist_ok=True) 106 | 107 | # 1. 
Visualize gpt-35 interests insights in a Word Cloud 108 | 109 | gpt35_insights_df = pd.read_csv(f"results/{repo_name}_insights_gpt35.csv", dtype=str) 110 | 111 | all_topics = gpt35_insights_df[f"{repo_name}_stargazerinsights.topics_of_interest"].tolist() 112 | all_topics = [x for x in all_topics if not pd.isna(x)] 113 | 114 | all_topics = pd.DataFrame(all_topics, columns=['Topics']) 115 | # Combine all topics into a single text 116 | all_topics_text = ', '.join(all_topics['Topics'].tolist()) 117 | 118 | # Generate the word cloud 119 | wordcloud = WordCloud(width=1920, height=1080, background_color='white').generate(all_topics_text) 120 | # Display the word cloud 121 | plt.figure(figsize=(12, 8)) 122 | plt.imshow(wordcloud, interpolation='bilinear') 123 | plt.title(f"Word Cloud of {repo_name} user interests", fontsize=20, weight="bold", loc="center", y=1.05) 124 | # move title to the right 125 | plt.axis('off') 126 | plt.tight_layout() 127 | plt.savefig(os.path.join(output_dir, f"{repo_name}_topics_wordcloud.png"), dpi=300) 128 | 129 | # 2. Visualize gpt-4 topic insights in a pie chart 130 | 131 | insights_df = pd.read_csv(f"results/{repo_name}_insights_gpt4.csv", dtype=str) 132 | topics_df = insights_df[f"{repo_name}_stargazerinsightsgpt4.response"].tolist() 133 | topics_list = clean_topics_list(topics_df) 134 | 135 | all_topics = [] 136 | for topics in topics_list: 137 | all_topics.extend(topics) 138 | 139 | all_topics = pd.DataFrame(all_topics, columns=['Topics']) 140 | 141 | # Count the occurrences of each topic 142 | topic_counts = all_topics.value_counts() 143 | plot_pie_chart(data=topic_counts, 144 | title=f"Topics of Interest Distribution for {repo_name} users", 145 | output_path=os.path.join(output_dir, f"{repo_name}_topics_pie_chart.png")) 146 | print("Results saved to images folder.") 147 | -------------------------------------------------------------------------------- /functions/chatgpt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018-2023 EvaDB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import os 18 | import time 19 | 20 | import pandas as pd 21 | from retry import retry 22 | 23 | from evadb.catalog.catalog_type import NdArrayType 24 | from evadb.functions.abstract.abstract_function import AbstractFunction 25 | from evadb.functions.decorators.decorators import forward, setup 26 | from evadb.functions.decorators.io_descriptors.data_types import ( 27 | PandasDataframe, 28 | ) 29 | from evadb.utils.generic_utils import try_to_import_openai 30 | from tqdm import tqdm 31 | 32 | 33 | _VALID_CHAT_COMPLETION_MODEL = [ 34 | "gpt-3.5-turbo", 35 | "gpt-3.5-turbo-16k", 36 | "gpt-4-0613", 37 | ] 38 | 39 | 40 | class ChatGPT(AbstractFunction): 41 | """ 42 | Arguments: 43 | model (str) : ID of the OpenAI model to use. Refer to '_VALID_CHAT_COMPLETION_MODEL' for a list of supported models. 44 | temperature (float) : Sampling temperature to use in the model. 
Higher value results in a more random output. 45 | 46 | Input Signatures: 47 | query (str) : The task / question that the user wants the model to accomplish / respond. 48 | content (str) : Any relevant context that the model can use to complete its tasks and generate the response. 49 | prompt (str) : An optional prompt that can be passed to the model. It can contain instructions to the model, 50 | or a set of examples to help the model generate a better response. 51 | If not provided, the system prompt defaults to that of an helpful assistant that accomplishes user tasks. 52 | 53 | Output Signatures: 54 | response (str) : Contains the response generated by the model based on user input. Any errors encountered 55 | will also be passed in the response. 56 | 57 | Example Usage: 58 | Assume we have the transcripts for a few videos stored in a table 'video_transcripts' in a column named 'text'. 59 | If the user wants to retrieve the summary of each video, the ChatGPT UDF can be used as: 60 | 61 | query = "Generate the summary of the video" 62 | cursor.table("video_transcripts").select(f"ChatGPT({question}, text)") 63 | 64 | In the above UDF invocation, the 'query' passed would be the user task to generate video summaries, and the 65 | 'content' passed would be the video transcripts that need to be used in order to generate the summary. Since 66 | no prompt is passed, the default system prompt will be used. 67 | 68 | Now assume the user wants to create the video summary in 50 words and in French. Instead of passing these instructions 69 | along with each query, a prompt can be set as such: 70 | 71 | prompt = "Generate your responses in 50 words or less. Also, generate the response in French." 72 | cursor.table("video_transcripts").select(f"ChatGPT({question}, text, {prompt})") 73 | 74 | In the above invocation, an additional argument is passed as prompt. While the query and content arguments remain 75 | the same, the 'prompt' argument will be set as a system message in model params. 76 | 77 | Both of the above cases would generate a summary for each row / video transcript of the table in the response. 
78 | """ 79 | 80 | @property 81 | def name(self) -> str: 82 | return "ChatGPT" 83 | 84 | @setup(cacheable=False, function_type="chat-completion", batchable=True) 85 | def setup( 86 | self, 87 | model="gpt-3.5-turbo", 88 | temperature: float = 0, 89 | ) -> None: 90 | assert ( 91 | model in _VALID_CHAT_COMPLETION_MODEL 92 | ), f"Unsupported ChatGPT {model}" 93 | self.model = model 94 | self.temperature = temperature 95 | 96 | @forward( 97 | input_signatures=[ 98 | PandasDataframe( 99 | columns=["query", "content", "prompt"], 100 | column_types=[ 101 | NdArrayType.STR, 102 | NdArrayType.STR, 103 | NdArrayType.STR, 104 | ], 105 | column_shapes=[(1,), (1,), (None,)], 106 | ) 107 | ], 108 | output_signatures=[ 109 | PandasDataframe( 110 | columns=["response"], 111 | column_types=[ 112 | NdArrayType.STR, 113 | ], 114 | column_shapes=[(1,)], 115 | ) 116 | ], 117 | ) 118 | def forward(self, text_df): 119 | try_to_import_openai() 120 | import openai 121 | 122 | @retry(tries=6, delay=20) 123 | def completion_with_backoff(**kwargs): 124 | return openai.ChatCompletion.create(**kwargs) 125 | 126 | # Register API key 127 | openai.api_key = os.environ.get('OPENAI_KEY') 128 | assert len(openai.api_key) != 0, ( 129 | "Please set your OpenAI API key in evadb.yml file (third_party," 130 | " open_api_key) or environment variable (OPENAI_KEY)" 131 | ) 132 | 133 | queries = text_df[text_df.columns[0]] 134 | content = text_df[text_df.columns[0]] 135 | if len(text_df.columns) > 1: 136 | queries = text_df.iloc[:, 0] 137 | content = text_df.iloc[:, 1] 138 | 139 | prompt = None 140 | if len(text_df.columns) > 2: 141 | prompt = text_df.iloc[0, 2] 142 | 143 | # openai api currently supports answers to a single prompt only 144 | completion_tokens = 0 145 | prompt_tokens = 0 146 | 147 | results = [] 148 | for i, (query, content) in tqdm(enumerate(zip(queries, content))): 149 | if i != 0 and i % 100 == 0: 150 | print(f"Completed {i} rows") 151 | # Avoid hitting API limit 152 | time.sleep(30) 153 | params = { 154 | "model": self.model, 155 | "temperature": self.temperature, 156 | "messages": [], 157 | } 158 | 159 | def_sys_prompt_message = { 160 | "role": "system", 161 | "content": prompt 162 | if prompt is not None 163 | else ( 164 | "You are a helpful assistant that accomplishes user tasks." 
165 | ), 166 | } 167 | 168 | params["messages"].append(def_sys_prompt_message) 169 | params["messages"].extend( 170 | [ 171 | { 172 | "role": "user", 173 | "content": f"Here is some context : {content}", 174 | }, 175 | { 176 | "role": "user", 177 | "content": f"Complete the following task: {query}", 178 | }, 179 | ], 180 | ) 181 | 182 | response = completion_with_backoff(**params) 183 | answer = response.choices[0].message.content 184 | results.append(answer) 185 | completion_tokens += response['usage']['completion_tokens'] 186 | prompt_tokens += response['usage']['prompt_tokens'] 187 | 188 | completion_tokens += response['usage']['completion_tokens'] 189 | prompt_tokens += response['usage']['prompt_tokens'] 190 | 191 | df = pd.DataFrame({"response": results}) 192 | 193 | print(f"Total tokens used: {completion_tokens + prompt_tokens}") 194 | print(f"Completion tokens used: {completion_tokens}") 195 | print(f"Prompt tokens used: {prompt_tokens}") 196 | pricing = { 197 | 'gpt-3.5-turbo': {'prompt': 0.0015, 'completion': 0.002}, 198 | 'gpt-3.5-turbo-16k': {'prompt': 0.003, 'completion': 0.004}, 199 | 'gpt-4-0613': {'prompt': 0.03, 'completion': 0.06}, 200 | } 201 | print(f"Prompt tokens price: ${pricing[self.model]['prompt'] * prompt_tokens/1000}") 202 | print(f"Completion tokens price: ${pricing[self.model]['completion'] * completion_tokens/1000}") 203 | price = (pricing[self.model]['prompt'] * prompt_tokens + pricing[self.model]['completion'] * completion_tokens)/1000 204 | print(f"Total Price: ${price}") 205 | 206 | return df 207 | -------------------------------------------------------------------------------- /functions/chatgpt_batch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018-2023 EvaDB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import os 18 | import time 19 | 20 | import pandas as pd 21 | from retry import retry 22 | 23 | from evadb.catalog.catalog_type import NdArrayType 24 | from evadb.functions.abstract.abstract_function import AbstractFunction 25 | from evadb.functions.decorators.decorators import forward, setup 26 | from evadb.functions.decorators.io_descriptors.data_types import ( 27 | PandasDataframe, 28 | ) 29 | from evadb.utils.generic_utils import try_to_import_openai 30 | import tiktoken 31 | from tqdm import tqdm 32 | 33 | _VALID_CHAT_COMPLETION_MODEL = [ 34 | "gpt-3.5-turbo", 35 | "gpt-3.5-turbo-16k", 36 | "gpt-4-0613", 37 | ] 38 | 39 | 40 | class ChatGPTMultirow(AbstractFunction): 41 | """ 42 | Arguments: 43 | model (str) : ID of the OpenAI model to use. Refer to '_VALID_CHAT_COMPLETION_MODEL' for a list of supported models. 44 | temperature (float) : Sampling temperature to use in the model. Higher value results in a more random output. 45 | 46 | Input Signatures: 47 | query (str) : The task / question that the user wants the model to accomplish / respond. 
48 | content (str) : Any relevant context that the model can use to complete its tasks and generate the response. 49 | prompt (str) : An optional prompt that can be passed to the model. It can contain instructions to the model, 50 | or a set of examples to help the model generate a better response. 51 | If not provided, the system prompt defaults to that of an helpful assistant that accomplishes user tasks. 52 | 53 | Output Signatures: 54 | response (str) : Contains the response generated by the model based on user input. Any errors encountered 55 | will also be passed in the response. 56 | 57 | Example Usage: 58 | Assume we have the transcripts for a few videos stored in a table 'video_transcripts' in a column named 'text'. 59 | If the user wants to retrieve the summary of each video, the ChatGPT UDF can be used as: 60 | 61 | query = "Generate the summary of the video" 62 | cursor.table("video_transcripts").select(f"ChatGPT({question}, text)") 63 | 64 | In the above UDF invocation, the 'query' passed would be the user task to generate video summaries, and the 65 | 'content' passed would be the video transcripts that need to be used in order to generate the summary. Since 66 | no prompt is passed, the default system prompt will be used. 67 | 68 | Now assume the user wants to create the video summary in 50 words and in French. Instead of passing these instructions 69 | along with each query, a prompt can be set as such: 70 | 71 | prompt = "Generate your responses in 50 words or less. Also, generate the response in French." 72 | cursor.table("video_transcripts").select(f"ChatGPT({question}, text, {prompt})") 73 | 74 | In the above invocation, an additional argument is passed as prompt. While the query and content arguments remain 75 | the same, the 'prompt' argument will be set as a system message in model params. 76 | 77 | Both of the above cases would generate a summary for each row / video transcript of the table in the response. 
78 | """ 79 | 80 | @property 81 | def name(self) -> str: 82 | return "ChatGPT" 83 | 84 | @setup(cacheable=False, function_type="chat-completion", batchable=True) 85 | def setup( 86 | self, 87 | model="gpt-3.5-turbo", 88 | temperature: float = 0, 89 | ) -> None: 90 | assert ( 91 | model in _VALID_CHAT_COMPLETION_MODEL 92 | ), f"Unsupported ChatGPT {model}" 93 | self.model = model 94 | self.temperature = temperature 95 | 96 | @forward( 97 | input_signatures=[ 98 | PandasDataframe( 99 | columns=["query", "content", "prompt"], 100 | column_types=[ 101 | NdArrayType.STR, 102 | NdArrayType.STR, 103 | NdArrayType.STR, 104 | ], 105 | column_shapes=[(1,), (1,), (None,)], 106 | ) 107 | ], 108 | output_signatures=[ 109 | PandasDataframe( 110 | columns=["response"], 111 | column_types=[ 112 | NdArrayType.STR, 113 | ], 114 | column_shapes=[(1,)], 115 | ) 116 | ], 117 | ) 118 | def forward(self, text_df): 119 | try_to_import_openai() 120 | import openai 121 | 122 | @retry(tries=6, delay=20) 123 | def completion_with_backoff(**kwargs): 124 | return openai.ChatCompletion.create(**kwargs) 125 | 126 | # Register API key 127 | openai.api_key = os.environ.get('OPENAI_KEY') 128 | assert len(openai.api_key) != 0, ( 129 | "Please set your OpenAI API key in evadb.yml file (third_party," 130 | " open_api_key) or environment variable (OPENAI_KEY)" 131 | ) 132 | 133 | queries = text_df[text_df.columns[0]] 134 | content = text_df[text_df.columns[0]] 135 | if len(text_df.columns) > 1: 136 | queries = text_df.iloc[:, 0] 137 | content = text_df.iloc[:, 1] 138 | 139 | prompt = None 140 | if len(text_df.columns) > 2: 141 | prompt = text_df.iloc[0, 2] 142 | 143 | # openai api currently supports answers to a single prompt only 144 | completion_tokens = 0 145 | prompt_tokens = 0 146 | 147 | # divide content into batches of 20 148 | batch_size = 10 149 | content = content.tolist() 150 | content_batched = [ 151 | content[i : i + batch_size] for i in range(0, len(content), batch_size) 152 | ] 153 | 154 | all_results = [] 155 | for i, batch in tqdm(enumerate(content_batched)): 156 | if i % 40 == 0: 157 | print(f"Completed {i} batches") 158 | # Avoid hitting API limit 159 | time.sleep(30) 160 | all_content = "" 161 | for row in batch: 162 | all_content += row 163 | all_content += "\n\n" 164 | 165 | all_content = all_content[:-4] 166 | encoding = tiktoken.encoding_for_model(self.model) 167 | num_tokens = len(encoding.encode(all_content)) 168 | num_tokens += len(encoding.encode(queries[0])) 169 | print(f"Estimated input prompt tokens: {num_tokens}") 170 | params = { 171 | "model": self.model, 172 | "temperature": self.temperature, 173 | "messages": [], 174 | } 175 | 176 | def_sys_prompt_message = { 177 | "role": "system", 178 | "content": prompt 179 | if prompt is not None 180 | else ("You are a helpful assistant that accomplishes user tasks."), 181 | } 182 | 183 | params["messages"].append(def_sys_prompt_message) 184 | params["messages"].extend( 185 | [ 186 | { 187 | "role": "user", 188 | "content": f"Here is some context : {all_content}", 189 | }, 190 | { 191 | "role": "user", 192 | "content": f"Complete the following task: {queries[0]}", 193 | }, 194 | ], 195 | ) 196 | 197 | response = completion_with_backoff(**params) 198 | answer = response.choices[0].message.content 199 | results = answer.split("\n\n") 200 | if len(results) != len(batch): 201 | raise Exception( 202 | f"WARNING: batch size is {len(batch)} but results are {len(results)}" 203 | ) 204 | 205 | all_results.extend(results) 206 | 207 | completion_tokens += 
response["usage"]["completion_tokens"] 208 | prompt_tokens += response["usage"]["prompt_tokens"] 209 | 210 | if len(all_results) != len(queries): 211 | raise Exception( 212 | "Length of results and queries do not match, please improve your prompt" 213 | ) 214 | 215 | df = pd.DataFrame({"response": all_results}) 216 | 217 | print(f"Total tokens used: {completion_tokens + prompt_tokens}") 218 | print(f"Completion tokens used: {completion_tokens}") 219 | print(f"Prompt tokens used: {prompt_tokens}") 220 | pricing = { 221 | "gpt-3.5-turbo": {"prompt": 0.0015, "completion": 0.002}, 222 | "gpt-3.5-turbo-16k": {"prompt": 0.003, "completion": 0.004}, 223 | "gpt-4-0613": {"prompt": 0.03, "completion": 0.06}, 224 | } 225 | print( 226 | f"Prompt tokens price: ${pricing[self.model]['prompt'] * prompt_tokens/1000}" 227 | ) 228 | print( 229 | f"Completion tokens price: ${pricing[self.model]['completion'] * completion_tokens/1000}" 230 | ) 231 | price = ( 232 | pricing[self.model]["prompt"] * prompt_tokens 233 | + pricing[self.model]["completion"] * completion_tokens 234 | ) / 1000 235 | print(f"Total Price: ${price}") 236 | 237 | return df 238 | -------------------------------------------------------------------------------- /functions/github_user_details.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from github import Github 3 | import time 4 | import concurrent.futures 5 | 6 | from evadb.catalog.catalog_type import ColumnType 7 | from evadb.functions.abstract.abstract_function import AbstractFunction 8 | from evadb.functions.decorators.decorators import forward, setup 9 | from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe 10 | 11 | 12 | class GithubUserDetails(AbstractFunction): 13 | """ 14 | Arguments: 15 | None 16 | 17 | Input Signatures: 18 | github_username (str) : The GitHub username for the user whose details you want to retrieve. 19 | github_token (str) : GitHub personal access token for authentication. 20 | 21 | Output Signatures: 22 | user_name (str) : The name of the GitHub user. 23 | user_login (str) : The login (username) of the GitHub user. 24 | user_following (int) : The number of users the GitHub user is following. 25 | user_followers (int) : The number of followers the GitHub user has. 26 | user_email (str) : The email address of the GitHub user. 27 | user_id (int) : The unique ID of the GitHub user. 28 | user_location (str) : The location of the GitHub user. 29 | user_bio (str) : The bio of the GitHub user. 30 | user_company (str) : The company associated with the GitHub user. 31 | user_blog (str) : The blog URL of the GitHub user. 32 | user_url (str) : The URL of the GitHub user's profile. 33 | user_twitter_username (str) : The Twitter username of the GitHub user. 34 | user_repos (list) : A list of dictionaries representing the user's repositories with 10+ stars. 35 | user_starred (list) : A list of dictionaries representing repositories starred by the user with 10+ stars. 
36 | 37 | Example Usage: 38 | You can use this function to retrieve details about a GitHub user as follows: 39 | 40 | github_username = "username" 41 | github_token = "your_personal_access_token" 42 | cursor.function("GithubUserDetails", github_username, github_token) 43 | """ 44 | 45 | @property 46 | def name(self) -> str: 47 | return "GithubUserDetails" 48 | 49 | @setup(cacheable=False, function_type="web-scraping") 50 | def setup(self) -> None: 51 | # Any setup or initialization can be done here if needed 52 | pass 53 | 54 | @forward( 55 | input_signatures=[ 56 | PandasDataframe( 57 | columns=["github_username", "github_token"], 58 | column_types=[ColumnType.TEXT, ColumnType.TEXT], 59 | column_shapes=[(1,), (1,)], 60 | ) 61 | ], 62 | output_signatures=[ 63 | PandasDataframe( 64 | columns=[ 65 | "user_name", "user_login", 66 | "user_following", "user_followers", 67 | "user_email", 68 | "user_id", 69 | "user_location", "user_bio", 70 | "user_company", "user_blog", "user_url", "user_twitter_username", 71 | "user_repos", "user_starred", 72 | ], 73 | column_types=[ 74 | ColumnType.TEXT, ColumnType.TEXT, 75 | ColumnType.INTEGER, ColumnType.INTEGER, 76 | ColumnType.TEXT, 77 | ColumnType.INTEGER, 78 | ColumnType.TEXT, ColumnType.TEXT, 79 | ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, 80 | ColumnType.TEXT, 81 | ColumnType.TEXT, 82 | ColumnType.TEXT 83 | ], 84 | column_shapes=[ 85 | (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,), (1,) 86 | ], 87 | ) 88 | ], 89 | ) 90 | def forward(self, input_df): 91 | # Ensure the GitHub username is provided 92 | if input_df.empty or input_df.iloc[0, 0] is None: 93 | raise ValueError("GitHub username must be provided.") 94 | 95 | # Extract inputs from the DataFrame 96 | github_username = input_df.iloc[0, 0] 97 | github_token = input_df.iloc[0, 1] 98 | 99 | # Initialize GitHub API client 100 | if github_token: 101 | github = Github(github_token) 102 | else: 103 | github = Github() 104 | 105 | # Create an empty list to store user details 106 | user_details_list = [] 107 | 108 | # Define a function to fetch user details for a range of rows 109 | def fetch_user_details_range(start_index, end_index): 110 | 111 | # Process a range of rows from start_index to end_index 112 | api_limit_error = False 113 | i = 0 114 | for index in range(start_index, end_index): 115 | i = i + 1 116 | 117 | # Avoid hitting API limit 118 | if i != 0 and i % 10 == 0: 119 | print(f"Downloading details of user: {i}") 120 | time.sleep(30) 121 | 122 | if api_limit_error == True: 123 | api_limit_error = False 124 | index = index - 1 125 | 126 | github_username = input_df.iloc[index]["github_username"] 127 | 128 | try: 129 | # Retrieve the user object 130 | user = github.get_user(github_username) 131 | 132 | # Gather user details into separate variables 133 | user_name = user.name 134 | user_login = user.login 135 | user_following = user.following 136 | user_followers = user.followers 137 | user_email = user.email 138 | user_id = user.id 139 | user_location = user.location 140 | user_bio = user.bio 141 | user_company = user.company 142 | user_blog = user.blog 143 | user_url = user.url 144 | user_twitter_username = user.twitter_username 145 | 146 | # Repos of user with 10+ stars 147 | user_repos = user.get_repos() 148 | user_created_repos = [] 149 | for repo in user_repos: 150 | if repo.fork is False: 151 | if repo.stargazers_count > 10: 152 | user_created_repos.append({ 153 | repo.name, 154 | repo.description, 155 | repo.html_url, 156 | 
repo.language 157 | }) 158 | 159 | # Repos starred by user with 10+ stars 160 | starred_repos = user.get_starred() 161 | user_starred_repos = [] 162 | j = 0 163 | for repo in starred_repos: 164 | j = j + 1 165 | if j > 10: 166 | break 167 | if repo.stargazers_count > 100: 168 | user_starred_repos.append({ 169 | repo.name, 170 | repo.description, 171 | repo.html_url, 172 | repo.language 173 | }) 174 | 175 | # Gather user details into a dictionary 176 | user_details = { 177 | "user_name": user.name, 178 | "user_login": user.login, 179 | "user_following": user.following, 180 | "user_followers": user.followers, 181 | "user_email": user.email, 182 | "user_id": user.id, 183 | "user_location": user.location, 184 | "user_bio": user.bio, 185 | "user_company": user.company, 186 | "user_blog": user.blog, 187 | "user_url": user.url, 188 | "user_twitter_username": user.twitter_username, 189 | "user_repos": f"{user_created_repos}", 190 | "user_starred_repos": f"{user_starred_repos}" 191 | } 192 | 193 | # Append user details to the list 194 | user_details_list.append(user_details) 195 | 196 | except Exception as e: 197 | print(f"Error: {str(e)}") 198 | api_limit_error = True 199 | # sleep for 5 minutes 200 | time.sleep(300) 201 | 202 | num_workers = 1 203 | num_rows = len(input_df) 204 | rows_per_worker = num_rows // num_workers 205 | 206 | print(f"Downloading details of {num_rows} users") 207 | 208 | # Create a list of tuples defining the ranges for each worker 209 | # Include any remaining rows in the last worker's range 210 | worker_ranges = [ 211 | (i * rows_per_worker, (i + 1) * rows_per_worker) 212 | for i in range(num_workers) 213 | ] 214 | worker_ranges[-1] = (worker_ranges[-1][0], num_rows) 215 | 216 | # Iterate over rows in the input DataFrame using ThreadPoolExecutor 217 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: 218 | for start_index, end_index in worker_ranges: 219 | executor.submit( 220 | fetch_user_details_range, start_index, end_index 221 | ) 222 | 223 | # Create a DataFrame from the list of user details 224 | user_details_df = pd.DataFrame(user_details_list) 225 | 226 | return user_details_df -------------------------------------------------------------------------------- /stargazers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from dotenv import load_dotenv 3 | import os 4 | import pandas as pd 5 | import evadb 6 | 7 | pd.set_option("display.max_columns", None) # Show all columns 8 | pd.set_option("display.expand_frame_repr", False) 9 | pd.set_option("display.max_colwidth", None) 10 | 11 | 12 | if not load_dotenv(): 13 | print( 14 | "Could not load .env file or it is empty. Please check if it exists and is readable." 
15 | ) 16 | exit(1) 17 | 18 | # REPO DETAILS 19 | repo_url = os.environ.get('REPO_URL') 20 | github_pat = os.environ.get('GITHUB_API') 21 | 22 | # Parse the repository URL to extract owner and repo name 23 | parts = repo_url.strip("/").split("/") 24 | repo_name = parts[-1] 25 | 26 | DEFAULT_CSV_PATH = f"{repo_name}.csv" 27 | 28 | 29 | if __name__ == "__main__": 30 | try: 31 | # establish evadb api cursor 32 | print("⏳ Connect to EvaDB...") 33 | cursor = evadb.connect().cursor() 34 | print("✅ Connected to EvaDB...") 35 | 36 | cursor.query( 37 | f""" 38 | CREATE OR REPLACE FUNCTION GithubStargazers 39 | INPUT (repo_url TEXT(1000), github_pat TEXT(1000)) 40 | OUTPUT (github_username TEXT(1000)) 41 | TYPE Webscraping 42 | IMPL 'functions/github_stargazers.py'; 43 | """ 44 | ).df() 45 | 46 | cursor.query( 47 | f""" 48 | CREATE OR REPLACE FUNCTION WebPageTextExtractor 49 | INPUT (urls TEXT(1000)) 50 | OUTPUT (extracted_text TEXT(1000)) 51 | TYPE Webscraping 52 | IMPL 'functions/webpage_text_extractor.py'; 53 | """ 54 | ).df() 55 | 56 | cursor.query( 57 | f""" 58 | CREATE OR REPLACE FUNCTION GithubUserdetails 59 | INPUT (github_username TEXT(1000), github_pat TEXT(1000)) 60 | OUTPUT ( 61 | user_name TEXT(1000), 62 | user_login TEXT(1000), 63 | user_following INTEGER, 64 | user_followers INTEGER, 65 | user_email TEXT(1000), 66 | user_id INTEGER, 67 | user_location TEXT(1000), 68 | user_bio TEXT(1000), 69 | user_company TEXT(1000), 70 | user_blog TEXT(1000), 71 | user_url TEXT(1000), 72 | user_twitter_username TEXT(1000), 73 | user_repos TEXT(1000), 74 | user_starred_repos TEXT(1000) 75 | ) 76 | TYPE Webscraping 77 | IMPL 'functions/github_user_details.py'; 78 | """ 79 | ).df() 80 | 81 | cursor.query( 82 | """ 83 | CREATE OR REPLACE FUNCTION StringToDataframe 84 | INPUT (input_string TEXT(1000)) 85 | OUTPUT ( 86 | name TEXT(1000), 87 | country TEXT(1000), 88 | city TEXT(1000), 89 | email TEXT(1000), 90 | occupation TEXT(1000), 91 | programming_languages TEXT(1000), 92 | topics_of_interest TEXT(1000), 93 | social_media TEXT(1000) 94 | ) 95 | TYPE Webscraping 96 | IMPL 'functions/string_to_dataframe.py'; 97 | """ 98 | ).df() 99 | 100 | cursor.query( 101 | """CREATE OR REPLACE FUNCTION GPT35 102 | IMPL 'functions/chatgpt.py' 103 | MODEL 'gpt-3.5-turbo-16k' 104 | """ 105 | ).df() 106 | 107 | cursor.query( 108 | """CREATE OR REPLACE FUNCTION GPT4 109 | IMPL 'functions/chatgpt_batch.py' 110 | MODEL 'gpt-4-0613' 111 | """ 112 | ).df() 113 | 114 | print( 115 | cursor.query( 116 | f""" 117 | CREATE TABLE IF NOT EXISTS {repo_name}_StargazerList AS 118 | SELECT GithubStargazers("{repo_url}", "{github_pat}"); 119 | """ 120 | ).df() 121 | ) 122 | 123 | select_query = cursor.query( 124 | f"SELECT * FROM {repo_name}_StargazerList;" 125 | ).df() 126 | 127 | print(select_query) 128 | 129 | print( 130 | cursor.query( 131 | f""" 132 | CREATE TABLE IF NOT EXISTS {repo_name}_StargazerDetails AS 133 | SELECT GithubUserdetails(github_username, "{github_pat}") 134 | FROM {repo_name}_StargazerList; 135 | """ 136 | ).df() 137 | ) 138 | 139 | select_query = cursor.query( 140 | f""" 141 | SELECT * FROM {repo_name}_StargazerDetails; 142 | """ 143 | ).df() 144 | 145 | print(select_query) 146 | 147 | print( 148 | cursor.query( 149 | f""" 150 | CREATE TABLE IF NOT EXISTS {repo_name}_StargazerScrapedDetails AS 151 | SELECT github_username, WebPageTextExtractor(github_username) 152 | FROM {repo_name}_StargazerList; 153 | """ 154 | ).df() 155 | ) 156 | 157 | select_query = cursor.query( 158 | f""" 159 | SELECT * 160 | FROM 
{repo_name}_StargazerScrapedDetails; 161 | """ 162 | ).df() 163 | 164 | print("Processing insights...") 165 | # cursor.query(f"DROP TABLE IF EXISTS {repo_name}_StargazerInsights;").df() 166 | 167 | LLM_prompt = """You are given a block of disorganized text extracted from the GitHub user profile of a user using an automated web scraper. The goal is to get structured results from this data. 168 | Extract the following fields from the text: name, country, city, email, occupation, programming_languages, topics_of_interest, social_media. 169 | If some field is not found, just output fieldname: N/A. Always return all the 8 field names. DO NOT add any additional text to your output. 170 | The topic_of_interest field must list a broad range of technical topics that are mentioned in any portion of the text. This field is the most important, so add as much information as you can. Do not add non-technical interests. 171 | The programming_languages field can contain one or more programming languages out of only the following 4 programming languages - Python, C++, JavaScript, Java. Do not include any other language outside these 4 languages in the output. If the user is not interested in any of these 4 programming languages, output N/A. 172 | If the country is not available, use the city field to fill the country. For example, if the city is New York, fill the country as United States. 173 | If there are social media links, including personal websites, add them to the social media section. Do NOT add social media links that are not present. 174 | Here is an example (use it only for the output format, not for the content): 175 | 176 | name: logicx 177 | country: United States 178 | city: Atlanta 179 | email: abc@gatech.edu 180 | occupation: PhD student at Georgia Tech 181 | programming_languages: Python, Java 182 | topics_of_interest: Google Colab, fake data generation, Postgres 183 | social_media: https://www.logicx.io, https://www.twitter.com/logicx, https://www.linkedin.com/in/logicx 184 | """ 185 | # GPT-35 fuzzy topics 186 | cursor.query( 187 | f""" 188 | CREATE TABLE IF NOT EXISTS {repo_name}_StargazerInsights AS 189 | SELECT StringToDataframe( 190 | GPT35("{LLM_prompt}", extracted_text 191 | ) 192 | ) 193 | FROM {repo_name}_StargazerScrapedDetails; 194 | """ 195 | ).df() 196 | 197 | select_query = cursor.query( 198 | f""" 199 | SELECT * 200 | FROM {repo_name}_StargazerInsights; 201 | """ 202 | ).df() 203 | 204 | print(select_query) 205 | 206 | select_query.to_csv(f"results/{repo_name}_insights_gpt35.csv", index=False) 207 | # cursor.query(f"DROP TABLE IF EXISTS {repo_name}_StargazerInsightsGPT4;").df() 208 | LLM_prompt = """You are given 10 rows of input, each row is separated by two new line characters. 209 | Categorize the topics listed in each row into one or more of the following 3 technical areas - Machine Learning, Databases, and Web development. If the topics listed are not related to any of these 3 areas, output a single N/A. Do not miss any input row. Do not add any additional text or numbers to your output. 210 | The output rows must be separated by two new line characters. Each input row must generate exactly one output row. For example, the input row [Recommendation systems, Deep neural networks, Postgres] must generate only the output row [Machine Learning, Databases]. 211 | The input row [enterpreneurship, startups, venture capital] must generate the output row N/A. 
212 | """ 213 | 214 | cursor.query( 215 | f"""CREATE TABLE IF NOT EXISTS 216 | {repo_name}_StargazerInsightsGPT4 AS 217 | SELECT name, 218 | country, 219 | city, 220 | email, 221 | occupation, 222 | programming_languages, 223 | social_media, 224 | GPT4("{LLM_prompt}", topics_of_interest) 225 | FROM {repo_name}_StargazerInsights; 226 | """ 227 | ).df() 228 | 229 | select_query = cursor.query( 230 | f""" 231 | SELECT * 232 | FROM {repo_name}_StargazerInsightsGPT4; 233 | """ 234 | ).df() 235 | 236 | select_query.to_csv(f"results/{repo_name}_insights_gpt4.csv", index=False) 237 | 238 | except Exception as e: 239 | print(f"❗️ EvaDB Session ended with an error: {e}") 240 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌟 Stargazers Reloaded 2 | 3 | ## LLM-Powered Analysis of Your GitHub Community 🐙 4 | 5 | GitHub 🌟 symbolize a repository's popularity in the developer community. Whether you're a developer, open-source enthusiast, or just curious about tech trends, these stars provide insights into the coding community. 6 | 7 | What if we could delve into the minds of these star-givers, extracting insights from their profiles to understand their interests, locations, and more? In this app, we show how [EvaDB](https://github.com/georgia-tech-db/evadb) makes it super easy to get insights about your GitHub community using large language models (LLMs). 8 | 9 | This app is inspired by the "original" [Stargazers app](https://github.com/spencerkimball/stargazers) written by Spencer Kimball from Cockroach Labs. While the original app focused on analyzing the community exclusively using the GitHub API, our LLM-powered Stargazers app powered by EvaDB also extracts insights from unstructured data obtained from the stargazers' webpages. 10 | 11 | ## LLM Cost Optimizations 12 | 13 | To generate the most accurate results, this app can directly use GPT-4 to generate the entire structured data. However, GPT-4 calls are **40 times** more expensive per row than GPT-3.5 calls. It takes **$60** to process just **1000 users**. So, this app uses a **model cascade optimization** to generate high-quality insights at a fraction of the cost. Additionally, with EvaDB, it is easy to **batch** input user rows to GPT-4 to further reduce the cost of the query. Using these optimizations, we found that the app has **11x lower cost** than a standalone GPT-4 model. The batching optimization is implemented in [chatgpt_batch.py](functions/chatgpt_batch.py). 14 | 15 | ## Getting Started 16 | 17 | 18 | First install the dependencies: 19 | 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | Then, add the following environment variables to a `.env` file in the root directory of the project (see `example.env`): To generate a Github Personal Access Token (PAT), here are the instructions. To find your OpenAI API key, here are the instructions. 25 | 26 | ``` 27 | REPO_URL= 28 | GITHUB_API= 29 | OPENAI_API= 30 | ``` 31 | 32 | After filling in these variables in the `example.env` file, rename it file to `.env`. 33 | 34 | ```bash 35 | mv example.env .env 36 | ``` 37 | 38 | Finally, run the app: 39 | 40 | ```bash 41 | 42 | python stargarzers.py 43 | ``` 44 | 45 | ## How it Works 46 | 47 | Workflow of LLM-Powered Analyses of GitHub Community 48 | 49 | The app uses EvaDB to generate insights about your stargazers in four steps: 50 | 51 | 1. 
15 | ## Getting Started
16 | 
17 | 
18 | First install the dependencies:
19 | 
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 | 
24 | Then, add the following environment variables to a `.env` file in the root directory of the project (see `example.env`). To generate a GitHub Personal Access Token (PAT), follow the instructions in the GitHub documentation; your OpenAI API key can be found in your OpenAI account settings.
25 | 
26 | ```
27 | REPO_URL=
28 | GITHUB_API=
29 | OPENAI_KEY=
30 | ```
31 | 
32 | After filling in these variables in the `example.env` file, rename the file to `.env`:
33 | 
34 | ```bash
35 | mv example.env .env
36 | ```
37 | 
38 | Finally, run the app:
39 | 
40 | ```bash
41 | 
42 | python stargazers.py
43 | ```
44 | 
45 | ## How it Works
46 | 
47 | *Workflow of the LLM-powered analysis of your GitHub community*
48 | 
49 | The app uses EvaDB to generate insights about your stargazers in four steps:
50 | 
51 | 1. **Collecting stargazers**: The app uses the GitHub API to collect the profile information of the stargazers of the repo specified in the `REPO_URL` environment variable.
52 | 
53 | ```SQL
54 | --- List of Stargazers
55 | CREATE TABLE gpt4all_StargazerList AS
56 | SELECT GithubStargazers("https://github.com/nomic-ai/gpt4all", "GITHUB_KEY");
57 | 
58 | --- Details of Stargazers extracted using the Github API
59 | CREATE TABLE gpt4all_StargazerDetails AS
60 | SELECT GithubUserdetails(github_username, "GITHUB_KEY")
61 | FROM gpt4all_StargazerList;
62 | ```
63 | 
64 | The `GithubUserdetails` function is implemented in [`github_user_details.py`](functions/github_user_details.py).
65 | 
66 | 2. **Scraping stargazers' profiles**: The app then takes screenshots of stargazers' user profile pages and uses [`EasyOCR`](https://github.com/JaidedAI/EasyOCR) to extract unstructured text blobs from the screenshots, all in one query.
67 | 
68 | ```SQL
69 | --- Text in webpages of Stargazers extracted using WebPageTextExtractor
70 | CREATE TABLE gpt4all_StargazerScrapedDetails AS
71 | SELECT github_username, WebPageTextExtractor(github_username)
72 | FROM gpt4all_StargazerList;
73 | ```
74 | 
75 | Check [`webpage_text_extractor.py`](functions/webpage_text_extractor.py) for more details on how the `WebPageTextExtractor` function performs the scraping; a simplified sketch of the screenshot-and-OCR idea appears after this list.
76 | 
77 | 3. **Generating insights**: The app then uses GPT-3.5 to generate insights about the stargazers' interests and needs, using the text blobs extracted in the previous step. We use a custom prompt to guide the generation process and ensure that the generated insights are relevant to the repo. You can modify the prompt to suit your needs.
78 | 
79 | ```Plain Text
80 | --- Prompt to GPT-35
81 | You are given a block of disorganized text extracted from a user's GitHub profile page by an automated web scraper. The goal is to get structured results from this data.
82 | Extract the following fields from the text: name, country, city, email, occupation, programming_languages, topics_of_interest, social_media.
83 | If some field is not found, just output fieldname: N/A. Always return all 8 field names. DO NOT add any additional text to your output.
84 | The topics_of_interest field must list a broad range of technical topics that are mentioned in any portion of the text. This field is the most important, so add as much information as you can. Do not add non-technical interests.
85 | The programming_languages field can contain one or more programming languages out of only the following 4 programming languages - Python, C++, JavaScript, Java. Do not include any other language outside these 4 languages in the output. If the user is not interested in any of these 4 programming languages, output N/A.
86 | If the country is not available, use the city field to fill the country. For example, if the city is New York, fill the country as United States.
87 | If there are social media links, including personal websites, add them to the social media section. Do NOT add social media links that are not present.
88 | Here is an example (use it only for the output format, not for the content):
89 | 
90 | name: logicx
91 | country: United States
92 | city: Atlanta
93 | email: abc@gatech.edu
94 | occupation: PhD student at Georgia Tech
95 | programming_languages: Python, Java
96 | topics_of_interest: Google Colab, fake data generation, Postgres
97 | social_media: https://www.logicx.io, https://www.twitter.com/logicx, https://www.linkedin.com/in/logicx
98 | ```
99 | 
100 | ```SQL
101 | --- Using LLMs to extract insights from text
102 | CREATE TABLE gpt4all_StargazerInsights AS
103 | SELECT StringToDataframe(GPT35("{LLM_prompt}", extracted_text))
104 | FROM gpt4all_StargazerScrapedDetails;
105 | ```
106 | 
107 | If you want to generate different insights with other column names, you can modify the prompt and the `StringToDataframe` function in [`string_to_dataframe.py`](functions/string_to_dataframe.py); a minimal sketch of this parsing step also appears after this list.
108 | 
109 | 4. **Improving insights**: GPT-3.5 does not work well for all the columns. For example, it cannot effectively categorize user interests into popular topics of interest. To improve the quality of the insights, we use a cascade of LLMs to generate insights for the `topics_of_interest` column.
110 | First, the GPT-3.5 query above generates a broad list of topics of interest. The semi-organized results are then processed by the more powerful GPT-4 model to generate a more focused list.
111 | 
112 | ```Plain Text
113 | --- Prompt to GPT-4
114 | You are given 10 rows of input, each row is separated by two newline characters.
115 | Categorize the topics listed in each row into one or more of the following 3 technical areas - Machine Learning, Databases, and Web development. If the topics listed are not related to any of these 3 areas, output a single N/A. Do not miss any input row. Do not add any additional text or numbers to your output.
116 | The output rows must be separated by two newline characters. Each input row must generate exactly one output row. For example, the input row [Recommendation systems, Deep neural networks, Postgres] must generate only the output row [Machine Learning, Databases].
117 | The input row [entrepreneurship, startups, venture capital] must generate the output row N/A.
118 | ```
119 | 
120 | ```SQL
121 | --- Deeper insights using an expensive LLM prompt
122 | CREATE TABLE IF NOT EXISTS
123 | sqlite_data.{repo_name}_StargazerInsightsGPT4 AS
124 | SELECT name,
125 | country,
126 | city,
127 | email,
128 | occupation,
129 | programming_languages,
130 | social_media,
131 | GPT4("{LLM_prompt}", topics_of_interest)
132 | FROM sqlite_data.{repo_name}_StargazerInsights;
133 | ```
134 | 
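As mentioned in steps 2 and 3, here are two simplified sketches of those ideas. They are illustrations rather than the actual implementations in [`webpage_text_extractor.py`](functions/webpage_text_extractor.py) and [`string_to_dataframe.py`](functions/string_to_dataframe.py): the function names, the headless-browser settings, and the screenshot path below are assumptions, and the real functions follow EvaDB's user-defined function interface, which the sketches omit.

```python
import easyocr
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def scrape_profile_text(github_username: str) -> str:
    """Hypothetical sketch: screenshot a GitHub profile page and OCR the text out of it."""
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1280, 2400)  # tall window so most of the profile fits in one shot
    driver.get(f"https://github.com/{github_username}")
    screenshot_path = f"{github_username}.png"
    driver.save_screenshot(screenshot_path)
    driver.quit()

    reader = easyocr.Reader(["en"])
    # detail=0 returns just the recognized strings, which we join into one text blob.
    return " ".join(reader.readtext(screenshot_path, detail=0))
```

And a minimal version of the parsing step that turns the `field: value` lines returned by GPT-3.5 into a one-row DataFrame:

```python
import pandas as pd

FIELDS = [
    "name", "country", "city", "email", "occupation",
    "programming_languages", "topics_of_interest", "social_media",
]


def llm_reply_to_row(reply: str) -> pd.DataFrame:
    """Parse 'field: value' lines from the LLM reply, defaulting missing fields to N/A."""
    row = {field: "N/A" for field in FIELDS}
    for line in reply.splitlines():
        if ":" not in line:
            continue
        key, value = line.split(":", 1)
        key = key.strip().lower()
        if key in row:
            row[key] = value.strip() or "N/A"
    return pd.DataFrame([row])
```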
135 | ## Results
136 | 
137 | The app writes a CSV file with insights about your stargazers to the [`results`](results/) folder; a sample output file is included there. To generate visualizations from the insights, run the following command:
138 | 
139 | ```bash
140 | python visualize_results.py
141 | ```
142 | 
143 | The visualizations are saved in the [`images`](images/) folder.
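If you want to see how such visualizations could be produced, here is a minimal sketch (not the actual `visualize_results.py`) that counts the GPT-4 topic labels from the generated CSV and renders a pie chart and a word cloud with `matplotlib` and `wordcloud`. The CSV path and the way the topics column is located are assumptions based on the sample output; adjust them to match your run.

```python
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Hypothetical input: the GPT-4 insights CSV produced by stargazers.py.
df = pd.read_csv("results/langchain_insights_gpt4.csv")

# The exported columns carry a table-name prefix, so locate the topics column by suffix.
topics_col = next(col for col in df.columns if col.endswith("response"))
topics = df[topics_col].dropna()

# Count how often each technical area (Machine Learning, Databases, ...) appears.
counts = topics.str.split(",").explode().str.strip().value_counts()

counts.plot.pie(autopct="%1.0f%%", ylabel="")
plt.title("Stargazer topics of interest")
plt.savefig("images/topics_pie_chart.png", bbox_inches="tight")
plt.close()

WordCloud(width=800, height=400, background_color="white") \
    .generate(" ".join(topics)) \
    .to_file("images/topics_wordcloud.png")
```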
144 | 
145 | Here are some interesting trends that we found in three fast-growing communities.
146 | 
147 | ## [GPT4All](https://github.com/nomic-ai/gpt4all)
148 | 
149 | **Web developers ❤️ open-source LLMs 🤩**
150 | 
151 | *(Figures: gpt4all WordCloud, gpt4all Topics, gpt4all Countries, and gpt4all Repos; see the [`images`](images/) folder.)*
160 | 
161 | ## [Langchain](https://github.com/langchain-ai/langchain)
162 | 
163 | **Langchain is most popular 📈 among Machine Learning 🤖🧠 interest groups**
164 | 
165 | *(Figures: langchain WordCloud, langchain Topics, langchain Countries, and langchain Repos; see the [`images`](images/) folder.)*
174 | 
175 | ## [CockroachDB](https://github.com/cockroachdb/cockroach)
176 | 
177 | **CockroachDB is most followed by database and web developers 📊 in the United States 🇺🇸**
178 | 
179 | *(Figures: CockroachDB WordCloud, CockroachDB Topics, CockroachDB Countries, and CockroachDB Repos; see the [`images`](images/) folder.)*
188 | 189 | ## EvaDB Shoutout 190 | 191 | 👋 Hey! If you're excited about database systems for AI applications, show some ❤️ by: 192 | 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------