├── .github
└── workflows
│ └── pylint.yml
├── LICENSE
├── README.md
├── app.py
└── requirements.txt
/.github/workflows/pylint.yml:
--------------------------------------------------------------------------------
1 | name: Pylint
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | matrix:
10 | python-version: ["3.8", "3.9", "3.10"]
11 | steps:
12 | - uses: actions/checkout@v3
13 | - name: Set up Python ${{ matrix.python-version }}
14 | uses: actions/setup-python@v3
15 | with:
16 | python-version: ${{ matrix.python-version }}
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install pylint
21 | - name: Analysing the code with pylint
22 | run: |
23 | pylint $(git ls-files '*.py')
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Maya Akim
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # autoGNT
2 |
AutoGNT is a fully customizable, personalized, "smart" newsletter that takes a user's input and automatically creates a newsletter based on that input.
4 |
5 | Demo is here:
6 | https://majacinka-autognt-app-cruudz.streamlit.app/
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | #Import and install necessary dependencies
2 |
3 | #serpapi, requests → Scrape google results
4 | #sklearn → filter results based on how similar they are
5 | #Newspaper3K → extract text from articles
6 | #Langchain → split text/summarize it and prompt template in order to generate the title
7 | #MailGun → send email
8 |
9 | import streamlit as st
10 | import requests
11 | import json
12 | import numpy as np
13 |
14 | from newspaper import Article, ArticleException
15 | from langchain.text_splitter import TokenTextSplitter
16 | from langchain.chains.summarize import load_summarize_chain
17 | from langchain import PromptTemplate, LLMChain, OpenAI
18 | from sklearn.feature_extraction.text import TfidfVectorizer
19 | from sklearn.metrics.pairwise import cosine_similarity
20 |
# Checks whether a fetched news article is near-identical to one already
# collected, so duplicates can be filtered out of the newsletter.
def is_unique(new_article, articles, threshold=0.6):
    """Return True when `new_article` is sufficiently different from `articles`.

    Args:
        new_article: text of the candidate article.
        articles: list of article texts already accepted.
        threshold: maximum allowed cosine similarity in [0, 1]; anything
            above this is treated as a duplicate. Defaults to 0.6, which
            worked well in practice (0.8 let too many near-dupes through).

    Returns:
        True if the candidate is unique enough to keep, False otherwise.
    """
    if not articles:  # nothing to compare against yet
        return True

    # Vectorize the candidate plus existing articles in a single pass;
    # fit_transform avoids fitting and then re-transforming the same corpus.
    vectors = TfidfVectorizer().fit_transform([new_article] + articles)

    # Cosine similarity of the candidate (row 0) against every existing article
    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:])

    # Unique only if even the closest existing article stays at/below threshold
    return np.max(similarity_scores) <= threshold
39 |
# Scrapes Google search results via SerpAPI and extracts article text
def get_latest_results(query, api_key):
    """Fetch the previous day's Google results for `query` and parse articles.

    Args:
        query: search keywords supplied by the user.
        api_key: SerpAPI key used to authenticate the search request.

    Returns:
        A list of (chunked_text, url) tuples, where chunked_text is the
        article body split into ~3k-token chunks. Articles too similar to
        one already collected are skipped, as are sites that can't be scraped.
    """
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",  # interface language; "hl" is the SerpAPI param ("h1" was a typo)
        "gl": "us",
        "google_domain": "google.com",
        "tbs": "qdr:d",  # restrict to results from the previous day
        "api_key": api_key,
    }

    # Timeout keeps the app from hanging forever on a stalled request
    response = requests.get("https://serpapi.com/search", params, timeout=30)
    results = json.loads(response.text)

    # List of websites to exclude because you can't scrape them
    excluded_websites = ["ft.com", "cointelegraph.com", "cell.com", "futuretools.io"]

    # .get() guards against responses with no "organic_results" key at all
    urls = [
        r["link"]
        for r in results.get("organic_results", [])
        if not any(excluded_site in r["link"] for excluded_site in excluded_websites)
    ][:40]  # limit to first 40 results

    parsed_texts = []   # (chunked_text, url) tuples to return
    article_texts = []  # raw article texts kept for similarity comparison

    # Initialize the splitter once; chunks of ~3k tokens with 200-token overlap
    text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=200)

    # iterate over each URL
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()

            # Skip articles that duplicate one we've already collected
            if not is_unique(article.text, article_texts):
                continue

            # Split text into token-sized chunks the LLM can handle
            splitted_texts = text_splitter.split_text(article.text)
            if not splitted_texts:
                print(article.text)

            # Append tuple of splitted text and URL to the list
            parsed_texts.append((splitted_texts, url))
            article_texts.append(article.text)  # remember for future uniqueness checks

        except ArticleException:
            print(f"Failed to download and parse article: {url}")

    return parsed_texts
95 |
# Minimal document wrapper exposing the attributes chain.run() expects
class Document:
    """Lightweight stand-in for a langchain Document.

    Carries the chunk text as `page_content` plus the `metadata` dict the
    summarize chain reads; `title` is kept only for reference.
    """

    def __init__(self, title, text):
        self.title, self.page_content = title, text
        self.metadata = {"stop": []}
102 |
def summarize_text(to_summarize_texts, openai_api_key):
    """Summarize each article's chunks and generate a clickbait title for it.

    Args:
        to_summarize_texts: list of (chunks, url) tuples as produced by
            get_latest_results.
        openai_api_key: OpenAI key used by both the summarize and title chains.

    Returns:
        A list of (clickbait_title, summarized_text, url) tuples.
    """
    summarized_texts_titles_urls = []

    llm = OpenAI(openai_api_key=openai_api_key, temperature=0.8)
    chain_summarize = load_summarize_chain(llm, chain_type="map_reduce")

    # Define prompt that generates titles for summarized text
    prompt = PromptTemplate(
        input_variables=["text"],
        template="Write an appropriate, clickbaity news article title in less than 70 characters for this text: {text}"
    )
    # Build the title chain once: it is identical on every iteration, so
    # re-creating it inside the loop was wasted work.
    chain_prompt = LLMChain(llm=llm, prompt=prompt)

    for to_summarize_text, url in to_summarize_texts:
        # Convert each chunk string to a Document object (required by chain.run)
        to_summarize_text = [Document('Dummy Title', text) for text in to_summarize_text]
        if not to_summarize_text:  # skip URLs whose article yielded no chunks
            print(f"No text to summarize for URL: {url}")
            continue

        # Map-reduce summary over all chunks of this article
        summarized_text = chain_summarize.run(to_summarize_text)

        # Generate a unique clickbait title from the summary
        clickbait_title = chain_prompt.run(summarized_text)

        summarized_texts_titles_urls.append((clickbait_title, summarized_text, url))

    return summarized_texts_titles_urls
133 |
def send_email_mailgun(subject, body, to, from_email, mailgun_domain, mailgun_api_key, timeout=30):
    """Send a plain-text email through the Mailgun HTTP API.

    Args:
        subject: email subject line.
        body: plain-text email body.
        to: recipient address (on free Mailgun plans this must be an
            authorized recipient).
        from_email: sender address associated with the Mailgun account.
        mailgun_domain: Mailgun sending domain.
        mailgun_api_key: Mailgun private API key.
        timeout: seconds to wait for the HTTP request before giving up —
            prevents the Streamlit app from hanging on a stalled connection.

    Returns:
        The `requests.Response` object so callers can inspect the status.
    """
    response = requests.post(
        f"https://api.mailgun.net/v3/{mailgun_domain}/messages",
        auth=("api", mailgun_api_key),
        data={"from": from_email,
              "to": to,
              "subject": subject,
              "text": body},
        timeout=timeout)

    # in case of an error, surface the status in the console
    print("Status code:", response.status_code)
    print("Response data:", response.text)

    return response
148 |
def main():
    """Streamlit UI: collect API keys and a topic, then scrape, summarize,
    display, and email a newsletter.

    Returns:
        The OpenAI API key input value (returned by the original design;
        nothing in the __main__ guard uses it).
    """
    # frontend
    st.title('AutoNewsletter')
    st.markdown("## Please input your API keys")

    # text input fields for API keys (masked in the UI)
    serpapi_key = st.text_input("Insert your SerpAPI key here: ", type="password")
    openai_api_key = st.text_input("Insert your OpenAI api key: ", type="password")

    # text input field for the newsletter topic keyword
    user_query = st.text_input("Make me a newsletter about: ")

    # You'll have to create a Mailgun account; on the free plan the receiving
    # mail must be registered as an authorized recipient.
    st.markdown("## Info necessary for the MailGun to work")

    recipient_mail = st.text_input("Email To: ")
    sending_mail = st.text_input("Email from: ") # email you used to create a MailGun account
    mailgun_domain = st.text_input("Enter your mailgun Domain here: ")
    # NOTE(review): consider type="password" here too, like the keys above
    mailgun_api = st.text_input("Enter your mailgun API key here: ")

    if st.button('Submit'):
        # Persist the inputs in session state so Streamlit reruns keep them
        st.session_state.serpapi_key = serpapi_key
        st.session_state.user_query = user_query

        # Scrape and chunk the articles for the query
        st.session_state.get_splitted_text = get_latest_results(user_query, serpapi_key)
        if not st.session_state.get_splitted_text:
            st.write("No results found.")
        # NOTE(review): no early return above — summarize_text([]) yields an
        # empty list, so the loops below are no-ops when nothing was found
        st.session_state.summarized_texts = summarize_text(st.session_state.get_splitted_text, openai_api_key)

        # Render each summarized article in the app
        for title, summarized_text, url in st.session_state.summarized_texts:
            st.title(title)
            # Add the emoji before the summarized text
            st.write(f"❇️ {summarized_text}")
            st.write(f"🔗 {url}")
            # Create an empty line for a gap
            st.markdown("\n\n")

        # Build the plain-text email body from the same summaries
        email_body = ""
        for title, summarized_text, url in st.session_state.summarized_texts:
            email_body += f"❇️{title}\n\n"
            email_body += f"💬{summarized_text}\n\n"
            email_body += f"🔗{url}\n\n"

        # Send the email
        send_email_mailgun(
            subject="🤖🤯 This week news about AI", #you can change "AI" to accept the user query variable instead of hardcoded word, but I prefer it like this
            #because my keywords sometimes get weird and long
            body=email_body,
            to=recipient_mail,
            from_email=sending_mail,
            mailgun_domain=mailgun_domain,
            mailgun_api_key=mailgun_api
        )

    return openai_api_key
205 |
# Run the app when executed as a script (e.g. `streamlit run app.py`)
if __name__ == "__main__":
    main()
208 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | requests
3 | numpy
4 | newspaper3k
5 | scikit-learn
6 | langchain
7 | mailgun
8 | tiktoken
9 | openai
10 |
--------------------------------------------------------------------------------