├── .github └── workflows │ └── pylint.yml ├── LICENSE ├── README.md ├── app.py └── requirements.txt /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.8", "3.9", "3.10"] 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install pylint 21 | - name: Analysing the code with pylint 22 | run: | 23 | pylint $(git ls-files '*.py') 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Maya Akim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autoGNT 2 | 3 | AutoGNT is a fully custamizable, personalized, "smart" newsletter that takes user's input and automatically creates a newsletter based on that input. 4 | 5 | Demo is here: 6 | https://majacinka-autognt-app-cruudz.streamlit.app/ 7 | 8 | Screenshot 2023-06-11 at 10 45 29 PM 9 | 10 | Screenshot 2023-06-10 at 7 49 16 PM 11 | 12 | Screenshot 2023-06-10 at 8 33 43 PM 13 | 14 | 15 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #Import and install necessary dependencies 2 | 3 | #serpapi, requests → Scrape google results 4 | #sklearn → filter results based on how similar they are 5 | #Newspaper3K → extract text from articles 6 | #Langchain → split text/summarize it and prompt template in order to generate the title 7 | #MailGun → send email 8 | 9 | import streamlit as st 10 | import requests 11 | import json 12 | import numpy as np 13 | 14 | from newspaper import Article, ArticleException 15 | from langchain.text_splitter import TokenTextSplitter 16 | from langchain.chains.summarize import load_summarize_chain 17 | from langchain import PromptTemplate, LLMChain, OpenAI 18 | from sklearn.feature_extraction.text import TfidfVectorizer 19 | from sklearn.metrics.pairwise import cosine_similarity 20 | 21 | # checks if the fetched newsarticles are identical, and filters out the ones that are too similar 22 | def is_unique(new_article, articles): 23 | if not articles: # if the list is empty 24 | return True 25 | 26 | # Create a new TfidfVectorizer and transform the article texts into vectors 27 | vectorizer = TfidfVectorizer().fit([new_article] + articles) 28 | vectors = vectorizer.transform([new_article] + articles) 29 | 30 | # Calculate the cosine similarity of the new article to each of the existing articles 31 | similarity_scores = cosine_similarity(vectors[0:1], vectors[1:]) 32 | 33 | # If the highest similarity score is above a threshold (for example, 0.8), return False (not unique), keep at around 0.6 34 | if np.max(similarity_scores) > 0.6: 35 | return False 36 | 37 | # Otherwise, return True (unique) 38 | return True 39 | 40 | # Scrapes google search results 41 | def get_latest_results(query, api_key): 42 | params = { 43 | "q": query, 44 | "location": "United States", 45 | "h1": "en", 46 | "gl": "us", 47 | "google_domain": "google.com", 48 | "tbs": "qdr:d", #Search for results from previous day 49 | "api_key": api_key, 50 | } 51 | 52 | response = requests.get("https://serpapi.com/search", params) 53 | results = json.loads(response.text) 54 | 55 | # List of websites to exclude because you can't scrape them 56 | excluded_websites = ["ft.com", "cointelegraph.com", "cell.com", "futuretools.io"] 57 | 58 | urls = [r["link"] for r in results["organic_results"] if not any(excluded_site in r["link"] for excluded_site in excluded_websites)][:40] #limit to first 40 results 59 | 60 | parsed_texts = [] #list to store parsed text and corresponding URL 61 | article_texts = [] # list to store original article texts for similarity comparison 62 | 63 | # Initialize the text_splitter before using it 64 | text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=200) 65 | 66 | #iterate over each URL 67 | for url in urls: 68 | try: 69 | #create an article object 70 | article = Article(url) 71 | 72 | #download the article 73 | article.download() 74 | 75 | #parse the article 76 | article.parse() 77 | 78 | # Check if the new article is unique 79 | if not is_unique(article.text, article_texts): 80 | continue # If not unique, skip to the next article 81 | 82 | #split text into chunks of 4k tokens 83 | splitted_texts = text_splitter.split_text(article.text) 84 | if not splitted_texts: 85 | print(article.text) 86 | 87 | #Append tuple of splitted text and URL to the list 88 | parsed_texts.append((splitted_texts, url)) 89 | article_texts.append(article.text) # Add the text of the new unique article to the list 90 | 91 | except ArticleException: 92 | print(f"Failed to download and parse article: {url}") 93 | 94 | return parsed_texts 95 | 96 | #required by chain.run() 97 | class Document: 98 | def __init__(self, title, text): 99 | self.title = title 100 | self.page_content = text 101 | self.metadata = {"stop": []} 102 | 103 | def summarize_text(to_summarize_texts, openai_api_key): 104 | 105 | summarized_texts_titles_urls = [] 106 | 107 | llm = OpenAI(openai_api_key=openai_api_key, temperature=0.8) 108 | chain_summarize = load_summarize_chain(llm, chain_type="map_reduce") 109 | 110 | # Define prompt that generates titles for summarized text 111 | prompt = PromptTemplate( 112 | input_variables=["text"], 113 | template="Write an appropriate, clickbaity news article title in less than 70 characters for this text: {text}" 114 | ) 115 | 116 | for to_summarize_text, url in to_summarize_texts: 117 | # Convert each text string to a Document object 118 | to_summarize_text = [Document('Dummy Title', text) for text in to_summarize_text] 119 | if not to_summarize_text: # Check if list is empty before running the chain 120 | print(f"No text to summarize for URL: {url}") 121 | continue 122 | 123 | # Summarize chunks here 124 | summarized_text = chain_summarize.run(to_summarize_text) 125 | 126 | # prompt template that generates unique titles 127 | chain_prompt = LLMChain(llm=llm, prompt=prompt) 128 | clickbait_title = chain_prompt.run(summarized_text) 129 | 130 | summarized_texts_titles_urls.append((clickbait_title, summarized_text, url)) 131 | 132 | return summarized_texts_titles_urls 133 | 134 | def send_email_mailgun(subject, body, to, from_email, mailgun_domain, mailgun_api_key): 135 | response = requests.post( 136 | f"https://api.mailgun.net/v3/{mailgun_domain}/messages", 137 | auth=("api", mailgun_api_key), 138 | data={"from": from_email, 139 | "to": to, 140 | "subject": subject, 141 | "text": body}) 142 | 143 | #in case of an error, what's the status 144 | print("Status code:", response.status_code) 145 | print("Response data:", response.text) 146 | 147 | return response 148 | 149 | def main(): 150 | #frontend 151 | st.title('AutoNewsletter') 152 | st.markdown("## Please input your API keys") 153 | 154 | #create text input field for API keys 155 | serpapi_key = st.text_input("Insert your SerpAPI key here: ", type="password") 156 | openai_api_key = st.text_input("Insert your OpenAI api key: ", type="password") 157 | 158 | #create text input field for keyword 159 | user_query = st.text_input("Make me a newsletter about: ") 160 | 161 | #you'll have to create a Mailgun account and if it's a free one you'll have to make a receiving mail as an authorized recipient 162 | st.markdown("## Info necessary for the MailGun to work") 163 | 164 | recipient_mail = st.text_input("Email To: ") 165 | sending_mail = st.text_input("Email from: ") #email you used to create a MailGun account 166 | mailgun_domain = st.text_input("Enter your mailgun Domain here: ") 167 | mailgun_api = st.text_input("Enter your mailgun API key here: ") 168 | 169 | if st.button('Submit'): 170 | st.session_state.serpapi_key = serpapi_key 171 | st.session_state.user_query = user_query 172 | 173 | # Split the result of get_latest_results into two separate variables 174 | st.session_state.get_splitted_text = get_latest_results(user_query, serpapi_key) 175 | if not st.session_state.get_splitted_text: 176 | st.write("No results found.") 177 | st.session_state.summarized_texts = summarize_text(st.session_state.get_splitted_text, openai_api_key) 178 | 179 | for title, summarized_text, url in st.session_state.summarized_texts: 180 | st.title(title) 181 | # Add the emoji before the summarized text 182 | st.write(f"❇️ {summarized_text}") 183 | st.write(f"🔗 {url}") 184 | # Create an empty line for a gap 185 | st.markdown("\n\n") 186 | 187 | email_body = "" 188 | for title, summarized_text, url in st.session_state.summarized_texts: 189 | email_body += f"❇️{title}\n\n" 190 | email_body += f"💬{summarized_text}\n\n" 191 | email_body += f"🔗{url}\n\n" 192 | 193 | # Send the email 194 | send_email_mailgun( 195 | subject="🤖🤯 This week news about AI", #you can change "AI" to accept the user query variable instead of hardcoded word, but I prefer it like this 196 | #because my keywords sometimes get weird and long 197 | body=email_body, 198 | to=recipient_mail, 199 | from_email=sending_mail, 200 | mailgun_domain=mailgun_domain, 201 | mailgun_api_key=mailgun_api 202 | ) 203 | 204 | return openai_api_key 205 | 206 | if __name__ == "__main__": 207 | main() 208 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | requests 3 | numpy 4 | newspaper3k 5 | scikit-learn 6 | langchain 7 | mailgun 8 | tiktoken 9 | openai 10 | --------------------------------------------------------------------------------