├── .env.example ├── .gitignore ├── Dockerfile ├── README.md ├── api.py ├── doc.py ├── nltk.txt ├── requirements.txt ├── summary.py ├── templates └── index.html └── web.py /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=... -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | Procfile 3 | AWS* 4 | __pycache__/ 5 | *.docx -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python base image 2 | FROM python:3.9-slim 3 | 4 | # copy the requirements file into the image 5 | COPY ./requirements.txt /app/requirements.txt 6 | 7 | # switch working directory 8 | WORKDIR /app 9 | 10 | # install the dependencies and packages in the requirements file 11 | RUN pip install -r requirements.txt 12 | 13 | # copy every content from the local file to the image 14 | COPY . 
/app 15 | 16 | # configure the container to run in an executed manner 17 | ENTRYPOINT [ "python" ] 18 | 19 | # Start the application 20 | CMD ["api.py"] 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | pip install -r requirements.txt 4 | 5 | # Configure 6 | 7 | Copy and paste .env.example to .env 8 | input your openai api key 9 | 10 | # Run 11 | 12 | python api.py 13 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, jsonify, request 2 | from urllib.parse import unquote 3 | 4 | from decouple import config 5 | 6 | from web import decode_website 7 | # from doc import decoded_doc 8 | from summary import summarize_webpage 9 | 10 | import os 11 | import nltk 12 | nltk.data.path.append('nltk_data') 13 | 14 | os.environ["OPENAI_API_KEY"] = config('OPENAI_API_KEY') 15 | app = Flask(__name__) 16 | 17 | @app.route('/') 18 | def home(): 19 | return render_template('index.html') 20 | 21 | @app.route('/api/data', methods=['GET','POST']) 22 | def get_data(): 23 | if request.method == 'GET': 24 | sample_data = { 25 | 'message': 'Hello, Flask API!', 26 | 'data': [1, 2, 3, 4, 5] 27 | } 28 | print ("DEBUG",sample_data) 29 | return jsonify(sample_data) 30 | elif request.method == 'POST': 31 | print ("DEBUG request",request) 32 | encode_url = unquote(unquote(request.args.get('url'))) 33 | print ("DEBUG encode_url",encode_url) 34 | if not encode_url: 35 | return jsonify({'error': 'URL is required'}), 400 36 | 37 | decoded_text = decode_website(encode_url) 38 | 39 | print ("DEBUG decoded_text",decoded_text) 40 | 41 | summary = summarize_webpage(decoded_text) 42 | 43 | response = { 44 | 'submitted_url': encode_url, 45 | 'summary': summary, 46 | } 47 | 48 | return 
jsonify(response) 49 | 50 | 51 | if __name__ == "__main__": 52 | port = int(os.environ.get('PORT', 5001)) 53 | app.run(debug=True, host='0.0.0.0', port=port)  # NOTE(review): debug=True enables the Werkzeug debugger; disable before deploying 54 | 55 | 56 | -------------------------------------------------------------------------------- /doc.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import UnstructuredWordDocumentLoader 2 | 3 | def decoded_doc(url) :  # load a Word document at `url` and return its pages concatenated into one space-joined string 4 | loader = UnstructuredWordDocumentLoader(url) 5 | data = loader.load() 6 | web_text = "" 7 | 8 | for page in data: 9 | web_text += page.page_content + " " 10 | print(web_text)  # NOTE(review): debug print left in — dumps the full document text to stdout 11 | return web_text 12 | 13 | -------------------------------------------------------------------------------- /nltk.txt: -------------------------------------------------------------------------------- 1 | wordnet 2 | pros_cons 3 | reuters 4 | maxent_treebank_pos_tagger 5 | universal_tagset 6 | punkt 7 | averaged_perceptron_tagger_ru 8 | averaged_perceptron_tagger 9 | snowball_data 10 | rslp 11 | porter_test 12 | vader_lexicon 13 | treebank 14 | dependency_treebank -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask 2 | gunicorn 3 | langchain 4 | openai==0.27.0 5 | tiktoken 6 | urllib3 7 | bs4 8 | selenium 9 | unstructured 10 | python-decouple 11 | nltk 12 | python-magic 13 | python-dotenv -------------------------------------------------------------------------------- /summary.py: -------------------------------------------------------------------------------- 1 | from langchain import OpenAI 2 | from langchain.text_splitter import CharacterTextSplitter 3 | from langchain.docstore.document import Document 4 | from langchain.chains.summarize import load_summarize_chain 5 | import textwrap 6 | 7 | def summarize_webpage(text):  # summarize raw page text with an LLM map-reduce chain; returns the summary wrapped to 100 columns 8 | llm = OpenAI(temperature=0)  # temperature=0 for maximally deterministic summaries 9 | text_splitter = CharacterTextSplitter() 10 | texts 
= text_splitter.split_text(text) 11 | print(len(texts))  # NOTE(review): debug print of chunk count left in 12 | docs = [Document(page_content=t) for t in texts[:4]]  # NOTE(review): only the first 4 chunks are summarized — presumably to cap token cost; confirm this truncation is intended 13 | chain = load_summarize_chain(llm, 14 | chain_type="map_reduce")  # map_reduce: summarize each chunk separately, then combine the partial summaries 15 | 16 | 17 | output_summary = chain.run(docs) 18 | wrapped_text = textwrap.fill(output_summary, width=100)  # wrap to 100 columns for readable console output 19 | print(wrapped_text) 20 | 21 | return wrapped_text -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 |