├── SDG.pdf
├── static
│   ├── docs
│   │   └── SDG.pdf
│   └── output
│       └── QA.csv
├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
├── script.py
├── app.py
└── templates
    └── index.html
/SDG.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIAnytime/Question-Answer-Generation-App/HEAD/SDG.pdf
--------------------------------------------------------------------------------
/static/docs/SDG.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIAnytime/Question-Answer-Generation-App/HEAD/static/docs/SDG.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain
openai
pypdf
tiktoken
aiofiles
fastapi
uvicorn
jinja2
python-multipart
PyPDF2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Question-Answer-Generation-App
A question-and-answer generation app for documents, primarily suited to teachers and others in academia.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 AI Anytime

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/script.py:
--------------------------------------------------------------------------------
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os


os.environ["OPENAI_API_KEY"] = ""

# Set file path
file_path = 'SDG.pdf'

# Load data from PDF
loader = PyPDFLoader(file_path)
data = loader.load()

# Concatenate all pages into one string for question generation
question_gen = ''
for page in data:
    question_gen += page.page_content

# Large chunks for question generation (the refine chain walks these in order)
splitter_ques_gen = TokenTextSplitter(
    model_name='gpt-3.5-turbo',
    chunk_size=10000,
    chunk_overlap=200
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

# Smaller chunks for answer retrieval
splitter_ans_gen = TokenTextSplitter(
    model_name='gpt-3.5-turbo',
    chunk_size=1000,
    chunk_overlap=100
)

document_answer_gen = splitter_ans_gen.split_documents(document_ques_gen)

llm_ques_gen_pipeline = ChatOpenAI(
    temperature=0.3,
    model="gpt-3.5-turbo"
)

prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exams and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have already drafted some practice questions: {existing_answer}.
We have the option to refine the existing questions or add new ones
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

# Refine chain: draft questions from the first chunk, then refine them with
# each subsequent chunk
ques_gen_chain = load_summarize_chain(
    llm=llm_ques_gen_pipeline,
    chain_type="refine",
    verbose=True,
    question_prompt=PROMPT_QUESTIONS,
    refine_prompt=REFINE_PROMPT_QUESTIONS
)

ques = ques_gen_chain.run(document_ques_gen)

print(ques)

# Build a FAISS index over the smaller answer-generation chunks
embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(document_answer_gen, embeddings)

llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

ques_list = ques.split("\n")

answer_generation_chain = RetrievalQA.from_chain_type(
    llm=llm_answer_gen,
    chain_type="stuff",
    retriever=vector_store.as_retriever()
)

# Answer each question and save to a file
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\n\n")
    # Save answer to file
    with open("answers.txt", "a") as f:
        f.write("Question: " + question + "\n")
        f.write("Answer: " + answer + "\n")
        f.write("--------------------------------------------------\n\n")
--------------------------------------------------------------------------------
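Note: script.py and app.py (below) both use the pre-0.1 `langchain.*` import paths, consistent with the unpinned `langchain` entry in requirements.txt. On LangChain 0.1 and later these classes live in split packages; a rough sketch of the equivalent imports, assuming the `langchain-openai`, `langchain-community`, and `langchain-text-splitters` packages are installed (none of which appear in requirements.txt):

# Sketch only: equivalent imports on LangChain >= 0.1.
# Assumes langchain-openai, langchain-community, and langchain-text-splitters
# are installed; none are listed in requirements.txt.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import TokenTextSplitter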
/app.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI, Form, Request, Response, File
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi.encoders import jsonable_encoder
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
import json
import uvicorn
import aiofiles
from PyPDF2 import PdfReader
import csv

app = FastAPI()

app.mount("/static", StaticFiles(directory="static"), name="static")

templates = Jinja2Templates(directory="templates")

os.environ["OPENAI_API_KEY"] = ""


def count_pdf_pages(pdf_path):
    """Return the page count of a PDF, or None if it cannot be read."""
    try:
        pdf = PdfReader(pdf_path)
        return len(pdf.pages)
    except Exception as e:
        print("Error:", e)
        return None


def file_processing(file_path):
    """Load a PDF and split it into chunks for question and answer generation."""

    # Load data from PDF
    loader = PyPDFLoader(file_path)
    data = loader.load()

    # Concatenate all pages into one string for question generation
    question_gen = ''
    for page in data:
        question_gen += page.page_content

    # Large chunks for question generation (the refine chain walks these in order)
    splitter_ques_gen = TokenTextSplitter(
        model_name='gpt-3.5-turbo',
        chunk_size=10000,
        chunk_overlap=200
    )

    chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

    document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

    # Smaller chunks for answer retrieval
    splitter_ans_gen = TokenTextSplitter(
        model_name='gpt-3.5-turbo',
        chunk_size=1000,
        chunk_overlap=100
    )

    document_answer_gen = splitter_ans_gen.split_documents(document_ques_gen)

    return document_ques_gen, document_answer_gen


def llm_pipeline(file_path):
    """Generate questions from the PDF and build a retrieval chain for answers."""

    document_ques_gen, document_answer_gen = file_processing(file_path)

    llm_ques_gen_pipeline = ChatOpenAI(
        temperature=0.3,
        model="gpt-3.5-turbo"
    )

    prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exams and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

    PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

    refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have already drafted some practice questions: {existing_answer}.
We have the option to refine the existing questions or add new ones
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
    )

    REFINE_PROMPT_QUESTIONS = PromptTemplate(
        input_variables=["existing_answer", "text"],
        template=refine_template,
    )

    ques_gen_chain = load_summarize_chain(
        llm=llm_ques_gen_pipeline,
        chain_type="refine",
        verbose=True,
        question_prompt=PROMPT_QUESTIONS,
        refine_prompt=REFINE_PROMPT_QUESTIONS
    )

    ques = ques_gen_chain.run(document_ques_gen)

    # Build a FAISS index over the smaller answer-generation chunks
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(document_answer_gen, embeddings)

    llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

    # Keep only lines that look like complete questions or statements
    ques_list = ques.split("\n")
    filtered_ques_list = [element for element in ques_list
                          if element.endswith('?') or element.endswith('.')]

    answer_generation_chain = RetrievalQA.from_chain_type(
        llm=llm_answer_gen,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )

    return answer_generation_chain, filtered_ques_list


def get_csv(file_path):
    """Run the pipeline and write the question-answer pairs to a CSV file."""
    answer_generation_chain, ques_list = llm_pipeline(file_path)
    base_folder = 'static/output/'
    if not os.path.isdir(base_folder):
        os.mkdir(base_folder)
    output_file = base_folder + "QA.csv"
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Question", "Answer"])  # Write the header row

        for question in ques_list:
            print("Question: ", question)
            answer = answer_generation_chain.run(question)
            print("Answer: ", answer)
            print("--------------------------------------------------\n\n")

            # Save the question-answer pair to the CSV file
            csv_writer.writerow([question, answer])
    return output_file


@app.get("/")
async def index(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/upload")
async def upload(request: Request, pdf_file: bytes = File(), filename: str = Form(...)):
    base_folder = 'static/docs/'
    if not os.path.isdir(base_folder):
        os.mkdir(base_folder)
    pdf_filename = os.path.join(base_folder, filename)

    # Write the uploaded bytes to disk asynchronously
    async with aiofiles.open(pdf_filename, 'wb') as f:
        await f.write(pdf_file)
    # Optional page-count guard, left disabled by the author:
    # page_count = count_pdf_pages(pdf_filename)
    # if page_count > 5:
    #     return Response(jsonable_encoder(json.dumps({"msg": 'error'})))
    response_data = jsonable_encoder(json.dumps({"msg": 'success', "pdf_filename": pdf_filename}))
    return Response(response_data)


@app.post("/analyze")
async def analyze(request: Request, pdf_filename: str = Form(...)):
    output_file = get_csv(pdf_filename)
    response_data = jsonable_encoder(json.dumps({"output_file": output_file}))
    return Response(response_data)


if __name__ == "__main__":
    uvicorn.run("app:app", host='0.0.0.0', port=8000, reload=True)
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
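(The markup of templates/index.html was not preserved in this dump; whatever it contains must post to the /upload and /analyze routes above.)

For reference, a minimal client sketch that exercises those two endpoints end to end. It assumes the app is running locally (`python app.py` or `uvicorn app:app`), that an `SDG.pdf` exists in the working directory, and that the `requests` package is installed (it is not listed in requirements.txt); the multipart field names come directly from the `File`/`Form` parameters in app.py.

# client_example.py -- illustrative sketch, not part of the original repo.
import json

import requests

BASE_URL = "http://localhost:8000"

# 1. Upload the PDF. The field names "pdf_file" and "filename" match the
#    File()/Form() parameters of the /upload route in app.py.
with open("SDG.pdf", "rb") as f:
    upload_resp = requests.post(
        f"{BASE_URL}/upload",
        files={"pdf_file": f},
        data={"filename": "SDG.pdf"},
    )
upload_data = json.loads(upload_resp.text)
print(upload_data)  # expected: {"msg": "success", "pdf_filename": "static/docs/SDG.pdf"}

# 2. Run question-answer generation on the saved file. This blocks until the
#    whole LLM pipeline finishes and returns the path of the generated CSV.
analyze_resp = requests.post(
    f"{BASE_URL}/analyze",
    data={"pdf_filename": upload_data["pdf_filename"]},
)
print(json.loads(analyze_resp.text))  # expected: {"output_file": "static/output/QA.csv"}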