├── .streamlit
│   └── config.toml
├── requirements.txt
├── YouTube-Assistant.png
├── Dockerfile
├── README.md
├── main.py
├── langchain_helper.py
└── .gitignore
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
[theme]
base="dark"
primaryColor="purple"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv
langchain
openai
youtube-transcript-api
faiss-cpu
streamlit
--------------------------------------------------------------------------------
/YouTube-Assistant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rishabkumar7/youtube-assistant-langchain/HEAD/YouTube-Assistant.png
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.9.2
WORKDIR /youtube-assistant
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
COPY . .
EXPOSE 8501
CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# YouTube Assistant

Ask questions about any YouTube video with this LLM-powered assistant.

## Running it locally

Install the required packages:

```bash
pip install -r requirements.txt
```

Run the Streamlit app:

```bash
streamlit run main.py
```

![YouTube Assistant App](/YouTube-Assistant.png)

## Hosted On

The web app is built with Streamlit and hosted on [Azure Container Apps](https://azure.microsoft.com/en-ca/products/container-apps).

## Author

- Twitter: [@rishabkumar7](https://twitter.com/rishabk7)
- LinkedIn: [rishabkumar7](https://linkedin.com/in/rishabkumar7)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import streamlit as st
import langchain_helper as lch
import textwrap

st.title("YouTube Assistant")

with st.sidebar:
    with st.form(key='my_form'):
        # Use plain st.* calls here so the widgets are placed inside the form;
        # st.sidebar.* would append them to the sidebar outside the form.
        youtube_url = st.text_area(
            label="What is the YouTube video URL?",
            max_chars=50
        )
        query = st.text_area(
            label="Ask me about the video",
            max_chars=50,
            key="query"
        )
        openai_api_key = st.text_input(
            label="OpenAI API Key",
            key="langchain_search_api_key_openai",
            max_chars=50,
            type="password"
        )
        "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
        "[View the source code](https://github.com/rishabkumar7/youtube-assistant-langchain)"
        submit_button = st.form_submit_button(label='Submit')

if query and youtube_url:
    if not openai_api_key:
        st.info("Please add your OpenAI API key to continue.")
        st.stop()
    else:
        db = lch.create_db_from_youtube_video_url(youtube_url)
        response, docs = lch.get_response_from_query(db, query)
        st.subheader("Answer:")
        st.text(textwrap.fill(response, width=85))
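Editor's note: the form above collects an OpenAI API key, but `langchain_helper.py` (next file) reads the key from the environment via `python-dotenv`, so the value typed into the sidebar is never used. A minimal sketch of one way to forward it, assuming the legacy LangChain wrappers shown in this repo, which accept an optional `openai_api_key` argument; the added parameter is illustrative, not part of the repository:

```python
# Hypothetical variant of the helper that accepts the key collected in main.py.
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS


def create_db_from_youtube_video_url(video_url: str, openai_api_key: str = None) -> FAISS:
    # Falls back to the OPENAI_API_KEY environment variable when no key is passed.
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    transcript = YoutubeLoader.from_youtube_url(video_url).load()
    docs = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(transcript)
    return FAISS.from_documents(docs, embeddings)


# get_response_from_query could take the same argument and pass it to OpenAI(...).
# main.py would then call:
# db = lch.create_db_from_youtube_video_url(youtube_url, openai_api_key=openai_api_key)
```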
--------------------------------------------------------------------------------
/langchain_helper.py:
--------------------------------------------------------------------------------
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv


# Load OPENAI_API_KEY from a local .env file so the OpenAI clients can authenticate.
load_dotenv()
embeddings = OpenAIEmbeddings()


def create_db_from_youtube_video_url(video_url: str) -> FAISS:
    # Fetch the video transcript, split it into overlapping chunks,
    # and index the chunks in a FAISS vector store.
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(transcript)

    db = FAISS.from_documents(docs, embeddings)
    return db


def get_response_from_query(db, query, k=4):
    """
    text-davinci-003 can handle up to 4097 tokens. Setting the chunk size to 1000 and k to 4
    maximizes the number of tokens to analyze.
    """

    docs = db.similarity_search(query, k=k)
    docs_page_content = " ".join([d.page_content for d in docs])

    llm = OpenAI(model_name="text-davinci-003")

    prompt = PromptTemplate(
        input_variables=["question", "docs"],
        template="""
        You are a helpful assistant that can answer questions about YouTube videos
        based on the video's transcript.

        Answer the following question: {question}
        By searching the following video transcript: {docs}

        Only use the factual information from the transcript to answer the question.

        If you feel like you don't have enough information to answer the question, say "I don't know".

        Your answers should be verbose and detailed.
        """,
    )

    chain = LLMChain(llm=llm, prompt=prompt)

    response = chain.run(question=query, docs=docs_page_content)
    response = response.replace("\n", "")
    return response, docs
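For reference, a minimal sketch of using the helper module above outside Streamlit, assuming `OPENAI_API_KEY` is set in the environment or a local `.env` file; the video URL below is a placeholder:

```python
import textwrap

import langchain_helper as lch

# Placeholder URL; any public YouTube video with a transcript should work.
video_url = "https://www.youtube.com/watch?v=VIDEO_ID"

db = lch.create_db_from_youtube_video_url(video_url)
response, docs = lch.get_response_from_query(db, "What is the video about?")
print(textwrap.fill(response, width=85))
```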
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------