├── README.md ├── requirements.txt ├── web-qa.ipynb └── web-qa.py /README.md: -------------------------------------------------------------------------------- 1 | # Web Q&A with Embeddings 2 | 3 | Learn how to crawl your website and build a Q/A bot with the OpenAI API. You can find the full tutorial in the [OpenAI documentation](https://platform.openai.com/docs/tutorials/web-qa-embeddings). 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | appnope==0.1.3 4 | asttokens==2.2.1 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | backcall==0.2.0 8 | beautifulsoup4==4.11.1 9 | blobfile==2.0.1 10 | bs4==0.0.1 11 | certifi==2023.7.22 12 | charset-normalizer==2.1.1 13 | comm==0.1.2 14 | contourpy==1.0.7 15 | cycler==0.11.0 16 | debugpy==1.6.5 17 | decorator==5.1.1 18 | docopt==0.6.2 19 | entrypoints==0.4 20 | executing==1.2.0 21 | filelock==3.9.0 22 | fonttools==4.38.0 23 | frozenlist==1.3.3 24 | huggingface-hub>=0.0.12 25 | idna==3.4 26 | ipykernel==6.20.1 27 | ipython==8.10.0 28 | jedi==0.18.2 29 | joblib==1.2.0 30 | jupyter_client==7.4.8 31 | jupyter_core==5.1.3 32 | kiwisolver==1.4.4 33 | lxml==4.9.2 34 | matplotlib==3.6.3 35 | matplotlib-inline==0.1.6 36 | multidict==6.0.4 37 | nest-asyncio==1.5.6 38 | numpy==1.24.1 39 | openai==0.26.1 40 | packaging==23.0 41 | pandas==1.5.2 42 | parso==0.8.3 43 | pexpect==4.8.0 44 | pickleshare==0.7.5 45 | Pillow==9.4.0 46 | pipreqs==0.4.12 47 | platformdirs==2.6.2 48 | plotly==5.12.0 49 | prompt-toolkit==3.0.36 50 | psutil==5.9.4 51 | ptyprocess==0.7.0 52 | pure-eval==0.2.2 53 | pycryptodomex==3.17 54 | Pygments==2.15.0 55 | pyparsing==3.0.9 56 | python-dateutil==2.8.2 57 | pytz==2022.7.1 58 | PyYAML==6.0 59 | pyzmq==24.0.1 60 | regex==2022.10.31 61 | requests==2.31.0 62 | scikit-learn==1.2.0 63 | scipy==1.10.0 64 | six==1.16.0 65 | soupsieve==2.3.2.post1 66 | stack-data==0.6.2 67 | tenacity==8.1.0 68 | threadpoolctl==3.1.0 69 | tiktoken==0.1.2 70 | tokenizers==0.13.2 71 | tornado==6.3.3 72 | tqdm==4.64.1 73 | traitlets==5.8.1 74 | transformers==4.30.0 75 | typing_extensions==4.4.0 76 | urllib3==1.26.13 77 | wcwidth==0.2.5 78 | yarg==0.1.9 79 | yarl==1.8.2 80 | -------------------------------------------------------------------------------- /web-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "https://openai.com/\n", 13 | "https://openai.com/blog/tags/announcements\n", 14 | "https://openai.com/blog/introducing-openai\n", 15 | "https://openai.com/blog/authors/ilya\n", 16 | "https://openai.com/blog/requests-for-research-2\n", 17 | "https://openai.com/blog/authors/diederik\n", 18 | "https://openai.com/blog/block-sparse-gpu-kernels\n", 19 | "https://openai.com/blog/authors/alec\n", 20 | "https://openai.com/blog/fine-tuning-gpt-2\n", 21 | "https://openai.com/blog/authors/paul\n", 22 | "https://openai.com/blog/concrete-ai-safety-problems\n", 23 | "https://openai.com/blog/learning-to-summarize-with-human-feedback\n", 24 | "https://openai.com/blog/authors/long\n", 25 | "https://openai.com/blog/authors/lowe\n", 26 | "https://openai.com/blog/learning-to-cooperate-compete-and-communicate\n", 27 | "https://openai.com/blog/authors/jean\n", 28 | 
"https://openai.com/blog/authors/igor\n", 29 | "https://openai.com/blog/neural-mmo\n", 30 | "https://openai.com/blog/authors/phillip\n", 31 | "https://openai.com/blog/evolved-policy-gradients\n", 32 | "https://openai.com/blog/authors/richard\n", 33 | "https://openai.com/blog/better-exploration-with-parameter-noise\n", 34 | "https://openai.com/blog/authors/xi\n", 35 | "https://openai.com/blog/authors/matthias\n", 36 | "https://openai.com/blog/solving-rubiks-cube\n", 37 | "https://openai.com/blog/authors/ilge\n", 38 | "https://openai.com/blog/vpt\n", 39 | "https://openai.com/blog/authors/brandon\n", 40 | "https://openai.com/blog/authors/raul\n", 41 | "https://openai.com/blog/authors/bowen\n", 42 | "https://openai.com/blog/authors/jie\n", 43 | "https://openai.com/blog/tags/five\n", 44 | "https://openai.com/blog/openai-five-benchmark-results\n", 45 | "https://openai.com/blog/openai-five/#rapid\n", 46 | "https://openai.com/blog/authors/henrique\n", 47 | "https://openai.com/blog/authors/susan\n", 48 | "https://openai.com/blog/authors/brooke\n", 49 | "https://openai.com/blog/authors/michael-petrov\n", 50 | "https://openai.com/blog/multimodal-neurons\n", 51 | "https://openai.com/blog/authors/shan\n", 52 | "https://openai.com/blog/authors/daniela\n", 53 | "https://openai.com/blog/authors/nick\n", 54 | "https://openai.com/blog/authors/chris\n", 55 | "https://openai.com/blog/introducing-activation-atlases\n", 56 | "https://openai.com/blog/authors/ludwig-schubert\n", 57 | "https://openai.com/blog/authors/justin\n", 58 | "https://openai.com/blog/authors/gabriel\n", 59 | "https://openai.com/blog/microscope\n", 60 | "https://openai.com/blog/authors/przemyslaw\n", 61 | "https://openai.com/blog/authors/david\n", 62 | "https://openai.com/blog/authors/jakub-pachocki\n", 63 | "https://openai.com/blog/authors/christy\n", 64 | "https://openai.com/blog/improving-language-model-behavior\n", 65 | "https://openai.com/blog/authors/irene\n", 66 | "https://openai.com/blog/gpt-2-1-5b-release\n", 67 | "https://openai.com/blog/authors/jack-clark\n", 68 | "https://openai.com/blog/cooperation-on-safety\n", 69 | "https://openai.com/blog/authors/amanda\n", 70 | "https://openai.com/blog/ai-safety-needs-social-scientists\n", 71 | "https://openai.com/blog/adversarial-example-research\n", 72 | "https://openai.com/blog/authors/sandy\n", 73 | "https://openai.com/blog/authors/ian\n", 74 | "https://openai.com/blog/machine-learning-unconference\n", 75 | "https://openai.com/events/code-of-conduct.txt\n", 76 | "https://openai.com/blog/authors/rocky\n", 77 | "https://openai.com/blog/authors/nicolas\n", 78 | "https://openai.com/blog/preparing-for-malicious-uses-of-ai\n", 79 | "https://openai.com/blog/authors/michael\n", 80 | "https://openai.com/blog/spam-detection-in-the-physical-world\n", 81 | "https://openai.com/blog/authors/rachel\n", 82 | "https://openai.com/blog/authors/alex-ray\n", 83 | "https://openai.com/blog/generalizing-from-simulation\n", 84 | "https://openai.com/blog/authors/lerrel\n", 85 | "https://openai.com/blog/authors/xue\n", 86 | "https://openai.com/blog/faster-robot-simulation-in-python\n", 87 | "https://openai.com/blog/safety-gym\n", 88 | "https://openai.com/blog/authors/joshua\n", 89 | "https://openai.com/blog/spinning-up-in-deep-rl\n", 90 | "https://openai.com/blog/spinning-up-in-deep-rl-workshop-review\n", 91 | "https://openai.com/blog/hackathon-follow-up\n", 92 | "https://openai.com/blog/authors/parnian\n", 93 | "https://openai.com/blog/openai-hackathon\n", 94 | "https://openai.com/events/hackathon.txt\n", 95 | 
"https://openai.com/blog/authors/josh-tobin\n", 96 | "https://openai.com/blog/report-from-the-self-organizing-conference\n", 97 | "https://openai.com/blog/faulty-reward-functions\n", 98 | "https://openai.com/blog/authors/miles\n", 99 | "https://openai.com/blog/language-model-safety-and-misuse\n", 100 | "https://openai.com/blog/authors/tyna\n", 101 | "https://openai.com/blog/webgpt\n", 102 | "https://openai.com/blog/authors/jacob-hilton\n", 103 | "https://openai.com/blog/measuring-goodharts-law\n", 104 | "https://openai.com/careers/research-engineer\n", 105 | "https://openai.com/blog/authors/leo\n", 106 | "https://openai.com/blog/learning-to-summarize-with-human-feedback/#optimizingtherewardmodel\n", 107 | "https://openai.com/blog/procgen-benchmark\n", 108 | "https://openai.com/blog/first-retro-contest-retrospective\n", 109 | "https://openai.com/blog/authors/oleg\n", 110 | "https://openai.com/blog/roboschool\n", 111 | "https://openai.com/blog/gym-retro\n", 112 | "https://openai.com/blog/authors/vicki\n", 113 | "https://openai.com/blog/retro-contest\n", 114 | "https://openai.com/blog/authors/alex\n", 115 | "https://openai.com/blog/reptile\n", 116 | "https://openai.com/blog/dall-e-2-pre-training-mitigations\n", 117 | "https://openai.com/blog/authors/larissa\n", 118 | "https://openai.com/blog/openai-scholars-2018-final-projects\n", 119 | "https://openai.com/blog/authors/karl\n", 120 | "https://openai.com/blog/grade-school-math\n", 121 | "https://openai.com/blog/authors/vineet\n", 122 | "https://openai.com/blog/authors/christopher\n", 123 | "https://openai.com/blog/quantifying-generalization-in-reinforcement-learning\n", 124 | "https://openai.com/blog/authors/reiichiro\n", 125 | "https://openai.com/blog/authors/suchir\n", 126 | "https://openai.com/blog/authors/katie\n", 127 | "https://openai.com/blog/authors/sandhini\n", 128 | "https://openai.com/blog/authors/pamela\n", 129 | "https://openai.com/blog/authors/steven\n", 130 | "https://openai.com/blog/authors/gretchen\n", 131 | "https://openai.com/blog/authors/jan\n", 132 | "https://openai.com/blog/critiques\n", 133 | "https://openai.com/blog/authors/william-saunders\n", 134 | "https://openai.com/blog/authors/catherine\n", 135 | "https://openai.com/blog/our-approach-to-alignment-research\n", 136 | "https://openai.com/blog/best-practices-for-deploying-language-models\n", 137 | "https://openai.com/blog/instruction-following/#limitations\n", 138 | "https://openai.com/blog/economic-impacts\n", 139 | "https://openai.com/blog/authors/sam-manning\n", 140 | "https://openai.com/scholars\n", 141 | "https://openai.com/blog/openai-scholars-2021-final-projects\n", 142 | "https://openai.com/blog/openai-scholars-2020-final-projects\n", 143 | "https://openai.com/resources\n", 144 | "https://openai.com/blog/openai-scholars-spring-2020\n", 145 | "https://openai.com/blog/openai-scholars-class-of-19\n", 146 | "https://openai.com/blog/openai-scholars-2019-final-projects\n", 147 | "https://openai.com/blog/authors/jonathan\n", 148 | "https://openai.com/blog/discovering-types-for-entity-disambiguation\n", 149 | "https://openai.com/blog/openai-five-benchmark\n", 150 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#arena\n", 151 | "https://openai.com/blog/openai-five/#ourapproach\n", 152 | "https://openai.com/blog/more-on-dota-2/#botexploits\n", 153 | "https://openai.com/blog/openai-five-benchmark-results/#training\n", 154 | "https://openai.com/blog/openai-five-finals\n", 155 | "https://openai.com/five/#overview\n", 156 | 
"https://openai.com/blog/dota-2\n", 157 | "https://openai.com/the-international\n", 158 | "https://openai.com/blog/more-on-dota-2\n", 159 | "https://openai.com/blog/the-international-2018-results\n", 160 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#cooperativemode\n", 161 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions\n", 162 | "https://openai.com/blog/authors/jeff\n", 163 | "https://openai.com/blog/authors/adrien\n", 164 | "https://openai.com/blog/authors/joost\n", 165 | "https://openai.com/blog/authors/peter-zhokhov\n", 166 | "https://openai.com/blog/authors/glenn\n", 167 | "https://openai.com/blog/authors/peter\n", 168 | "https://openai.com/blog/authors/raphael\n", 169 | "https://openai.com/blog/authors/lilian\n", 170 | "https://openai.com/blog/techniques-for-training-large-neural-networks\n", 171 | "https://openai.com/blog/authors/alex-paino\n", 172 | "https://openai.com/blog/authors/nikolas\n", 173 | "https://openai.com/blog/openai-five\n", 174 | "https://openai.com/blog/authors/bob\n", 175 | "https://openai.com/blog/authors/qiming\n", 176 | "https://openai.com/blog/authors/wojciech\n", 177 | "https://openai.com/blog/authors/arthur\n", 178 | "https://openai.com/blog/authors/mateusz\n", 179 | "https://openai.com/blog/authors/maciek\n", 180 | "https://openai.com/blog/authors/jerry\n", 181 | "https://openai.com/blog/authors/lei\n", 182 | "https://openai.com/blog/how-to-train-your-openai-five\n", 183 | "https://openai.com/blog/authors/jonas-schneider\n", 184 | "https://openai.com/jobs/#robotics\n", 185 | "https://openai.com/interview-guide\n", 186 | "https://openai.com/blog/learning-dexterity\n", 187 | "https://openai.com/blog/authors/rafal\n", 188 | "https://openai.com/blog/ingredients-for-robotics-research\n", 189 | "https://openai.com/blog/authors/vikash\n", 190 | "https://openai.com/blog/authors/marcin\n", 191 | "https://openai.com/blog/authors/prafulla\n", 192 | "https://openai.com/blog/authors/szymon-sidor\n", 193 | "https://openai.com/blog/openai-baselines-dqn\n", 194 | "https://openai.com/blog/authors/tamim\n", 195 | "https://openai.com/blog/learning-montezumas-revenge-from-a-single-demonstration\n", 196 | "https://openai.com/blog/authors/bradly\n", 197 | "https://openai.com/blog/authors/rein\n", 198 | "https://openai.com/blog/authors/jonathan-ho\n", 199 | "https://openai.com/blog/learning-a-hierarchy\n", 200 | "https://openai.com/blog/authors/peter-chen\n", 201 | "https://openai.com/blog/authors/kevin\n", 202 | "https://openai.com/blog/authors/filip\n", 203 | "https://openai.com/five\n", 204 | "https://openai.com/blog/authors/yilun\n", 205 | "https://openai.com/blog/authors/joseph\n", 206 | "https://openai.com/blog/interpretable-machine-learning-through-teaching\n", 207 | "https://openai.com/blog/authors/smitha\n", 208 | "https://openai.com/blog/learning-to-model-other-minds\n", 209 | "https://openai.com/blog/authors/shimon\n", 210 | "https://openai.com/blog/authors/maruan\n", 211 | "https://openai.com/blog/authors/jakob-foerster\n", 212 | "https://openai.com/blog/nonlinear-computation-in-linear-networks\n", 213 | "https://openai.com/blog/energy-based-models\n", 214 | "https://openai.com/blog/emergent-tool-use\n", 215 | "https://openai.com/blog/authors/ingmar\n", 216 | "https://openai.com/blog/authors/todor\n", 217 | "https://openai.com/blog/learning-concepts-with-energy-functions\n", 218 | "https://openai.com/blog/authors/yi\n", 219 | "https://openai.com/blog/authors/pieter\n", 220 | "https://openai.com/blog/authors/aviv\n", 221 | 
"https://openai.com/blog/instruction-following\n", 222 | "https://openai.com/blog/learning-to-communicate\n", 223 | "https://openai.com/blog/authors/jon\n", 224 | "https://openai.com/blog/summarizing-books\n", 225 | "https://openai.com/blog/authors/chelsea\n", 226 | "https://openai.com/blog/gathering_human_feedback\n", 227 | "https://openai.com/blog/authors/dario-amodei\n", 228 | "https://openai.com/blog/science-of-ai\n", 229 | "https://openai.com/blog/authors/jared\n", 230 | "https://openai.com/blog/authors/sam\n", 231 | "https://openai.com/blog/gpt-2-6-month-follow-up\n", 232 | "https://openai.com/blog/better-language-models/#update\n", 233 | "https://openai.com/blog/authors/david-luan\n", 234 | "https://openai.com/blog/authors/danny\n", 235 | "https://openai.com/blog/ai-and-efficiency\n", 236 | "https://openai.com/blog/authors/david-lansky\n", 237 | "https://openai.com/blog/authors/tom\n", 238 | "https://openai.com/blog/testing-robustness\n", 239 | "https://openai.com/blog/authors/jacob\n", 240 | "https://openai.com/blog/authors/yi-sun\n", 241 | "https://openai.com/blog/authors/daniel\n", 242 | "https://openai.com/blog/authors/dan\n", 243 | "https://openai.com/blog/deep-reinforcement-learning-from-human-preferences\n", 244 | "https://openai.com/blog/authors/geoffrey\n", 245 | "https://openai.com/blog/debate\n", 246 | "https://openai.com/blog/authors/jeffrey\n", 247 | "https://openai.com/blog/authors/nisan\n", 248 | "https://openai.com/blog/amplifying-ai-training\n", 249 | "https://openai.com/blog/authors/daniel-ziegler\n", 250 | "https://openai.com/blog/baselines-acktr-a2c\n", 251 | "https://openai.com/blog/authors/yuhuai\n", 252 | "https://openai.com/blog/authors/shun\n", 253 | "https://openai.com/blog/authors/elman\n", 254 | "https://openai.com/blog/openai-baselines-ppo\n", 255 | "https://openai.com/blog/language-unsupervised\n", 256 | "https://openai.com/blog/tags/baselines\n", 257 | "https://openai.com/blog/authors/scott\n", 258 | "https://openai.com/blog/sparse-transformer\n", 259 | "https://openai.com/blog/authors/rewon\n", 260 | "https://openai.com/blog/glow\n", 261 | "https://openai.com/blog/authors/john\n", 262 | "https://openai.com/blog/openai-gym-beta\n", 263 | "https://openai.com/blog/authors/tim\n", 264 | "https://openai.com/jobs\n", 265 | "https://openai.com/blog/formal-math\n", 266 | "https://openai.com/blog/authors/stanislas\n", 267 | "https://openai.com/blog/authors/jesse\n", 268 | "https://openai.com/blog/generative-models\n", 269 | "https://openai.com/blog/authors/andrej\n", 270 | "https://openai.com/blog/distill\n", 271 | "https://openai.com/blog/authors/vicki-cheung\n", 272 | "https://openai.com/blog/jukebox\n", 273 | "https://openai.com/projects/five\n", 274 | "https://openai.com/blog/authors/christine\n", 275 | "https://openai.com/blog/authors/jong\n", 276 | "https://openai.com/blog/authors/heewoo\n", 277 | "https://openai.com/blog/musenet\n", 278 | "https://openai.com/blog/better-language-models\n", 279 | "https://openai.com/blog/robots-that-learn\n", 280 | "https://openai.com/blog/authors/ankur\n", 281 | "https://openai.com/blog/authors/erika-reinhardt\n", 282 | "https://openai.com/blog/deep-double-descent\n", 283 | "https://openai.com/blog/authors/tristan\n", 284 | "https://openai.com/blog/authors/preetum\n", 285 | "https://openai.com/blog/authors/boaz\n", 286 | "https://openai.com/blog/authors/yamini\n", 287 | "https://openai.com/blog/authors/gal\n", 288 | "https://openai.com/blog/tags/gpt-2\n", 289 | "https://openai.com/blog/clip\n", 290 | 
"https://openai.com/blog/ai-and-compute\n", 291 | "https://openai.com/blog/authors/girish\n", 292 | "https://openai.com/blog/special-projects\n", 293 | "https://openai.com/blog/authors/sam-altman\n", 294 | "https://openai.com/blog/unsupervised-sentiment-neuron\n", 295 | "https://openai.com/blog/dall-e\n", 296 | "https://openai.com/blog/authors/aditya\n", 297 | "https://openai.com/blog/authors/mark\n", 298 | "https://openai.com/blog/authors/mikhail\n", 299 | "https://openai.com/blog/authors/vedant\n", 300 | "https://openai.com/blog/competitive-self-play\n", 301 | "https://openai.com/blog/authors/trapit\n", 302 | "https://openai.com/blog/meta-learning-for-wrestling\n", 303 | "https://openai.com/blog/authors/yura\n", 304 | "https://openai.com/blog/reinforcement-learning-with-prediction-based-rewards\n", 305 | "https://openai.com/blog/authors/harri\n", 306 | "https://openai.com/blog/image-gpt\n", 307 | "https://openai.com/blog/evolution-strategies\n", 308 | "https://openai.com/blog/infrastructure-for-deep-learning\n", 309 | "https://openai.com/blog/generative-models/#gan\n", 310 | "https://openai.com/blog/generative-models#improving-gans\n", 311 | "https://openai.com/blog/tags/multimodal\n", 312 | "https://openai.com/gpt-3\n", 313 | "https://openai.com/javascript:setMathjaxCookie()\n", 314 | "HTTP Error 404: Not Found\n", 315 | "https://openai.com/abs/2005.14165v1\n", 316 | "HTTP Error 404: Not Found\n", 317 | "https://openai.com/list/cs.CL/new\n", 318 | "HTTP Error 404: Not Found\n", 319 | "https://openai.com/abs/2005.14165v3\n", 320 | "HTTP Error 404: Not Found\n", 321 | "https://openai.com/auth/show-endorsers/2005.14165\n", 322 | "HTTP Error 404: Not Found\n", 323 | "https://openai.com/list/cs/recent\n", 324 | "HTTP Error 404: Not Found\n", 325 | "https://openai.com/abs/2005.14165?context=cs\n", 326 | "HTTP Error 404: Not Found\n", 327 | "https://openai.com/{url_path('ignore_me')}\n", 328 | "HTTP Error 404: Not Found\n", 329 | "https://openai.com/abs/2005.14165v2\n", 330 | "HTTP Error 404: Not Found\n", 331 | "https://openai.com/show-email/b5cb66e9/2005.14165\n", 332 | "HTTP Error 404: Not Found\n", 333 | "https://openai.com/prevnext?id=2005.14165&function=next&context=cs.CL\n", 334 | "HTTP Error 404: Not Found\n", 335 | "https://openai.com/format/2005.14165\n", 336 | "HTTP Error 404: Not Found\n", 337 | "https://openai.com/prevnext?id=2005.14165&function=prev&context=cs.CL\n", 338 | "HTTP Error 404: Not Found\n", 339 | "https://openai.com/pdf/2005.14165\n", 340 | "HTTP Error 404: Not Found\n", 341 | "https://openai.com/tb/2005.14165\n", 342 | "HTTP Error 404: Not Found\n", 343 | "https://openai.com/list/cs.CL/2005\n", 344 | "HTTP Error 404: Not Found\n", 345 | "https://openai.com/list/cs.CL/recent\n", 346 | "HTTP Error 404: Not Found\n", 347 | "https://openai.com/blog/dall-e-2\n", 348 | "https://openai.com/blog/authors/openai\n", 349 | "https://openai.com/blog/improving-verifiability\n", 350 | "https://openai.com/blog/dall-e-2-extending-creativity\n", 351 | "https://openai.com/blog/the-international\n", 352 | "https://openai.com/blog/symposium-2019\n", 353 | "https://openai.com/blog/tags/culture\n", 354 | "https://openai.com/blog/learning-day\n", 355 | "https://openai.com/blog/openai-fellows-fall-2018\n", 356 | "https://openai.com/blog/neurips-2020\n", 357 | "https://openai.com/blog/tags/community\n", 358 | "https://openai.com/blog/universe\n", 359 | "https://openai.com/blog/openai-gym-beta/#rl\n", 360 | "https://openai.com/blog/openai-technical-goals/#goal4\n", 361 | 
"https://openai.com/blog/authors/elon\n", 362 | "https://openai.com/blog/scaling-kubernetes-to-7500-nodes\n", 363 | "https://openai.com/blog/scaling-kubernetes-to-2500-nodes\n", 364 | "https://openai.com/blog/authors/christopher-berner\n", 365 | "https://openai.com/blog/authors/bchess\n", 366 | "https://openai.com/blog/authors/eric\n", 367 | "https://openai.com/blog/forecasting-misuse\n", 368 | "https://openai.com/forecasting-misuse-paper\n", 369 | "https://openai.com/prevnext?id=2301.04246&function=prev&context=cs.CY\n", 370 | "HTTP Error 404: Not Found\n", 371 | "https://openai.com/auth/show-endorsers/2301.04246\n", 372 | "HTTP Error 404: Not Found\n", 373 | "https://openai.com/format/2301.04246\n", 374 | "HTTP Error 404: Not Found\n", 375 | "https://openai.com/pdf/2301.04246\n", 376 | "HTTP Error 404: Not Found\n", 377 | "https://openai.com/show-email/64c5c6bd/2301.04246\n", 378 | "HTTP Error 404: Not Found\n", 379 | "https://openai.com/list/cs.CY/recent\n", 380 | "HTTP Error 404: Not Found\n", 381 | "https://openai.com/prevnext?id=2301.04246&function=next&context=cs.CY\n", 382 | "HTTP Error 404: Not Found\n", 383 | "https://openai.com/list/cs.CY/new\n", 384 | "HTTP Error 404: Not Found\n", 385 | "https://openai.com/list/cs.CY/2301\n", 386 | "HTTP Error 404: Not Found\n", 387 | "https://openai.com/abs/2301.04246?context=cs\n", 388 | "HTTP Error 404: Not Found\n", 389 | "https://openai.com/blog/authors/greg\n", 390 | "https://openai.com/blog/dall-e-api-now-available-in-public-beta\n", 391 | "https://openai.com/blog/api-no-waitlist\n", 392 | "https://openai.com/blog/dall-e-introducing-outpainting\n", 393 | "https://openai.com/blog/team-update\n", 394 | "https://openai.com/blog/chatgpt-plus\n", 395 | "https://openai.com/blog/openai-api\n", 396 | "https://openai.com/jobs/#applied-ai\n", 397 | "https://openai.com/blog/authors/mira\n", 398 | "https://openai.com/join\n", 399 | "Unable to parse page https://openai.com/join due to JavaScript being required\n", 400 | "HTTP Error 403: Forbidden\n", 401 | "https://openai.com/blog/tags/residency\n", 402 | "https://openai.com/blog/openai-licenses-gpt-3-technology-to-microsoft\n", 403 | "https://openai.com/blog/microsoft\n", 404 | "https://openai.com/blog/team-update-august\n", 405 | "https://openai.com/blog/new-ai-classifier-for-indicating-ai-written-text\n", 406 | "https://openai.com/blog/authors/lama\n", 407 | "https://openai.com/blog/authors/scott-aaronson\n", 408 | "https://openai.com/blog/authors/jan-hendrik-kirchner\n", 409 | "https://openai.com/blog/tags/api\n", 410 | "https://openai.com/blog/openai-fellows\n", 411 | "https://openai.com/blog/tags/scholars\n", 412 | "https://openai.com/blog/openai-and-microsoft-extend-partnership\n", 413 | "https://openai.com/blog/dall-e-now-available-without-waitlist\n", 414 | "https://openai.com/blog/helen-toner-joins\n", 415 | "https://openai.com/blog/team-update-january\n", 416 | "https://openai.com/blog/team-plus-plus#interns\n", 417 | "https://openai.com/blog/openai-codex\n", 418 | "https://openai.com/blog/openai-scholars-2019\n", 419 | "https://openai.com/blog/authors/ashley\n", 420 | "https://openai.com/blog/openai-scholars\n", 421 | "https://openai.com/blog/dall-e-now-available-in-beta\n", 422 | "https://openai.com/blog/new-and-improved-embedding-model\n", 423 | "https://openai.com/blog/authors/ryan\n", 424 | "https://openai.com/blog/authors/arvind\n", 425 | "https://openai.com/blog/authors/ted\n", 426 | "https://openai.com/blog/dall-e-2-update\n", 427 | "https://openai.com/blog/authors/joanne\n", 428 
| "https://openai.com/blog/tags/fellows\n", 429 | "https://openai.com/blog/openai-summer-fellows-2018\n", 430 | "https://openai.com/blog/authors/maddie\n", 431 | "https://openai.com/blog/codex-apps\n", 432 | "https://openai.com/blog/codex\n", 433 | "HTTP Error 404: Not Found\n", 434 | "https://openai.com/blog/new-and-improved-content-moderation-tooling\n", 435 | "https://openai.com/blog/authors/teddy\n", 436 | "https://openai.com/blog/authors/angela\n", 437 | "https://openai.com/blog/authors/chong\n", 438 | "https://openai.com/blog/welcome-pieter-and-shivon\n", 439 | "https://openai.com/blog/openai-technical-goals\n", 440 | "https://openai.com/blog/procgen-minerl-competitions\n", 441 | "https://openai.com/blog/will-hurd-joins\n", 442 | "https://openai.com/blog/fund\n", 443 | "https://openai.com/news\n", 444 | "HTTP Error 404: Not Found\n", 445 | "https://openai.com/news/introducing-our-first-investments\n", 446 | "HTTP Error 404: Not Found\n", 447 | "https://openai.com/blog/introducing-text-and-code-embeddings\n", 448 | "https://openai.com/blog/authors/boris\n", 449 | "https://openai.com/blog/openai-scholars-2018-meet-our-scholars\n", 450 | "https://openai.com/blog/team-plus-plus\n", 451 | "https://openai.com/blog/gpt-3-apps\n", 452 | "https://openai.com/jobs/#open\n", 453 | "https://openai.com/blog/customized-gpt-3\n", 454 | "https://openai.com/blog/authors/luke\n", 455 | "https://openai.com/blog/authors/rachel-lim\n", 456 | "https://openai.com/blog/authors/michael-wu\n", 457 | "https://openai.com/blog/openai-supporters\n", 458 | "https://openai.com/blog/openai-residency\n", 459 | "https://openai.com/blog/leadership-team-update\n", 460 | "https://openai.com/blog/organizational-update\n", 461 | "https://openai.com/blog/openai-fellows-interns-2019\n", 462 | "https://openai.com/blog/openai-scholars-2020\n", 463 | "https://openai.com/blog/gpt-3-edit-insert\n", 464 | "https://openai.com/blog/authors/mo\n", 465 | "https://openai.com/blog/openai-pytorch\n", 466 | "https://openai.com/blog/openai-scholars-2019-meet-our-scholars\n", 467 | "https://openai.com/blog/openai-charter\n", 468 | "https://openai.com/blog/openai-and-microsoft\n", 469 | "https://openai.com/blog/openai-lp\n", 470 | "https://openai.com/blog/reducing-bias-and-improving-safety-in-dall-e-2\n", 471 | "https://openai.com/terms\n", 472 | "https://openai.com/api/policies/service-terms\n", 473 | "https://openai.com/api/policies/sharing-publication\n", 474 | "https://openai.com/api/policies/terms\n", 475 | "https://openai.com/security/disclosure\n", 476 | "https://openai.com/blog/whisper\n", 477 | "https://openai.com/blog/authors/tao\n", 478 | "https://openai.com/research\n", 479 | "https://openai.com/api/docs\n", 480 | "Unable to parse page https://openai.com/api/docs due to JavaScript being required\n", 481 | "HTTP Error 403: Forbidden\n", 482 | "https://openai.com/dall-e-2\n", 483 | "https://openai.com/privacy\n", 484 | "https://openai.com/api\n", 485 | "https://openai.com/blog\n", 486 | "https://openai.com/blog/triton\n", 487 | "https://openai.com/blog/authors/philippe\n", 488 | "https://openai.com/jobs/#acceleration\n", 489 | "https://openai.com/blog/robust-adversarial-inputs\n", 490 | "https://openai.com/blog/authors/anish-athalye\n", 491 | "https://openai.com/blog/tags/milestones\n", 492 | "https://openai.com/alignment\n", 493 | "https://openai.com\n", 494 | "https://openai.com/publications\n", 495 | "https://openai.com/charter\n", 496 | "https://openai.com/blog/tags/research\n", 497 | "https://openai.com/fund\n", 498 | 
"https://openai.com/about\n", 499 | "https://openai.com/timeline\n", 500 | "https://openai.com/careers\n", 501 | "https://openai.com/api/examples\n", 502 | "Unable to parse page https://openai.com/api/examples due to JavaScript being required\n", 503 | "HTTP Error 403: Forbidden\n", 504 | "https://openai.com/api/login\n", 505 | "Unable to parse page https://openai.com/api/login due to JavaScript being required\n", 506 | "HTTP Error 403: Forbidden\n", 507 | "https://openai.com/newsroom\n", 508 | "https://openai.com/api/policies\n", 509 | "https://openai.com/api/pricing\n", 510 | "https://openai.com/contact-sales\n", 511 | "https://openai.com/api/pricing/#faq-fine-tuning-pricing-calculation\n", 512 | "https://openai.com/blog/tags/events\n", 513 | "https://openai.com/blog/chatgpt\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "import requests\n", 519 | "import re\n", 520 | "import urllib.request\n", 521 | "from bs4 import BeautifulSoup\n", 522 | "from collections import deque\n", 523 | "from html.parser import HTMLParser\n", 524 | "from urllib.parse import urlparse\n", 525 | "import os\n", 526 | "\n", 527 | "# Regex pattern to match a URL\n", 528 | "HTTP_URL_PATTERN = r'^http[s]*://.+'\n", 529 | "\n", 530 | "# Define root domain to crawl\n", 531 | "domain = \"openai.com\"\n", 532 | "full_url = \"https://openai.com/\"\n", 533 | "\n", 534 | "# Create a class to parse the HTML and get the hyperlinks\n", 535 | "class HyperlinkParser(HTMLParser):\n", 536 | " def __init__(self):\n", 537 | " super().__init__()\n", 538 | " # Create a list to store the hyperlinks\n", 539 | " self.hyperlinks = []\n", 540 | "\n", 541 | " # Override the HTMLParser's handle_starttag method to get the hyperlinks\n", 542 | " def handle_starttag(self, tag, attrs):\n", 543 | " attrs = dict(attrs)\n", 544 | "\n", 545 | " # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks\n", 546 | " if tag == \"a\" and \"href\" in attrs:\n", 547 | " self.hyperlinks.append(attrs[\"href\"])\n", 548 | "\n", 549 | "# Function to get the hyperlinks from a URL\n", 550 | "def get_hyperlinks(url):\n", 551 | " \n", 552 | " # Try to open the URL and read the HTML\n", 553 | " try:\n", 554 | " # Open the URL and read the HTML\n", 555 | " with urllib.request.urlopen(url) as response:\n", 556 | "\n", 557 | " # If the response is not HTML, return an empty list\n", 558 | " if not response.info().get('Content-Type').startswith(\"text/html\"):\n", 559 | " return []\n", 560 | " \n", 561 | " # Decode the HTML\n", 562 | " html = response.read().decode('utf-8')\n", 563 | " except Exception as e:\n", 564 | " print(e)\n", 565 | " return []\n", 566 | "\n", 567 | " # Create the HTML Parser and then Parse the HTML to get hyperlinks\n", 568 | " parser = HyperlinkParser()\n", 569 | " parser.feed(html)\n", 570 | "\n", 571 | " return parser.hyperlinks\n", 572 | "\n", 573 | "# Function to get the hyperlinks from a URL that are within the same domain\n", 574 | "def get_domain_hyperlinks(local_domain, url):\n", 575 | " clean_links = []\n", 576 | " for link in set(get_hyperlinks(url)):\n", 577 | " clean_link = None\n", 578 | "\n", 579 | " # If the link is a URL, check if it is within the same domain\n", 580 | " if re.search(HTTP_URL_PATTERN, link):\n", 581 | " # Parse the URL and check if the domain is the same\n", 582 | " url_obj = urlparse(link)\n", 583 | " if url_obj.netloc == local_domain:\n", 584 | " clean_link = link\n", 585 | "\n", 586 | " # If the link is not a URL, check if it is a relative link\n", 587 | " 
else:\n", 588 | " if link.startswith(\"/\"):\n", 589 | " link = link[1:]\n", 590 | " elif link.startswith(\"#\") or link.startswith(\"mailto:\"):\n", 591 | " continue\n", 592 | " clean_link = \"https://\" + local_domain + \"/\" + link\n", 593 | "\n", 594 | " if clean_link is not None:\n", 595 | " if clean_link.endswith(\"/\"):\n", 596 | " clean_link = clean_link[:-1]\n", 597 | " clean_links.append(clean_link)\n", 598 | "\n", 599 | " # Return the list of hyperlinks that are within the same domain\n", 600 | " return list(set(clean_links))\n", 601 | "\n", 602 | "\n", 603 | "def crawl(url):\n", 604 | " # Parse the URL and get the domain\n", 605 | " local_domain = urlparse(url).netloc\n", 606 | "\n", 607 | " # Create a queue to store the URLs to crawl\n", 608 | " queue = deque([url])\n", 609 | "\n", 610 | " # Create a set to store the URLs that have already been seen (no duplicates)\n", 611 | " seen = set([url])\n", 612 | "\n", 613 | " # Create a directory to store the text files\n", 614 | " if not os.path.exists(\"text/\"):\n", 615 | " os.mkdir(\"text/\")\n", 616 | "\n", 617 | " if not os.path.exists(\"text/\"+local_domain+\"/\"):\n", 618 | " os.mkdir(\"text/\" + local_domain + \"/\")\n", 619 | "\n", 620 | " # Create a directory to store the csv files\n", 621 | " if not os.path.exists(\"processed\"):\n", 622 | " os.mkdir(\"processed\")\n", 623 | "\n", 624 | " # While the queue is not empty, continue crawling\n", 625 | " while queue:\n", 626 | "\n", 627 | " # Get the next URL from the queue\n", 628 | " url = queue.pop()\n", 629 | " print(url) # for debugging and to see the progress\n", 630 | "\n", 631 | " # Save text from the url to a .txt file\n", 632 | " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", 633 | "\n", 634 | " # Get the text from the URL using BeautifulSoup\n", 635 | " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n", 636 | "\n", 637 | " # Get the text but remove the tags\n", 638 | " text = soup.get_text()\n", 639 | "\n", 640 | " # If the crawler gets to a page that requires JavaScript, it will stop the crawl\n", 641 | " if (\"You need to enable JavaScript to run this app.\" in text):\n", 642 | " print(\"Unable to parse page \" + url + \" due to JavaScript being required\")\n", 643 | " \n", 644 | " # Otherwise, write the text to the file in the text directory\n", 645 | " f.write(text)\n", 646 | "\n", 647 | " # Get the hyperlinks from the URL and add them to the queue\n", 648 | " for link in get_domain_hyperlinks(local_domain, url):\n", 649 | " if link not in seen:\n", 650 | " queue.append(link)\n", 651 | " seen.add(link)\n", 652 | "\n", 653 | "crawl(full_url)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 1, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "def remove_newlines(serie):\n", 663 | " serie = serie.str.replace('\\n', ' ')\n", 664 | " serie = serie.str.replace('\\\\n', ' ')\n", 665 | " serie = serie.str.replace(' ', ' ')\n", 666 | " serie = serie.str.replace(' ', ' ')\n", 667 | " return serie" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "import pandas as pd\n", 677 | "\n", 678 | "# Create a list to store the text files\n", 679 | "texts=[]\n", 680 | "\n", 681 | "# Get all the text files in the text directory\n", 682 | "for file in os.listdir(\"text/\" + domain + \"/\"):\n", 683 | "\n", 684 | " # Open the file and read the text\n", 685 | " with open(\"text/\" 
+ domain + \"/\" + file, \"r\") as f:\n", 686 | " text = f.read()\n", 687 | "\n", 688 | " # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.\n", 689 | " texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))\n", 690 | "\n", 691 | "# Create a dataframe from the list of texts\n", 692 | "df = pd.DataFrame(texts, columns = ['fname', 'text'])\n", 693 | "\n", 694 | "# Set the text column to be the raw text with the newlines removed\n", 695 | "df['text'] = df.fname + \". \" + remove_newlines(df.text)\n", 696 | "df.to_csv('processed/scraped.csv')\n", 697 | "df.head()" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 26, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "" 709 | ] 710 | }, 711 | "execution_count": 26, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | }, 715 | { 716 | "data": { 717 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAikAAAGdCAYAAADXIOPgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlRElEQVR4nO3df3RU9Z3/8VcSJhMCTELATEhJEIsFIyAKNcy2da2EBJrjas05iy3HTS0Ht2zwVNOlmi4iP9oTDtuv2tqIPbsW3LOlbOkpuiJiYhRYa/iVmkrAZsWlG3dxkhaaBIhMhuTz/cPv3K9jgjB4yXwmeT7OyTncez/zmc/7PdfJy5m5mSRjjBEAAIBlkuO9AAAAgIEQUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAVhoR7wVcjr6+Pp04cUJjxoxRUlJSvJcDAAAugTFGp0+fVm5urpKTL/46SUKGlBMnTigvLy/eywAAAJfhvffe08SJEy86LiFDypgxYyR9WKTP53Nt3nA4rNraWhUXF8vj8bg2byKhB/RAogcSPRju9Uv0QHK/B11dXcrLy3N+j19MQoaUyFs8Pp/P9ZCSnp4un883rE9IekAP6AE9GO71S/RAunI9uNSPavDBWQAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArjYj3Amw0ffXLCvVe2tdI2+IP60vjvQQAAFzFKykAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABY6VOFlPXr1yspKUkPPPCAs+/cuXOqqKjQuHHjNHr0aJWVlamtrS3qdq2trSotLVV6erqys7O1YsUKnT9//tMsBQAADDGXHVIOHjyon/70p5o5c2bU/gcffFAvvPCCtm3bpj179ujEiRO66667nOO9vb0qLS1VT0+P3njjDT377LPavHmzVq1adflVAACAIeeyQsqZM2e0ePFi/dM//ZPGjh3r7O/s7NQzzzyjxx57TLfddptmz56tTZs26Y033tC+ffskSbW1tTp69Kj+9V//VbNmzdLChQu1bt061dTUqKenx52qAABAwhtxOTeqqKhQaWmpioqK9P3vf9/Z39jYqHA4rKKiImfftGnTlJ+fr4aGBs2dO1cNDQ2aMWOG/H6/M6akpETLli3TkSNHdOONN/a7v1AopFAo5Gx3dXVJksLhsMLh8OWUMKDIXN5k49qcg8WtPkTmcbOviYYe0AOJHgz3+iV6ILnfg1jniTmkbN26Vb/97W918ODBfseCwaBSU1OVmZkZtd/v9ysYDDpjPhpQIscjxwZSXV2tNWvW9NtfW1ur9PT0WEu4qHVz+lyf80rbuXOnq/PV1dW5Ol8iogf0QKIHw71+iR5I7vWgu7s7pvExhZT33ntP3/72t1VXV6e0tLSY7ujTqKqqUmVlpbPd1dWlvLw8FRcXy+fzuXY/4XBYdXV1euRQskJ9Sa7NOxiaV5e4Mk+kB/Pnz5fH43FlzkRDD+iBRA+Ge/0SPZDc70HknZBLFVNIaWxsVHt7u2666SZnX29vr/bu3auf/OQnevnll9XT06OOjo6oV1Pa2tqUk5MjScrJydGBAwei5o1c/RMZ83Fer1der7fffo/Hc0VOnFBfkkK9iRVS3O7DleptIqEH9ECiB8O9fokeSO71INY5Yvrg7Lx583T48GE1NTU5P3PmzNHixYudf3s8HtXX1zu3aWlpUWtrqwKBgCQpEAjo8OHDam9vd8bU1dXJ5/OpoKAgpsUDAIChK6ZXUsaMGaPp06dH7Rs1apTGjRvn7F+yZIkqKyuVlZUln8+n+++/X4FAQHPnzpUkFRcXq6CgQPfcc482bNigYDColStXqqKiYsBXSwAAwPB0WVf3fJLHH39cycnJKisrUygUUklJiZ566inneEpKinbs2KFly5YpEAho1KhRKi8v19q1a91eCgAASGCfOqTs3r07ajstLU01NTWqqam54G0mTZrk+tUoAABgaOG7ewAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEq
[figure: histogram of n_tokens per row]", 718 | "text/plain": [ 719 |
" 720 | ] 721 | }, 722 | "metadata": {}, 723 | "output_type": "display_data" 724 | } 725 | ], 726 | "source": [ 727 | "import tiktoken\n", 728 | "\n", 729 | "# Load the cl100k_base tokenizer which is designed to work with the ada-002 model\n", 730 | "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", 731 | "\n", 732 | "df = pd.read_csv('processed/scraped.csv', index_col=0)\n", 733 | "df.columns = ['title', 'text']\n", 734 | "\n", 735 | "# Tokenize the text and save the number of tokens to a new column\n", 736 | "df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n", 737 | "\n", 738 | "# Visualize the distribution of the number of tokens per row using a histogram\n", 739 | "df.n_tokens.hist()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 27, 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [ 748 | "max_tokens = 500\n", 749 | "\n", 750 | "# Function to split the text into chunks of a maximum number of tokens\n", 751 | "def split_into_many(text, max_tokens = max_tokens):\n", 752 | "\n", 753 | " # Split the text into sentences\n", 754 | " sentences = text.split('. ')\n", 755 | "\n", 756 | " # Get the number of tokens for each sentence\n", 757 | " n_tokens = [len(tokenizer.encode(\" \" + sentence)) for sentence in sentences]\n", 758 | " \n", 759 | " chunks = []\n", 760 | " tokens_so_far = 0\n", 761 | " chunk = []\n", 762 | "\n", 763 | " # Loop through the sentences and tokens joined together in a tuple\n", 764 | " for sentence, token in zip(sentences, n_tokens):\n", 765 | "\n", 766 | " # If the number of tokens so far plus the number of tokens in the current sentence is greater \n", 767 | " # than the max number of tokens, then add the chunk to the list of chunks and reset\n", 768 | " # the chunk and tokens so far\n", 769 | " if tokens_so_far + token > max_tokens:\n", 770 | " chunks.append(\". \".join(chunk) + \".\")\n", 771 | " chunk = []\n", 772 | " tokens_so_far = 0\n", 773 | "\n", 774 | " # If the number of tokens in the current sentence is greater than the max number of \n", 775 | " # tokens, go to the next sentence\n", 776 | " if token > max_tokens:\n", 777 | " continue\n", 778 | "\n", 779 | " # Otherwise, add the sentence to the chunk and add the number of tokens to the total\n", 780 | " chunk.append(sentence)\n", 781 | " tokens_so_far += token + 1\n", 782 | "\n", 783 | " # Add the last chunk to the list of chunks\n", 784 | " if chunk:\n", 785 | " chunks.append(\". 
\".join(chunk) + \".\")\n", 786 | "\n", 787 | " return chunks\n", 788 | " \n", 789 | "\n", 790 | "shortened = []\n", 791 | "\n", 792 | "# Loop through the dataframe\n", 793 | "for row in df.iterrows():\n", 794 | "\n", 795 | " # If the text is None, go to the next row\n", 796 | " if row[1]['text'] is None:\n", 797 | " continue\n", 798 | "\n", 799 | " # If the number of tokens is greater than the max number of tokens, split the text into chunks\n", 800 | " if row[1]['n_tokens'] > max_tokens:\n", 801 | " shortened += split_into_many(row[1]['text'])\n", 802 | " \n", 803 | " # Otherwise, add the text to the list of shortened texts\n", 804 | " else:\n", 805 | " shortened.append( row[1]['text'] )" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 28, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 814 | "data": { 815 | "text/plain": [ 816 | "" 817 | ] 818 | }, 819 | "execution_count": 28, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | }, 823 | { 824 | "data": { 825 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGgCAYAAACABpytAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsTklEQVR4nO3df3TU1Z3/8Vd+TCYEmMSgmSEVIlYrpIhQUDLVbq2ERIzWHzm7/mA1bTl6SoMrxKWaLkIAKy7tFsWNsN1FsGebZUtPoYqIhKBx1fArypYfNtWWNrQyyVYM4UcZJsn9/uE3nzomagYmM3fC83HO58Dn3jt37n07J7z8zHwyScYYIwAAAIskx3sBAAAAH0dAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWiSigXHTRRUpKSupxlJeXS5JOnTql8vJyDRs2TEOGDFFpaalaWlrC5mhublZJSYkyMjKUk5OjuXPnqqOjI3o7AgAACS81ksG7du1SZ2enc75v3z5NnTpVf/u3fytJmjNnjl544QWtW7dOmZmZmjVrlm677Ta9/vrrkqTOzk6VlJTI5/PpjTfe0OHDh3XPPffI5XLpscce6/M6urq69N5772no0KFKSkqKZAsAACBOjDE6duyYcnNzlZz8GddIzFl44IEHzOc//3nT1dVl2trajMvlMuvWrXP63377bSPJNDQ0GGOM2bRpk0lOTjaBQMAZs2LFCuPxeEwwGOzz8x46dMhI4uDg4ODg4EjA49ChQ5/5b31EV1A+6vTp0/rP//xPVVRUKCkpSY2NjQqFQiosLHTGjB49WiNHjlRDQ4MKCgrU0NCgyy+/XF6v1xlTXFysmTNnav/+/ZowYUKvzxUMBhUMBp1z8/+/gPngwYMaOnTomW4hTCgU0ssvv6yvfe1rcrlcUZkTPVHn2KDOsUGdY4dax0Z/1/nYsWMaNWpUn/7tPuOAsmHDBrW1tekb3/iGJCkQCCgtLU1ZWVlh47xerwKBgDPmo+Gku7+775MsWbJECxcu7NHe0NCgjIyMM91CDxkZGdqxY0fU5kPvqHNsUOfYoM6xQ61joz/rfPLkSUnq08czzjigrFq1StOmTVNubu6ZTtFnlZWVqqiocM7b29s1YsQIFRUVyePxROU5QqGQamtrNXXqVNJ5P6LOsUGdY4M6xw61jo3+rnN7e3ufx55RQPnDH/6grVu36he/+IXT5vP5dPr0abW1tYVdRWlpaZHP53PG7Ny5M2yu7rt8usf0xu12y+1292h3uVxRL2B/zImeqHNsUOfYoM6xQ61jo7/qHMmcZ/R7UFavXq2cnByVlJQ4bRMnTpTL5VJdXZ3T1tTUpObmZvn9fkmS3+/X3r171dra6oypra2Vx+NRfn7+mSwFAAAMQBFfQenq6tLq1atVVlam1NS/PjwzM1MzZsxQRUWFsrOz5fF4dP/998vv96ugoECSVFRUpPz8fN19991aunSpAoGA5s2bp/Ly8l6vkAAAgHNTxAFl69atam5u1re+9a0efcuWLVNycrJKS0sVDAZVXFysp59+2ulPSUnRxo0bNXPmTPn9fg0ePFhlZWVatGjR2e0CAAAMKBEHlKKiIuc2349LT09XdXW1qqurP/HxeXl52rRpU6RPCwAAziF8Fw8AALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYJ0z/jZjAADw2S56+IV4L6HP3ClGS6+Sxla9pKbv3xjXtXAFBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYJ+KA8qc//Ul///d/r2HDhmnQoEG6/PLLtXv3bqffGKP58+dr+PDhGjRokAoLC/XOO++EzXHkyBFNnz5dHo9HWVlZmjFjho4fP372uwEAAANCRAHlgw8+0NVXXy2Xy6UXX3xRBw4c0L/8y7/ovPPOc8YsXbpUy5cv18qVK7Vjxw4NHjxYxcXFOnXqlDNm+vTp2r9/v2pra7Vx40a9+uqruu+++6K3KwAAkNBSIxn8z//8zxoxYoRWr17ttI0aNcr5uzFGTzzxhObNm6ebb75ZkvSTn/xEXq9XGzZs0B133KG3335bmzdv1q5duzRp0iRJ0lNPPaUbbrhBP/zhD5Wbm9vjeYPBoILBoHPe3t4uSQqFQgqFQpFs4RN1zxOt+dA76hwb1Dk2qHPsJHKt3Skm3kvoM3eycf7sj1pHMmeSMabPlcvPz1dxcbH++Mc/qr6+Xp/73Of0ne98R/fee68k6Xe/+50
[figure: PNG image output]
2fL4/Ho/vvvl9/vV0FBgSSpqKhI+fn5uvvuu7V06VIFAgHNmzdP5eXlvb6NAwAAzj0RBZQVK1ZIkq699tqw9tWrV+sb3/iGJGnZsmVKTk5WaWmpgsGgiouL9fTTTztjU1JStHHjRs2cOVN+v1+DBw9WWVmZFi1adHY7AQAAA0ZEAaUvdySnp6erurpa1dXVnzgmLy9PmzZtiuSpAQDAOYTv4gEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHVS470AIJFc9PAL8V5Cn7hTjJZeJY2teklN378x3ssBgIhxBQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdSIOKK+++qpuuukm5ebmKikpSRs2bAjrN8Zo/vz5Gj58uAYNGqTCwkK98847YWOOHDmi6dOny+PxKCsrSzNmzNDx48fPaiMAAGDgiDignDhxQldccYWqq6t77V+6dKmWL1+ulStXaseOHRo8eLCKi4t16tQpZ8z06dO1f/9+1dbWauPGjXr11Vd13333nfkuAADAgJIa6QOmTZumadOm9dpnjNETTzyhefPm6eabb5Yk/eQnP5HX69WGDRt0xx136O2339bmzZu1a9cuTZo0SZL01FNP6YYbbtAPf/hD5ebmnsV2AADAQBBxQPk0Bw8eVCAQUGFhodOWmZmpyZMnq6GhQXfccYcaGhqUlZXlhBNJKiwsVHJysnbs2KFbb721x7zBYFDBYNA5b29vlySFQiGFQqGorL17nmjNh94lep3dKSbeS+gTd7Jx/kzUWieCRH89J5JErnWi/NyQ+v9nRyRzRjWgBAIBSZLX6w1r93q9Tl8gEFBOTk74IlJTlZ2d7Yz5uCVLlmjhwoU92rds2aKMjIxoLN1RW1sb1fnQu0St89Kr4r2CyCye1KVNmzbFexkDXqK+nhNRItY60X5uSP33s+PkyZN9HhvVgNJfKisrVVFR4Zy3t7drxIgRKioqksfjicpzhEIh1dbWaurUqXK5XFGZEz0lep3HVr0U7yX0iTvZaPGkLj2yO1mN86+P93IGrER/PSeSRK51ovzckPr/Z0f3OyB9EdWA4vP5JEktLS0aPny4097S0qLx48c7Y1pbW8Me19HRoSNHjjiP/zi32y23292j3eVyRf2F2h9zoqdErXOwMyneS4hIsCspIeucaBL19ZyIErHWifZzQ+q/nx2RzBnV34MyatQo+Xw+1dXVOW3t7e3asWOH/H6/JMnv96utrU2NjY3OmG3btqmrq0uTJ0+O5nIAAECCivgKyvHjx/Xuu+865wcPHtSePXuUnZ2tkSNHavbs2Xr00Ud16aWXatSoUXrkkUeUm5urW265RZI0ZswYXX/99br33nu1cuVKhUIhzZo1S3fccQd38AAAAElnEFB2796tr33ta85592dDysrKtGbNGn33u9/ViRMndN9996mtrU3XXHONNm/erPT0dOcxP/3pTzVr1ixNmTJFycnJKi0t1fLly6OwHQAAMBBEHFCuvfZaGfPJt0wlJSVp0aJFWrRo0SeOyc7OVk1NTaRPDQAAzhF8Fw8AALAOAQUAAFgnIX4PCgAA0oe/UyQRb9tF5LiCAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADr8F08wAB30cMvxHsJEfv94yXxXgKAOOMKCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA63AXDwCcoxLpDi93itHSq+K9CsQSV1AAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA5fFoi4SaQvKkNsJcpro/sL7MZWvaSm798Y7+UAAwpXUAAAgHW4gtKLRPm/t4/6/eMl8V4CAABRwxUUAABgHQIKAACwDgEFAABYh4ACAACsw4dkASAKEvHD9YDNuIICAACsE9eAUl1drYsuukjp6emaPHmydu7cGc/lAAAAS8QtoPz3f/+3KioqtGDBAr355pu64oorVFxcrNbW1ngtCQAAWCJun0H50Y9+pHvvvVff/OY3JUkrV67UCy+8oGeeeUYPP/xw2NhgMKhgMOicHz16VJJ05MgRhUKhqKwnFArp5MmTev/995XacSIqc8bS+++/H+8l9Emi1zlRpHYZnTzZpdRQsjq7kuK9nAGLOscOtY6Nj9a5P/5dOXbsmCTJGPPZg00cBINBk5KSYtavXx/Wfs8995ivf/3rPcYvWLDASOLg4ODg4OAYAMehQ4c+MyvE5QrKn//8Z3V2dsrr9Ya1e71e/frXv+4xvrKyUhUVFc55V1eXjhw5omHDhikpKTpJur29XSNGjNChQ4fk8XiiMid6os6xQZ1jgzrHDrWOjf6uszFGx44dU25u7meOTYjbjN1ut9xud1hbVlZWvzyXx+PhxR8D1Dk2qHNsUOfYodax0Z91zszM7NO4uHxI9vzzz1dKSopaWlrC2ltaWuTz+eKxJAAAYJG4BJS0tDRNnDhRdXV1TltXV5fq6urk9/vjsSQAAGCRuL3FU1FRobKyMk2aNElXXXWVnnjiCZ04ccK5qyfW3G63FixY0OOtJEQXdY4N6hwb1Dl2qHVs2FTnJGP6cq9P//jXf/1X/eAHP1AgEND48eO1fPlyTZ48OV7LAQAAlohrQAEAAOgN38UDAACsQ0ABAADWIaAAAADrEFAAAIB1CCiSqqurddFFFyk9PV2TJ0/Wzp07472khPLqq6/qpptuUm5urpKSkrRhw4awfmOM5s+fr+HDh2vQoEEqLCzUO++8EzbmyJEjmj59ujwej7KysjRjxgwdP348hruw35IlS3TllVdq6NChysnJ0S233KKmpqawMadOnVJ5ebmGDRumIUOGqLS0tMcvRGxublZJSYkyMjKUk5OjuXPnqqOjI5ZbsdqKFSs0btw45zdp+v1+vfjii04/Ne4fjz/+uJKSkjR79mynjVpHR1VVlZKSksKO0aNHO/3W1vnsv/ovsa1du9akpaWZZ555xuzfv9/ce++9Jisry7S0tMR7aQlj06ZN5p/+6Z/ML37xCyOpx5dAPv744yYzM9Ns2LDB/O///q/5+te/bkaNGmX+8pe/OGOuv/56c8UVV5jt27eb//mf/zGXXHKJufPOO2O8E7sVFxeb1atXm3379pk9e/aYG264wYwcOdIcP37cGfPtb3/bjBgxwtTV1Zndu3ebgoIC8+Uvf9np7+joMGPHjjWFhYXmrbfeMps2bTLnn3++qaysjMeWrPTcc8+ZF154
wfzmN78xTU1N5nvf+55xuVxm3759xhhq3B927txpLrroIjNu3DjzwAMPOO3UOjoWLFhgvvjFL5rDhw87x//93/85/bbW+ZwPKFdddZUpLy93zjs7O01ubq5ZsmRJHFeVuD4eULq6uozP5zM/+MEPnLa2tjbjdrvNf/3XfxljjDlw4ICRZHbt2uWMefHFF01SUpL505/+FLO1J5rW1lYjydTX1xtjPqyry+Uy69atc8a8/fbbRpJpaGgwxnwYJpOTk00gEHDGrFixwng8HhMMBmO7gQRy3nnnmf/4j/+gxv3g2LFj5tJLLzW1tbXmq1/9qhNQqHX0LFiwwFxxxRW99tlc53P6LZ7Tp0+rsbFRhYWFTltycrIKCwvV0NAQx5UNHAcPHlQgEAircWZmpiZPnuzUuKGhQVlZWZo0aZIzprCwUMnJydqxY0fM15wojh49KknKzs6WJDU2NioUCoXVevTo0Ro5cmRYrS+//PKwbxIvLi5We3u79u/fH8PVJ4bOzk6tXbtWJ06ckN/vp8b9oLy8XCUlJWE1lXg9R9s777yj3NxcXXzxxZo+fbqam5sl2V3nhPg24/7y5z//WZ2dnWFFlySv16tf//rXcVrVwBIIBCSp1xp39wUCAeXk5IT1p6amKjs72xmDcF1dXZo9e7auvvpqjR07VtKHdUxLS+vxTd8fr3Vv/y26+/ChvXv3yu/369SpUxoyZIjWr1+v/Px87dmzhxpH0dq1a/Xmm29q165dPfp4PUfP5MmTtWbNGl122WU6fPiwFi5cqK985Svat2+f1XU+pwMKkKjKy8u1b98+vfbaa/FeyoB02WWXac+ePTp69Kh+/vOfq6ysTPX19fFe1oBy6NAhPfDAA6qtrVV6enq8lzOgTZs2zfn7uHHjNHnyZOXl5elnP/uZBg0aFMeVfbpz+i2e888/XykpKT0+rdzS0iKfzxenVQ0s3XX8tBr7fD61traG9Xd0dOjIkSP8d+jFrFmztHHjRr388su68MILnXafz6fTp0+rra0tbPzHa93bf4vuPnwoLS1Nl1xyiSZOnKglS5boiiuu0JNPPkmNo6ixsVGtra360pe+pNTUVKWmpqq+vl7Lly9XamqqvF4vte4nWVlZ+sIXvqB3333X6tf0OR1Q0tLSNHHiRNXV1TltXV1dqqurk9/vj+PKBo5Ro0bJ5/OF1bi9vV07duxwauz3+9XW1qbGxkZnzLZt29TV1cWXR36EMUazZs3S+vXrtW3bNo0aNSqsf+LEiXK5XGG1bmpqUnNzc1it9+7dGxYIa2tr5fF4lJ+fH5uNJKCuri4Fg0FqHEVTpkzR3r17tWfPHueYNGmSpk+f7vydWveP48eP67e//a2GDx9u92u63z5+myDWrl1r3G63WbNmjTlw4IC57777TFZWVtinlfHpjh07Zt566y3z1ltvGUnmRz/6kXnrrbfMH/7wB2PMh7cZZ2VlmV/+8pfmV7/6lbn55pt7vc14woQJZseOHea1114zl156KbcZf8zMmTNNZmameeWVV8JuFzx58qQz5tvf/rYZOXKk2bZtm9m9e7fx+/3G7/c7/d23CxYVFZk9e/aYzZs3mwsuuIDbMj/i4YcfNvX19ebgwYPmV7/6lXn44YdNUlKS2bJlizGGGvenj97FYwy1jpYHH3zQvPLKK+bgwYPm9ddfN4WFheb88883ra2txhh763zOBxRjjHnqqafMyJEjTVpamrnqqqvM9u3b472khPLyyy8bST2OsrIyY8yHtxo/8sgjxuv1GrfbbaZMmWKamprC5nj//ffNnXfeaYYMGWI8Ho/55je/aY4dOxaH3dirtxpLMqtXr3bG/OUvfzHf+c53zHnnnWcyMjLMrbfeag4fPhw2z+9//3szbdo0M2jQIHP++eebBx980IRCoRjvxl7f+ta3TF5enklLSzMXXHCBmTJlihNOjKHG/enjAYVaR8ftt99uhg8fbtLS0sznPvc5c/vtt5t3333X6be1zknGGNN/12cAAAAid05/BgUAANiJgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1vl/gvKK3Dyq3sYAAAAASUVORK5CYII=", 826 | "text/plain": [ 827 | "
" 828 | ] 829 | }, 830 | "metadata": {}, 831 | "output_type": "display_data" 832 | } 833 | ], 834 | "source": [ 835 | "df = pd.DataFrame(shortened, columns = ['text'])\n", 836 | "df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n", 837 | "df.n_tokens.hist()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 29, 843 | "metadata": {}, 844 | "outputs": [ 845 | { 846 | "data": { 847 | "text/html": [ 848 | "
[text/html DataFrame preview omitted: HTML tags were stripped during extraction; the same five rows appear in the text/plain output below]
" 905 | ], 906 | "text/plain": [ 907 | " text n_tokens \\\n", 908 | "0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n", 909 | "1 blog authors tom. Tom Brown - OpenAI ... 228 \n", 910 | "2 blog openai scholars 2019 final projects. Op... 492 \n", 911 | "3 In this project, I used curiosity-driven explo... 478 \n", 912 | "4 Results revealed that the optimal RL policies ... 499 \n", 913 | "\n", 914 | " embeddings \n", 915 | "0 [-0.012958061881363392, -0.006103983614593744,... \n", 916 | "1 [-0.0053874170407652855, -0.009962032549083233... \n", 917 | "2 [0.0019150723237544298, -0.0070442273281514645... \n", 918 | "3 [-0.0067560747265815735, 0.0004431474662851542... \n", 919 | "4 [-0.012868616729974747, 0.0029640409629791975,... " 920 | ] 921 | }, 922 | "execution_count": 29, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "import openai\n", 929 | "\n", 930 | "df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])\n", 931 | "df.to_csv('processed/embeddings.csv')\n", 932 | "df.head()" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 31, 938 | "metadata": {}, 939 | "outputs": [ 940 | { 941 | "data": { 942 | "text/html": [ 943 | "
[text/html DataFrame preview omitted: HTML tags were stripped during extraction; the same five rows appear in the text/plain output below]
" 1000 | ], 1001 | "text/plain": [ 1002 | " text n_tokens \\\n", 1003 | "0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n", 1004 | "1 blog authors tom. Tom Brown - OpenAI ... 228 \n", 1005 | "2 blog openai scholars 2019 final projects. Op... 492 \n", 1006 | "3 In this project, I used curiosity-driven explo... 478 \n", 1007 | "4 Results revealed that the optimal RL policies ... 499 \n", 1008 | "\n", 1009 | " embeddings \n", 1010 | "0 [-0.012958061881363392, -0.006103983614593744,... \n", 1011 | "1 [-0.0053874170407652855, -0.009962032549083233... \n", 1012 | "2 [0.0019150723237544298, -0.0070442273281514645... \n", 1013 | "3 [-0.0067560747265815735, 0.0004431474662851542... \n", 1014 | "4 [-0.012868616729974747, 0.0029640409629791975,... " 1015 | ] 1016 | }, 1017 | "execution_count": 31, 1018 | "metadata": {}, 1019 | "output_type": "execute_result" 1020 | } 1021 | ], 1022 | "source": [ 1023 | "import pandas as pd\n", 1024 | "import numpy as np\n", 1025 | "from ast import literal_eval\n", 1026 | "from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n", 1027 | "\n", 1028 | "df=pd.read_csv('processed/embeddings.csv', index_col=0)\n", 1029 | "df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)\n", 1030 | "\n", 1031 | "df.head()" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 32, 1037 | "metadata": {}, 1038 | "outputs": [ 1039 | { 1040 | "data": { 1041 | "text/plain": [ 1042 | "'No, you are not allowed to publish model outputs to Twitter without a human review. You must manually review each generation before sharing or while streaming, and indicate that the content is AI-generated in a way no user could reasonably miss or misunderstand.'" 1043 | ] 1044 | }, 1045 | "execution_count": 32, 1046 | "metadata": {}, 1047 | "output_type": "execute_result" 1048 | } 1049 | ], 1050 | "source": [ 1051 | "def create_context(\n", 1052 | " question, df, max_len=1800, size=\"ada\"\n", 1053 | "):\n", 1054 | " \"\"\"\n", 1055 | " Create a context for a question by finding the most similar context from the dataframe\n", 1056 | " \"\"\"\n", 1057 | "\n", 1058 | " # Get the embeddings for the question\n", 1059 | " q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']\n", 1060 | "\n", 1061 | " # Get the distances from the embeddings\n", 1062 | " df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')\n", 1063 | "\n", 1064 | "\n", 1065 | " returns = []\n", 1066 | " cur_len = 0\n", 1067 | "\n", 1068 | " # Sort by distance and add the text to the context until the context is too long\n", 1069 | " for i, row in df.sort_values('distances', ascending=True).iterrows():\n", 1070 | " \n", 1071 | " # Add the length of the text to the current length\n", 1072 | " cur_len += row['n_tokens'] + 4\n", 1073 | " \n", 1074 | " # If the context is too long, break\n", 1075 | " if cur_len > max_len:\n", 1076 | " break\n", 1077 | " \n", 1078 | " # Else add it to the text that is being returned\n", 1079 | " returns.append(row[\"text\"])\n", 1080 | "\n", 1081 | " # Return the context\n", 1082 | " return \"\\n\\n###\\n\\n\".join(returns)\n", 1083 | "\n", 1084 | "def answer_question(\n", 1085 | " df,\n", 1086 | " model=\"text-davinci-003\",\n", 1087 | " question=\"Am I allowed to publish model outputs to Twitter, without a human review?\",\n", 1088 | " max_len=1800,\n", 1089 | " size=\"ada\",\n", 1090 | " debug=False,\n", 1091 | " 
max_tokens=150,\n", 1092 | " stop_sequence=None\n", 1093 | "):\n", 1094 | " \"\"\"\n", 1095 | " Answer a question based on the most similar context from the dataframe texts\n", 1096 | " \"\"\"\n", 1097 | " context = create_context(\n", 1098 | " question,\n", 1099 | " df,\n", 1100 | " max_len=max_len,\n", 1101 | " size=size,\n", 1102 | " )\n", 1103 | " # If debug, print the raw model response\n", 1104 | " if debug:\n", 1105 | " print(\"Context:\\n\" + context)\n", 1106 | " print(\"\\n\\n\")\n", 1107 | "\n", 1108 | " try:\n", 1109 | " # Create a completions using the question and context\n", 1110 | " response = openai.Completion.create(\n", 1111 | " prompt=f\"Answer the question based on the context below, and if the question can't be answered based on the context, say \\\"I don't know\\\"\\n\\nContext: {context}\\n\\n---\\n\\nQuestion: {question}\\nAnswer:\",\n", 1112 | " temperature=0,\n", 1113 | " max_tokens=max_tokens,\n", 1114 | " top_p=1,\n", 1115 | " frequency_penalty=0,\n", 1116 | " presence_penalty=0,\n", 1117 | " stop=stop_sequence,\n", 1118 | " model=model,\n", 1119 | " )\n", 1120 | " return response[\"choices\"][0][\"text\"].strip()\n", 1121 | " except Exception as e:\n", 1122 | " print(e)\n", 1123 | " return \"\"" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": 33, 1129 | "metadata": {}, 1130 | "outputs": [ 1131 | { 1132 | "data": { 1133 | "text/plain": [ 1134 | "\"I don't know.\"" 1135 | ] 1136 | }, 1137 | "execution_count": 33, 1138 | "metadata": {}, 1139 | "output_type": "execute_result" 1140 | } 1141 | ], 1142 | "source": [ 1143 | "answer_question(df, question=\"What day is it?\", debug=False)" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 34, 1149 | "metadata": {}, 1150 | "outputs": [ 1151 | { 1152 | "data": { 1153 | "text/plain": [ 1154 | "'The newest embeddings model is text-embedding-ada-002.'" 1155 | ] 1156 | }, 1157 | "execution_count": 34, 1158 | "metadata": {}, 1159 | "output_type": "execute_result" 1160 | } 1161 | ], 1162 | "source": [ 1163 | "answer_question(df, question=\"What is our newest embeddings model?\")" 1164 | ] 1165 | } 1166 | ], 1167 | "metadata": { 1168 | "kernelspec": { 1169 | "display_name": "env", 1170 | "language": "python", 1171 | "name": "python3" 1172 | }, 1173 | "language_info": { 1174 | "codemirror_mode": { 1175 | "name": "ipython", 1176 | "version": 3 1177 | }, 1178 | "file_extension": ".py", 1179 | "mimetype": "text/x-python", 1180 | "name": "python", 1181 | "nbconvert_exporter": "python", 1182 | "pygments_lexer": "ipython3", 1183 | "version": "3.9.6" 1184 | }, 1185 | "orig_nbformat": 4, 1186 | "vscode": { 1187 | "interpreter": { 1188 | "hash": "05f34a34d73b71652304030c1097be3a5720ea2447153dd6542d145a26b73181" 1189 | } 1190 | } 1191 | }, 1192 | "nbformat": 4, 1193 | "nbformat_minor": 2 1194 | } 1195 | -------------------------------------------------------------------------------- /web-qa.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### Step 1 3 | ################################################################################ 4 | 5 | import requests 6 | import re 7 | import urllib.request 8 | from bs4 import BeautifulSoup 9 | from collections import deque 10 | from html.parser import HTMLParser 11 | from urllib.parse import urlparse 12 | import os 13 | import pandas as pd 14 | import tiktoken 15 | import openai 16 | import numpy as np 17 | from 
openai.embeddings_utils import distances_from_embeddings, cosine_similarity 18 | from ast import literal_eval 19 | 20 | # Regex pattern to match a URL 21 | HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$' 22 | 23 | # Define OpenAI api_key 24 | # openai.api_key = '' 25 | 26 | # Define root domain to crawl 27 | domain = "openai.com" 28 | full_url = "https://openai.com/" 29 | 30 | # Create a class to parse the HTML and get the hyperlinks 31 | class HyperlinkParser(HTMLParser): 32 | def __init__(self): 33 | super().__init__() 34 | # Create a list to store the hyperlinks 35 | self.hyperlinks = [] 36 | 37 | # Override the HTMLParser's handle_starttag method to get the hyperlinks 38 | def handle_starttag(self, tag, attrs): 39 | attrs = dict(attrs) 40 | 41 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks 42 | if tag == "a" and "href" in attrs: 43 | self.hyperlinks.append(attrs["href"]) 44 | 45 | ################################################################################ 46 | ### Step 2 47 | ################################################################################ 48 | 49 | # Function to get the hyperlinks from a URL 50 | def get_hyperlinks(url): 51 | 52 | # Try to open the URL and read the HTML 53 | try: 54 | # Open the URL and read the HTML 55 | with urllib.request.urlopen(url) as response: 56 | 57 | # If the response is not HTML, return an empty list 58 | if not response.info().get('Content-Type').startswith("text/html"): 59 | return [] 60 | 61 | # Decode the HTML 62 | html = response.read().decode('utf-8') 63 | except Exception as e: 64 | print(e) 65 | return [] 66 | 67 | # Create the HTML Parser and then Parse the HTML to get hyperlinks 68 | parser = HyperlinkParser() 69 | parser.feed(html) 70 | 71 | return parser.hyperlinks 72 | 73 | ################################################################################ 74 | ### Step 3 75 | ################################################################################ 76 | 77 | # Function to get the hyperlinks from a URL that are within the same domain 78 | def get_domain_hyperlinks(local_domain, url): 79 | clean_links = [] 80 | for link in set(get_hyperlinks(url)): 81 | clean_link = None 82 | 83 | # If the link is a URL, check if it is within the same domain 84 | if re.search(HTTP_URL_PATTERN, link): 85 | # Parse the URL and check if the domain is the same 86 | url_obj = urlparse(link) 87 | if url_obj.netloc == local_domain: 88 | clean_link = link 89 | 90 | # If the link is not a URL, check if it is a relative link 91 | else: 92 | if link.startswith("/"): 93 | link = link[1:] 94 | elif ( 95 | link.startswith("#") 96 | or link.startswith("mailto:") 97 | or link.startswith("tel:") 98 | ): 99 | continue 100 | clean_link = "https://" + local_domain + "/" + link 101 | 102 | if clean_link is not None: 103 | if clean_link.endswith("/"): 104 | clean_link = clean_link[:-1] 105 | clean_links.append(clean_link) 106 | 107 | # Return the list of hyperlinks that are within the same domain 108 | return list(set(clean_links)) 109 | 110 | 111 | ################################################################################ 112 | ### Step 4 113 | ################################################################################ 114 | 115 | def crawl(url): 116 | # Parse the URL and get the domain 117 | local_domain = urlparse(url).netloc 118 | 119 | # Create a queue to store the URLs to crawl 120 | queue = deque([url]) 121 | 122 | # Create a set to store the URLs that have already been seen (no 
duplicates) 123 | seen = set([url]) 124 | 125 | # Create a directory to store the text files 126 | if not os.path.exists("text/"): 127 | os.mkdir("text/") 128 | 129 | if not os.path.exists("text/"+local_domain+"/"): 130 | os.mkdir("text/" + local_domain + "/") 131 | 132 | # Create a directory to store the csv files 133 | if not os.path.exists("processed"): 134 | os.mkdir("processed") 135 | 136 | # While the queue is not empty, continue crawling 137 | while queue: 138 | 139 | # Get the next URL from the queue 140 | url = queue.pop() 141 | print(url) # for debugging and to see the progress 142 | 143 | # Try extracting the text from the link, if failed proceed with the next item in the queue 144 | try: 145 | # Save text from the url to a .txt file 146 | with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f: 147 | 148 | # Get the text from the URL using BeautifulSoup 149 | soup = BeautifulSoup(requests.get(url).text, "html.parser") 150 | 151 | # Get the text but remove the tags 152 | text = soup.get_text() 153 | 154 | # If the crawler gets to a page that requires JavaScript, it will stop the crawl 155 | if ("You need to enable JavaScript to run this app." in text): 156 | print("Unable to parse page " + url + " due to JavaScript being required") 157 | 158 | # Otherwise, write the text to the file in the text directory 159 | f.write(text) 160 | except Exception as e: 161 | print("Unable to parse page " + url) 162 | 163 | # Get the hyperlinks from the URL and add them to the queue 164 | for link in get_domain_hyperlinks(local_domain, url): 165 | if link not in seen: 166 | queue.append(link) 167 | seen.add(link) 168 | 169 | crawl(full_url) 170 | 171 | ################################################################################ 172 | ### Step 5 173 | ################################################################################ 174 | 175 | def remove_newlines(serie): 176 | serie = serie.str.replace('\n', ' ') 177 | serie = serie.str.replace('\\n', ' ') 178 | serie = serie.str.replace(' ', ' ') 179 | serie = serie.str.replace(' ', ' ') 180 | return serie 181 | 182 | 183 | ################################################################################ 184 | ### Step 6 185 | ################################################################################ 186 | 187 | # Create a list to store the text files 188 | texts=[] 189 | 190 | # Get all the text files in the text directory 191 | for file in os.listdir("text/" + domain + "/"): 192 | 193 | # Open the file and read the text 194 | with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f: 195 | text = f.read() 196 | 197 | # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces. 198 | texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text)) 199 | 200 | # Create a dataframe from the list of texts 201 | df = pd.DataFrame(texts, columns = ['fname', 'text']) 202 | 203 | # Set the text column to be the raw text with the newlines removed 204 | df['text'] = df.fname + ". 
" + remove_newlines(df.text) 205 | df.to_csv('processed/scraped.csv') 206 | df.head() 207 | 208 | ################################################################################ 209 | ### Step 7 210 | ################################################################################ 211 | 212 | # Load the cl100k_base tokenizer which is designed to work with the ada-002 model 213 | tokenizer = tiktoken.get_encoding("cl100k_base") 214 | 215 | df = pd.read_csv('processed/scraped.csv', index_col=0) 216 | df.columns = ['title', 'text'] 217 | 218 | # Tokenize the text and save the number of tokens to a new column 219 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 220 | 221 | # Visualize the distribution of the number of tokens per row using a histogram 222 | df.n_tokens.hist() 223 | 224 | ################################################################################ 225 | ### Step 8 226 | ################################################################################ 227 | 228 | max_tokens = 500 229 | 230 | # Function to split the text into chunks of a maximum number of tokens 231 | def split_into_many(text, max_tokens = max_tokens): 232 | 233 | # Split the text into sentences 234 | sentences = text.split('. ') 235 | 236 | # Get the number of tokens for each sentence 237 | n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences] 238 | 239 | chunks = [] 240 | tokens_so_far = 0 241 | chunk = [] 242 | 243 | # Loop through the sentences and tokens joined together in a tuple 244 | for sentence, token in zip(sentences, n_tokens): 245 | 246 | # If the number of tokens so far plus the number of tokens in the current sentence is greater 247 | # than the max number of tokens, then add the chunk to the list of chunks and reset 248 | # the chunk and tokens so far 249 | if tokens_so_far + token > max_tokens: 250 | chunks.append(". ".join(chunk) + ".") 251 | chunk = [] 252 | tokens_so_far = 0 253 | 254 | # If the number of tokens in the current sentence is greater than the max number of 255 | # tokens, go to the next sentence 256 | if token > max_tokens: 257 | continue 258 | 259 | # Otherwise, add the sentence to the chunk and add the number of tokens to the total 260 | chunk.append(sentence) 261 | tokens_so_far += token + 1 262 | 263 | # Add the last chunk to the list of chunks 264 | if chunk: 265 | chunks.append(". 
".join(chunk) + ".") 266 | 267 | return chunks 268 | 269 | 270 | shortened = [] 271 | 272 | # Loop through the dataframe 273 | for row in df.iterrows(): 274 | 275 | # If the text is None, go to the next row 276 | if row[1]['text'] is None: 277 | continue 278 | 279 | # If the number of tokens is greater than the max number of tokens, split the text into chunks 280 | if row[1]['n_tokens'] > max_tokens: 281 | shortened += split_into_many(row[1]['text']) 282 | 283 | # Otherwise, add the text to the list of shortened texts 284 | else: 285 | shortened.append( row[1]['text'] ) 286 | 287 | ################################################################################ 288 | ### Step 9 289 | ################################################################################ 290 | 291 | df = pd.DataFrame(shortened, columns = ['text']) 292 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 293 | df.n_tokens.hist() 294 | 295 | ################################################################################ 296 | ### Step 10 297 | ################################################################################ 298 | 299 | # Note that you may run into rate limit issues depending on how many files you try to embed 300 | # Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits 301 | 302 | df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']) 303 | df.to_csv('processed/embeddings.csv') 304 | df.head() 305 | 306 | ################################################################################ 307 | ### Step 11 308 | ################################################################################ 309 | 310 | df=pd.read_csv('processed/embeddings.csv', index_col=0) 311 | df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array) 312 | 313 | df.head() 314 | 315 | ################################################################################ 316 | ### Step 12 317 | ################################################################################ 318 | 319 | def create_context( 320 | question, df, max_len=1800, size="ada" 321 | ): 322 | """ 323 | Create a context for a question by finding the most similar context from the dataframe 324 | """ 325 | 326 | # Get the embeddings for the question 327 | q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding'] 328 | 329 | # Get the distances from the embeddings 330 | df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine') 331 | 332 | 333 | returns = [] 334 | cur_len = 0 335 | 336 | # Sort by distance and add the text to the context until the context is too long 337 | for i, row in df.sort_values('distances', ascending=True).iterrows(): 338 | 339 | # Add the length of the text to the current length 340 | cur_len += row['n_tokens'] + 4 341 | 342 | # If the context is too long, break 343 | if cur_len > max_len: 344 | break 345 | 346 | # Else add it to the text that is being returned 347 | returns.append(row["text"]) 348 | 349 | # Return the context 350 | return "\n\n###\n\n".join(returns) 351 | 352 | def answer_question( 353 | df, 354 | model="text-davinci-003", 355 | question="Am I allowed to publish model outputs to Twitter, without a human review?", 356 | max_len=1800, 357 | size="ada", 358 | debug=False, 359 | max_tokens=150, 360 | stop_sequence=None 361 | ): 362 | """ 363 | Answer a 
question based on the most similar context from the dataframe texts 364 | """ 365 | context = create_context( 366 | question, 367 | df, 368 | max_len=max_len, 369 | size=size, 370 | ) 371 | # If debug, print the raw model response 372 | if debug: 373 | print("Context:\n" + context) 374 | print("\n\n") 375 | 376 | try: 377 | # Create a completions using the questin and context 378 | response = openai.Completion.create( 379 | prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:", 380 | temperature=0, 381 | max_tokens=max_tokens, 382 | top_p=1, 383 | frequency_penalty=0, 384 | presence_penalty=0, 385 | stop=stop_sequence, 386 | model=model, 387 | ) 388 | return response["choices"][0]["text"].strip() 389 | except Exception as e: 390 | print(e) 391 | return "" 392 | 393 | ################################################################################ 394 | ### Step 13 395 | ################################################################################ 396 | 397 | print(answer_question(df, question="What day is it?", debug=False)) 398 | 399 | print(answer_question(df, question="What is our newest embeddings model?")) 400 | --------------------------------------------------------------------------------
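create_context above relies on distances_from_embeddings from openai.embeddings_utils (called with distance_metric='cosine') to rank every stored chunk by cosine distance to the question embedding, then walks that ranking and appends chunks until the max_len token budget is reached. The snippet below is a minimal sketch of that cosine-distance ranking using only NumPy; question_embedding and chunk_embeddings are hypothetical stand-ins for the vectors returned by openai.Embedding.create, and the helper illustrates the metric rather than replacing the library call.

import numpy as np

def cosine_distances(query, matrix):
    """Cosine distance (1 - cosine similarity) between one query vector
    and each row of a matrix of stored embeddings."""
    query = np.asarray(query, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    # Normalise both sides, then a dot product per row gives the cosine similarity.
    query = query / np.linalg.norm(query)
    rows = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    return 1.0 - rows @ query

# Hypothetical stand-ins for the question embedding and three stored chunk embeddings.
question_embedding = [0.1, 0.3, 0.5]
chunk_embeddings = [
    [0.1, 0.3, 0.5],     # same direction as the question -> distance near 0
    [0.5, 0.1, 0.0],     # partially related
    [-0.1, -0.3, -0.5],  # opposite direction -> distance near 2
]

distances = cosine_distances(question_embedding, chunk_embeddings)
ranking = np.argsort(distances)  # closest chunks first
print(distances, ranking)

Sorting these distances in ascending order reproduces the df.sort_values('distances', ascending=True) ordering that create_context uses before accumulating chunks up to the token limit.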