├── README.md ├── requirements.txt ├── web-qa.ipynb └── web-qa.py /README.md: -------------------------------------------------------------------------------- 1 | # Web Q&A with Embeddings 2 | 3 | Learn how to crawl your website and build a Q/A bot with the OpenAI API. You can find the full tutorial in the [OpenAI documentation](https://platform.openai.com/docs/tutorials/web-qa-embeddings). 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | appnope==0.1.3 4 | asttokens==2.2.1 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | backcall==0.2.0 8 | beautifulsoup4==4.11.1 9 | blobfile==2.0.1 10 | bs4==0.0.1 11 | certifi==2023.7.22 12 | charset-normalizer==2.1.1 13 | comm==0.1.2 14 | contourpy==1.0.7 15 | cycler==0.11.0 16 | debugpy==1.6.5 17 | decorator==5.1.1 18 | docopt==0.6.2 19 | entrypoints==0.4 20 | executing==1.2.0 21 | filelock==3.9.0 22 | fonttools==4.38.0 23 | frozenlist==1.3.3 24 | huggingface-hub>=0.0.12 25 | idna==3.4 26 | ipykernel==6.20.1 27 | ipython==8.10.0 28 | jedi==0.18.2 29 | joblib==1.2.0 30 | jupyter_client==7.4.8 31 | jupyter_core==5.1.3 32 | kiwisolver==1.4.4 33 | lxml==4.9.2 34 | matplotlib==3.6.3 35 | matplotlib-inline==0.1.6 36 | multidict==6.0.4 37 | nest-asyncio==1.5.6 38 | numpy==1.24.1 39 | openai==0.26.1 40 | packaging==23.0 41 | pandas==1.5.2 42 | parso==0.8.3 43 | pexpect==4.8.0 44 | pickleshare==0.7.5 45 | Pillow==9.4.0 46 | pipreqs==0.4.12 47 | platformdirs==2.6.2 48 | plotly==5.12.0 49 | prompt-toolkit==3.0.36 50 | psutil==5.9.4 51 | ptyprocess==0.7.0 52 | pure-eval==0.2.2 53 | pycryptodomex==3.17 54 | Pygments==2.15.0 55 | pyparsing==3.0.9 56 | python-dateutil==2.8.2 57 | pytz==2022.7.1 58 | PyYAML==6.0 59 | pyzmq==24.0.1 60 | regex==2022.10.31 61 | requests==2.31.0 62 | scikit-learn==1.2.0 63 | scipy==1.10.0 64 | six==1.16.0 65 | soupsieve==2.3.2.post1 66 | stack-data==0.6.2 67 | tenacity==8.1.0 68 | threadpoolctl==3.1.0 69 | tiktoken==0.1.2 70 | tokenizers==0.13.2 71 | tornado==6.3.3 72 | tqdm==4.64.1 73 | traitlets==5.8.1 74 | transformers==4.30.0 75 | typing_extensions==4.4.0 76 | urllib3==1.26.13 77 | wcwidth==0.2.5 78 | yarg==0.1.9 79 | yarl==1.8.2 80 | -------------------------------------------------------------------------------- /web-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "https://openai.com/\n", 13 | "https://openai.com/blog/tags/announcements\n", 14 | "https://openai.com/blog/introducing-openai\n", 15 | "https://openai.com/blog/authors/ilya\n", 16 | "https://openai.com/blog/requests-for-research-2\n", 17 | "https://openai.com/blog/authors/diederik\n", 18 | "https://openai.com/blog/block-sparse-gpu-kernels\n", 19 | "https://openai.com/blog/authors/alec\n", 20 | "https://openai.com/blog/fine-tuning-gpt-2\n", 21 | "https://openai.com/blog/authors/paul\n", 22 | "https://openai.com/blog/concrete-ai-safety-problems\n", 23 | "https://openai.com/blog/learning-to-summarize-with-human-feedback\n", 24 | "https://openai.com/blog/authors/long\n", 25 | "https://openai.com/blog/authors/lowe\n", 26 | "https://openai.com/blog/learning-to-cooperate-compete-and-communicate\n", 27 | "https://openai.com/blog/authors/jean\n", 28 | 
"https://openai.com/blog/authors/igor\n", 29 | "https://openai.com/blog/neural-mmo\n", 30 | "https://openai.com/blog/authors/phillip\n", 31 | "https://openai.com/blog/evolved-policy-gradients\n", 32 | "https://openai.com/blog/authors/richard\n", 33 | "https://openai.com/blog/better-exploration-with-parameter-noise\n", 34 | "https://openai.com/blog/authors/xi\n", 35 | "https://openai.com/blog/authors/matthias\n", 36 | "https://openai.com/blog/solving-rubiks-cube\n", 37 | "https://openai.com/blog/authors/ilge\n", 38 | "https://openai.com/blog/vpt\n", 39 | "https://openai.com/blog/authors/brandon\n", 40 | "https://openai.com/blog/authors/raul\n", 41 | "https://openai.com/blog/authors/bowen\n", 42 | "https://openai.com/blog/authors/jie\n", 43 | "https://openai.com/blog/tags/five\n", 44 | "https://openai.com/blog/openai-five-benchmark-results\n", 45 | "https://openai.com/blog/openai-five/#rapid\n", 46 | "https://openai.com/blog/authors/henrique\n", 47 | "https://openai.com/blog/authors/susan\n", 48 | "https://openai.com/blog/authors/brooke\n", 49 | "https://openai.com/blog/authors/michael-petrov\n", 50 | "https://openai.com/blog/multimodal-neurons\n", 51 | "https://openai.com/blog/authors/shan\n", 52 | "https://openai.com/blog/authors/daniela\n", 53 | "https://openai.com/blog/authors/nick\n", 54 | "https://openai.com/blog/authors/chris\n", 55 | "https://openai.com/blog/introducing-activation-atlases\n", 56 | "https://openai.com/blog/authors/ludwig-schubert\n", 57 | "https://openai.com/blog/authors/justin\n", 58 | "https://openai.com/blog/authors/gabriel\n", 59 | "https://openai.com/blog/microscope\n", 60 | "https://openai.com/blog/authors/przemyslaw\n", 61 | "https://openai.com/blog/authors/david\n", 62 | "https://openai.com/blog/authors/jakub-pachocki\n", 63 | "https://openai.com/blog/authors/christy\n", 64 | "https://openai.com/blog/improving-language-model-behavior\n", 65 | "https://openai.com/blog/authors/irene\n", 66 | "https://openai.com/blog/gpt-2-1-5b-release\n", 67 | "https://openai.com/blog/authors/jack-clark\n", 68 | "https://openai.com/blog/cooperation-on-safety\n", 69 | "https://openai.com/blog/authors/amanda\n", 70 | "https://openai.com/blog/ai-safety-needs-social-scientists\n", 71 | "https://openai.com/blog/adversarial-example-research\n", 72 | "https://openai.com/blog/authors/sandy\n", 73 | "https://openai.com/blog/authors/ian\n", 74 | "https://openai.com/blog/machine-learning-unconference\n", 75 | "https://openai.com/events/code-of-conduct.txt\n", 76 | "https://openai.com/blog/authors/rocky\n", 77 | "https://openai.com/blog/authors/nicolas\n", 78 | "https://openai.com/blog/preparing-for-malicious-uses-of-ai\n", 79 | "https://openai.com/blog/authors/michael\n", 80 | "https://openai.com/blog/spam-detection-in-the-physical-world\n", 81 | "https://openai.com/blog/authors/rachel\n", 82 | "https://openai.com/blog/authors/alex-ray\n", 83 | "https://openai.com/blog/generalizing-from-simulation\n", 84 | "https://openai.com/blog/authors/lerrel\n", 85 | "https://openai.com/blog/authors/xue\n", 86 | "https://openai.com/blog/faster-robot-simulation-in-python\n", 87 | "https://openai.com/blog/safety-gym\n", 88 | "https://openai.com/blog/authors/joshua\n", 89 | "https://openai.com/blog/spinning-up-in-deep-rl\n", 90 | "https://openai.com/blog/spinning-up-in-deep-rl-workshop-review\n", 91 | "https://openai.com/blog/hackathon-follow-up\n", 92 | "https://openai.com/blog/authors/parnian\n", 93 | "https://openai.com/blog/openai-hackathon\n", 94 | "https://openai.com/events/hackathon.txt\n", 95 | 
"https://openai.com/blog/authors/josh-tobin\n", 96 | "https://openai.com/blog/report-from-the-self-organizing-conference\n", 97 | "https://openai.com/blog/faulty-reward-functions\n", 98 | "https://openai.com/blog/authors/miles\n", 99 | "https://openai.com/blog/language-model-safety-and-misuse\n", 100 | "https://openai.com/blog/authors/tyna\n", 101 | "https://openai.com/blog/webgpt\n", 102 | "https://openai.com/blog/authors/jacob-hilton\n", 103 | "https://openai.com/blog/measuring-goodharts-law\n", 104 | "https://openai.com/careers/research-engineer\n", 105 | "https://openai.com/blog/authors/leo\n", 106 | "https://openai.com/blog/learning-to-summarize-with-human-feedback/#optimizingtherewardmodel\n", 107 | "https://openai.com/blog/procgen-benchmark\n", 108 | "https://openai.com/blog/first-retro-contest-retrospective\n", 109 | "https://openai.com/blog/authors/oleg\n", 110 | "https://openai.com/blog/roboschool\n", 111 | "https://openai.com/blog/gym-retro\n", 112 | "https://openai.com/blog/authors/vicki\n", 113 | "https://openai.com/blog/retro-contest\n", 114 | "https://openai.com/blog/authors/alex\n", 115 | "https://openai.com/blog/reptile\n", 116 | "https://openai.com/blog/dall-e-2-pre-training-mitigations\n", 117 | "https://openai.com/blog/authors/larissa\n", 118 | "https://openai.com/blog/openai-scholars-2018-final-projects\n", 119 | "https://openai.com/blog/authors/karl\n", 120 | "https://openai.com/blog/grade-school-math\n", 121 | "https://openai.com/blog/authors/vineet\n", 122 | "https://openai.com/blog/authors/christopher\n", 123 | "https://openai.com/blog/quantifying-generalization-in-reinforcement-learning\n", 124 | "https://openai.com/blog/authors/reiichiro\n", 125 | "https://openai.com/blog/authors/suchir\n", 126 | "https://openai.com/blog/authors/katie\n", 127 | "https://openai.com/blog/authors/sandhini\n", 128 | "https://openai.com/blog/authors/pamela\n", 129 | "https://openai.com/blog/authors/steven\n", 130 | "https://openai.com/blog/authors/gretchen\n", 131 | "https://openai.com/blog/authors/jan\n", 132 | "https://openai.com/blog/critiques\n", 133 | "https://openai.com/blog/authors/william-saunders\n", 134 | "https://openai.com/blog/authors/catherine\n", 135 | "https://openai.com/blog/our-approach-to-alignment-research\n", 136 | "https://openai.com/blog/best-practices-for-deploying-language-models\n", 137 | "https://openai.com/blog/instruction-following/#limitations\n", 138 | "https://openai.com/blog/economic-impacts\n", 139 | "https://openai.com/blog/authors/sam-manning\n", 140 | "https://openai.com/scholars\n", 141 | "https://openai.com/blog/openai-scholars-2021-final-projects\n", 142 | "https://openai.com/blog/openai-scholars-2020-final-projects\n", 143 | "https://openai.com/resources\n", 144 | "https://openai.com/blog/openai-scholars-spring-2020\n", 145 | "https://openai.com/blog/openai-scholars-class-of-19\n", 146 | "https://openai.com/blog/openai-scholars-2019-final-projects\n", 147 | "https://openai.com/blog/authors/jonathan\n", 148 | "https://openai.com/blog/discovering-types-for-entity-disambiguation\n", 149 | "https://openai.com/blog/openai-five-benchmark\n", 150 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#arena\n", 151 | "https://openai.com/blog/openai-five/#ourapproach\n", 152 | "https://openai.com/blog/more-on-dota-2/#botexploits\n", 153 | "https://openai.com/blog/openai-five-benchmark-results/#training\n", 154 | "https://openai.com/blog/openai-five-finals\n", 155 | "https://openai.com/five/#overview\n", 156 | 
"https://openai.com/blog/dota-2\n", 157 | "https://openai.com/the-international\n", 158 | "https://openai.com/blog/more-on-dota-2\n", 159 | "https://openai.com/blog/the-international-2018-results\n", 160 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#cooperativemode\n", 161 | "https://openai.com/blog/openai-five-defeats-dota-2-world-champions\n", 162 | "https://openai.com/blog/authors/jeff\n", 163 | "https://openai.com/blog/authors/adrien\n", 164 | "https://openai.com/blog/authors/joost\n", 165 | "https://openai.com/blog/authors/peter-zhokhov\n", 166 | "https://openai.com/blog/authors/glenn\n", 167 | "https://openai.com/blog/authors/peter\n", 168 | "https://openai.com/blog/authors/raphael\n", 169 | "https://openai.com/blog/authors/lilian\n", 170 | "https://openai.com/blog/techniques-for-training-large-neural-networks\n", 171 | "https://openai.com/blog/authors/alex-paino\n", 172 | "https://openai.com/blog/authors/nikolas\n", 173 | "https://openai.com/blog/openai-five\n", 174 | "https://openai.com/blog/authors/bob\n", 175 | "https://openai.com/blog/authors/qiming\n", 176 | "https://openai.com/blog/authors/wojciech\n", 177 | "https://openai.com/blog/authors/arthur\n", 178 | "https://openai.com/blog/authors/mateusz\n", 179 | "https://openai.com/blog/authors/maciek\n", 180 | "https://openai.com/blog/authors/jerry\n", 181 | "https://openai.com/blog/authors/lei\n", 182 | "https://openai.com/blog/how-to-train-your-openai-five\n", 183 | "https://openai.com/blog/authors/jonas-schneider\n", 184 | "https://openai.com/jobs/#robotics\n", 185 | "https://openai.com/interview-guide\n", 186 | "https://openai.com/blog/learning-dexterity\n", 187 | "https://openai.com/blog/authors/rafal\n", 188 | "https://openai.com/blog/ingredients-for-robotics-research\n", 189 | "https://openai.com/blog/authors/vikash\n", 190 | "https://openai.com/blog/authors/marcin\n", 191 | "https://openai.com/blog/authors/prafulla\n", 192 | "https://openai.com/blog/authors/szymon-sidor\n", 193 | "https://openai.com/blog/openai-baselines-dqn\n", 194 | "https://openai.com/blog/authors/tamim\n", 195 | "https://openai.com/blog/learning-montezumas-revenge-from-a-single-demonstration\n", 196 | "https://openai.com/blog/authors/bradly\n", 197 | "https://openai.com/blog/authors/rein\n", 198 | "https://openai.com/blog/authors/jonathan-ho\n", 199 | "https://openai.com/blog/learning-a-hierarchy\n", 200 | "https://openai.com/blog/authors/peter-chen\n", 201 | "https://openai.com/blog/authors/kevin\n", 202 | "https://openai.com/blog/authors/filip\n", 203 | "https://openai.com/five\n", 204 | "https://openai.com/blog/authors/yilun\n", 205 | "https://openai.com/blog/authors/joseph\n", 206 | "https://openai.com/blog/interpretable-machine-learning-through-teaching\n", 207 | "https://openai.com/blog/authors/smitha\n", 208 | "https://openai.com/blog/learning-to-model-other-minds\n", 209 | "https://openai.com/blog/authors/shimon\n", 210 | "https://openai.com/blog/authors/maruan\n", 211 | "https://openai.com/blog/authors/jakob-foerster\n", 212 | "https://openai.com/blog/nonlinear-computation-in-linear-networks\n", 213 | "https://openai.com/blog/energy-based-models\n", 214 | "https://openai.com/blog/emergent-tool-use\n", 215 | "https://openai.com/blog/authors/ingmar\n", 216 | "https://openai.com/blog/authors/todor\n", 217 | "https://openai.com/blog/learning-concepts-with-energy-functions\n", 218 | "https://openai.com/blog/authors/yi\n", 219 | "https://openai.com/blog/authors/pieter\n", 220 | "https://openai.com/blog/authors/aviv\n", 221 | 
"https://openai.com/blog/instruction-following\n", 222 | "https://openai.com/blog/learning-to-communicate\n", 223 | "https://openai.com/blog/authors/jon\n", 224 | "https://openai.com/blog/summarizing-books\n", 225 | "https://openai.com/blog/authors/chelsea\n", 226 | "https://openai.com/blog/gathering_human_feedback\n", 227 | "https://openai.com/blog/authors/dario-amodei\n", 228 | "https://openai.com/blog/science-of-ai\n", 229 | "https://openai.com/blog/authors/jared\n", 230 | "https://openai.com/blog/authors/sam\n", 231 | "https://openai.com/blog/gpt-2-6-month-follow-up\n", 232 | "https://openai.com/blog/better-language-models/#update\n", 233 | "https://openai.com/blog/authors/david-luan\n", 234 | "https://openai.com/blog/authors/danny\n", 235 | "https://openai.com/blog/ai-and-efficiency\n", 236 | "https://openai.com/blog/authors/david-lansky\n", 237 | "https://openai.com/blog/authors/tom\n", 238 | "https://openai.com/blog/testing-robustness\n", 239 | "https://openai.com/blog/authors/jacob\n", 240 | "https://openai.com/blog/authors/yi-sun\n", 241 | "https://openai.com/blog/authors/daniel\n", 242 | "https://openai.com/blog/authors/dan\n", 243 | "https://openai.com/blog/deep-reinforcement-learning-from-human-preferences\n", 244 | "https://openai.com/blog/authors/geoffrey\n", 245 | "https://openai.com/blog/debate\n", 246 | "https://openai.com/blog/authors/jeffrey\n", 247 | "https://openai.com/blog/authors/nisan\n", 248 | "https://openai.com/blog/amplifying-ai-training\n", 249 | "https://openai.com/blog/authors/daniel-ziegler\n", 250 | "https://openai.com/blog/baselines-acktr-a2c\n", 251 | "https://openai.com/blog/authors/yuhuai\n", 252 | "https://openai.com/blog/authors/shun\n", 253 | "https://openai.com/blog/authors/elman\n", 254 | "https://openai.com/blog/openai-baselines-ppo\n", 255 | "https://openai.com/blog/language-unsupervised\n", 256 | "https://openai.com/blog/tags/baselines\n", 257 | "https://openai.com/blog/authors/scott\n", 258 | "https://openai.com/blog/sparse-transformer\n", 259 | "https://openai.com/blog/authors/rewon\n", 260 | "https://openai.com/blog/glow\n", 261 | "https://openai.com/blog/authors/john\n", 262 | "https://openai.com/blog/openai-gym-beta\n", 263 | "https://openai.com/blog/authors/tim\n", 264 | "https://openai.com/jobs\n", 265 | "https://openai.com/blog/formal-math\n", 266 | "https://openai.com/blog/authors/stanislas\n", 267 | "https://openai.com/blog/authors/jesse\n", 268 | "https://openai.com/blog/generative-models\n", 269 | "https://openai.com/blog/authors/andrej\n", 270 | "https://openai.com/blog/distill\n", 271 | "https://openai.com/blog/authors/vicki-cheung\n", 272 | "https://openai.com/blog/jukebox\n", 273 | "https://openai.com/projects/five\n", 274 | "https://openai.com/blog/authors/christine\n", 275 | "https://openai.com/blog/authors/jong\n", 276 | "https://openai.com/blog/authors/heewoo\n", 277 | "https://openai.com/blog/musenet\n", 278 | "https://openai.com/blog/better-language-models\n", 279 | "https://openai.com/blog/robots-that-learn\n", 280 | "https://openai.com/blog/authors/ankur\n", 281 | "https://openai.com/blog/authors/erika-reinhardt\n", 282 | "https://openai.com/blog/deep-double-descent\n", 283 | "https://openai.com/blog/authors/tristan\n", 284 | "https://openai.com/blog/authors/preetum\n", 285 | "https://openai.com/blog/authors/boaz\n", 286 | "https://openai.com/blog/authors/yamini\n", 287 | "https://openai.com/blog/authors/gal\n", 288 | "https://openai.com/blog/tags/gpt-2\n", 289 | "https://openai.com/blog/clip\n", 290 | 
"https://openai.com/blog/ai-and-compute\n", 291 | "https://openai.com/blog/authors/girish\n", 292 | "https://openai.com/blog/special-projects\n", 293 | "https://openai.com/blog/authors/sam-altman\n", 294 | "https://openai.com/blog/unsupervised-sentiment-neuron\n", 295 | "https://openai.com/blog/dall-e\n", 296 | "https://openai.com/blog/authors/aditya\n", 297 | "https://openai.com/blog/authors/mark\n", 298 | "https://openai.com/blog/authors/mikhail\n", 299 | "https://openai.com/blog/authors/vedant\n", 300 | "https://openai.com/blog/competitive-self-play\n", 301 | "https://openai.com/blog/authors/trapit\n", 302 | "https://openai.com/blog/meta-learning-for-wrestling\n", 303 | "https://openai.com/blog/authors/yura\n", 304 | "https://openai.com/blog/reinforcement-learning-with-prediction-based-rewards\n", 305 | "https://openai.com/blog/authors/harri\n", 306 | "https://openai.com/blog/image-gpt\n", 307 | "https://openai.com/blog/evolution-strategies\n", 308 | "https://openai.com/blog/infrastructure-for-deep-learning\n", 309 | "https://openai.com/blog/generative-models/#gan\n", 310 | "https://openai.com/blog/generative-models#improving-gans\n", 311 | "https://openai.com/blog/tags/multimodal\n", 312 | "https://openai.com/gpt-3\n", 313 | "https://openai.com/javascript:setMathjaxCookie()\n", 314 | "HTTP Error 404: Not Found\n", 315 | "https://openai.com/abs/2005.14165v1\n", 316 | "HTTP Error 404: Not Found\n", 317 | "https://openai.com/list/cs.CL/new\n", 318 | "HTTP Error 404: Not Found\n", 319 | "https://openai.com/abs/2005.14165v3\n", 320 | "HTTP Error 404: Not Found\n", 321 | "https://openai.com/auth/show-endorsers/2005.14165\n", 322 | "HTTP Error 404: Not Found\n", 323 | "https://openai.com/list/cs/recent\n", 324 | "HTTP Error 404: Not Found\n", 325 | "https://openai.com/abs/2005.14165?context=cs\n", 326 | "HTTP Error 404: Not Found\n", 327 | "https://openai.com/{url_path('ignore_me')}\n", 328 | "HTTP Error 404: Not Found\n", 329 | "https://openai.com/abs/2005.14165v2\n", 330 | "HTTP Error 404: Not Found\n", 331 | "https://openai.com/show-email/b5cb66e9/2005.14165\n", 332 | "HTTP Error 404: Not Found\n", 333 | "https://openai.com/prevnext?id=2005.14165&function=next&context=cs.CL\n", 334 | "HTTP Error 404: Not Found\n", 335 | "https://openai.com/format/2005.14165\n", 336 | "HTTP Error 404: Not Found\n", 337 | "https://openai.com/prevnext?id=2005.14165&function=prev&context=cs.CL\n", 338 | "HTTP Error 404: Not Found\n", 339 | "https://openai.com/pdf/2005.14165\n", 340 | "HTTP Error 404: Not Found\n", 341 | "https://openai.com/tb/2005.14165\n", 342 | "HTTP Error 404: Not Found\n", 343 | "https://openai.com/list/cs.CL/2005\n", 344 | "HTTP Error 404: Not Found\n", 345 | "https://openai.com/list/cs.CL/recent\n", 346 | "HTTP Error 404: Not Found\n", 347 | "https://openai.com/blog/dall-e-2\n", 348 | "https://openai.com/blog/authors/openai\n", 349 | "https://openai.com/blog/improving-verifiability\n", 350 | "https://openai.com/blog/dall-e-2-extending-creativity\n", 351 | "https://openai.com/blog/the-international\n", 352 | "https://openai.com/blog/symposium-2019\n", 353 | "https://openai.com/blog/tags/culture\n", 354 | "https://openai.com/blog/learning-day\n", 355 | "https://openai.com/blog/openai-fellows-fall-2018\n", 356 | "https://openai.com/blog/neurips-2020\n", 357 | "https://openai.com/blog/tags/community\n", 358 | "https://openai.com/blog/universe\n", 359 | "https://openai.com/blog/openai-gym-beta/#rl\n", 360 | "https://openai.com/blog/openai-technical-goals/#goal4\n", 361 | 
"https://openai.com/blog/authors/elon\n", 362 | "https://openai.com/blog/scaling-kubernetes-to-7500-nodes\n", 363 | "https://openai.com/blog/scaling-kubernetes-to-2500-nodes\n", 364 | "https://openai.com/blog/authors/christopher-berner\n", 365 | "https://openai.com/blog/authors/bchess\n", 366 | "https://openai.com/blog/authors/eric\n", 367 | "https://openai.com/blog/forecasting-misuse\n", 368 | "https://openai.com/forecasting-misuse-paper\n", 369 | "https://openai.com/prevnext?id=2301.04246&function=prev&context=cs.CY\n", 370 | "HTTP Error 404: Not Found\n", 371 | "https://openai.com/auth/show-endorsers/2301.04246\n", 372 | "HTTP Error 404: Not Found\n", 373 | "https://openai.com/format/2301.04246\n", 374 | "HTTP Error 404: Not Found\n", 375 | "https://openai.com/pdf/2301.04246\n", 376 | "HTTP Error 404: Not Found\n", 377 | "https://openai.com/show-email/64c5c6bd/2301.04246\n", 378 | "HTTP Error 404: Not Found\n", 379 | "https://openai.com/list/cs.CY/recent\n", 380 | "HTTP Error 404: Not Found\n", 381 | "https://openai.com/prevnext?id=2301.04246&function=next&context=cs.CY\n", 382 | "HTTP Error 404: Not Found\n", 383 | "https://openai.com/list/cs.CY/new\n", 384 | "HTTP Error 404: Not Found\n", 385 | "https://openai.com/list/cs.CY/2301\n", 386 | "HTTP Error 404: Not Found\n", 387 | "https://openai.com/abs/2301.04246?context=cs\n", 388 | "HTTP Error 404: Not Found\n", 389 | "https://openai.com/blog/authors/greg\n", 390 | "https://openai.com/blog/dall-e-api-now-available-in-public-beta\n", 391 | "https://openai.com/blog/api-no-waitlist\n", 392 | "https://openai.com/blog/dall-e-introducing-outpainting\n", 393 | "https://openai.com/blog/team-update\n", 394 | "https://openai.com/blog/chatgpt-plus\n", 395 | "https://openai.com/blog/openai-api\n", 396 | "https://openai.com/jobs/#applied-ai\n", 397 | "https://openai.com/blog/authors/mira\n", 398 | "https://openai.com/join\n", 399 | "Unable to parse page https://openai.com/join due to JavaScript being required\n", 400 | "HTTP Error 403: Forbidden\n", 401 | "https://openai.com/blog/tags/residency\n", 402 | "https://openai.com/blog/openai-licenses-gpt-3-technology-to-microsoft\n", 403 | "https://openai.com/blog/microsoft\n", 404 | "https://openai.com/blog/team-update-august\n", 405 | "https://openai.com/blog/new-ai-classifier-for-indicating-ai-written-text\n", 406 | "https://openai.com/blog/authors/lama\n", 407 | "https://openai.com/blog/authors/scott-aaronson\n", 408 | "https://openai.com/blog/authors/jan-hendrik-kirchner\n", 409 | "https://openai.com/blog/tags/api\n", 410 | "https://openai.com/blog/openai-fellows\n", 411 | "https://openai.com/blog/tags/scholars\n", 412 | "https://openai.com/blog/openai-and-microsoft-extend-partnership\n", 413 | "https://openai.com/blog/dall-e-now-available-without-waitlist\n", 414 | "https://openai.com/blog/helen-toner-joins\n", 415 | "https://openai.com/blog/team-update-january\n", 416 | "https://openai.com/blog/team-plus-plus#interns\n", 417 | "https://openai.com/blog/openai-codex\n", 418 | "https://openai.com/blog/openai-scholars-2019\n", 419 | "https://openai.com/blog/authors/ashley\n", 420 | "https://openai.com/blog/openai-scholars\n", 421 | "https://openai.com/blog/dall-e-now-available-in-beta\n", 422 | "https://openai.com/blog/new-and-improved-embedding-model\n", 423 | "https://openai.com/blog/authors/ryan\n", 424 | "https://openai.com/blog/authors/arvind\n", 425 | "https://openai.com/blog/authors/ted\n", 426 | "https://openai.com/blog/dall-e-2-update\n", 427 | "https://openai.com/blog/authors/joanne\n", 428 
| "https://openai.com/blog/tags/fellows\n", 429 | "https://openai.com/blog/openai-summer-fellows-2018\n", 430 | "https://openai.com/blog/authors/maddie\n", 431 | "https://openai.com/blog/codex-apps\n", 432 | "https://openai.com/blog/codex\n", 433 | "HTTP Error 404: Not Found\n", 434 | "https://openai.com/blog/new-and-improved-content-moderation-tooling\n", 435 | "https://openai.com/blog/authors/teddy\n", 436 | "https://openai.com/blog/authors/angela\n", 437 | "https://openai.com/blog/authors/chong\n", 438 | "https://openai.com/blog/welcome-pieter-and-shivon\n", 439 | "https://openai.com/blog/openai-technical-goals\n", 440 | "https://openai.com/blog/procgen-minerl-competitions\n", 441 | "https://openai.com/blog/will-hurd-joins\n", 442 | "https://openai.com/blog/fund\n", 443 | "https://openai.com/news\n", 444 | "HTTP Error 404: Not Found\n", 445 | "https://openai.com/news/introducing-our-first-investments\n", 446 | "HTTP Error 404: Not Found\n", 447 | "https://openai.com/blog/introducing-text-and-code-embeddings\n", 448 | "https://openai.com/blog/authors/boris\n", 449 | "https://openai.com/blog/openai-scholars-2018-meet-our-scholars\n", 450 | "https://openai.com/blog/team-plus-plus\n", 451 | "https://openai.com/blog/gpt-3-apps\n", 452 | "https://openai.com/jobs/#open\n", 453 | "https://openai.com/blog/customized-gpt-3\n", 454 | "https://openai.com/blog/authors/luke\n", 455 | "https://openai.com/blog/authors/rachel-lim\n", 456 | "https://openai.com/blog/authors/michael-wu\n", 457 | "https://openai.com/blog/openai-supporters\n", 458 | "https://openai.com/blog/openai-residency\n", 459 | "https://openai.com/blog/leadership-team-update\n", 460 | "https://openai.com/blog/organizational-update\n", 461 | "https://openai.com/blog/openai-fellows-interns-2019\n", 462 | "https://openai.com/blog/openai-scholars-2020\n", 463 | "https://openai.com/blog/gpt-3-edit-insert\n", 464 | "https://openai.com/blog/authors/mo\n", 465 | "https://openai.com/blog/openai-pytorch\n", 466 | "https://openai.com/blog/openai-scholars-2019-meet-our-scholars\n", 467 | "https://openai.com/blog/openai-charter\n", 468 | "https://openai.com/blog/openai-and-microsoft\n", 469 | "https://openai.com/blog/openai-lp\n", 470 | "https://openai.com/blog/reducing-bias-and-improving-safety-in-dall-e-2\n", 471 | "https://openai.com/terms\n", 472 | "https://openai.com/api/policies/service-terms\n", 473 | "https://openai.com/api/policies/sharing-publication\n", 474 | "https://openai.com/api/policies/terms\n", 475 | "https://openai.com/security/disclosure\n", 476 | "https://openai.com/blog/whisper\n", 477 | "https://openai.com/blog/authors/tao\n", 478 | "https://openai.com/research\n", 479 | "https://openai.com/api/docs\n", 480 | "Unable to parse page https://openai.com/api/docs due to JavaScript being required\n", 481 | "HTTP Error 403: Forbidden\n", 482 | "https://openai.com/dall-e-2\n", 483 | "https://openai.com/privacy\n", 484 | "https://openai.com/api\n", 485 | "https://openai.com/blog\n", 486 | "https://openai.com/blog/triton\n", 487 | "https://openai.com/blog/authors/philippe\n", 488 | "https://openai.com/jobs/#acceleration\n", 489 | "https://openai.com/blog/robust-adversarial-inputs\n", 490 | "https://openai.com/blog/authors/anish-athalye\n", 491 | "https://openai.com/blog/tags/milestones\n", 492 | "https://openai.com/alignment\n", 493 | "https://openai.com\n", 494 | "https://openai.com/publications\n", 495 | "https://openai.com/charter\n", 496 | "https://openai.com/blog/tags/research\n", 497 | "https://openai.com/fund\n", 498 | 
"https://openai.com/about\n", 499 | "https://openai.com/timeline\n", 500 | "https://openai.com/careers\n", 501 | "https://openai.com/api/examples\n", 502 | "Unable to parse page https://openai.com/api/examples due to JavaScript being required\n", 503 | "HTTP Error 403: Forbidden\n", 504 | "https://openai.com/api/login\n", 505 | "Unable to parse page https://openai.com/api/login due to JavaScript being required\n", 506 | "HTTP Error 403: Forbidden\n", 507 | "https://openai.com/newsroom\n", 508 | "https://openai.com/api/policies\n", 509 | "https://openai.com/api/pricing\n", 510 | "https://openai.com/contact-sales\n", 511 | "https://openai.com/api/pricing/#faq-fine-tuning-pricing-calculation\n", 512 | "https://openai.com/blog/tags/events\n", 513 | "https://openai.com/blog/chatgpt\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "import requests\n", 519 | "import re\n", 520 | "import urllib.request\n", 521 | "from bs4 import BeautifulSoup\n", 522 | "from collections import deque\n", 523 | "from html.parser import HTMLParser\n", 524 | "from urllib.parse import urlparse\n", 525 | "import os\n", 526 | "\n", 527 | "# Regex pattern to match a URL\n", 528 | "HTTP_URL_PATTERN = r'^http[s]*://.+'\n", 529 | "\n", 530 | "# Define root domain to crawl\n", 531 | "domain = \"openai.com\"\n", 532 | "full_url = \"https://openai.com/\"\n", 533 | "\n", 534 | "# Create a class to parse the HTML and get the hyperlinks\n", 535 | "class HyperlinkParser(HTMLParser):\n", 536 | " def __init__(self):\n", 537 | " super().__init__()\n", 538 | " # Create a list to store the hyperlinks\n", 539 | " self.hyperlinks = []\n", 540 | "\n", 541 | " # Override the HTMLParser's handle_starttag method to get the hyperlinks\n", 542 | " def handle_starttag(self, tag, attrs):\n", 543 | " attrs = dict(attrs)\n", 544 | "\n", 545 | " # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks\n", 546 | " if tag == \"a\" and \"href\" in attrs:\n", 547 | " self.hyperlinks.append(attrs[\"href\"])\n", 548 | "\n", 549 | "# Function to get the hyperlinks from a URL\n", 550 | "def get_hyperlinks(url):\n", 551 | " \n", 552 | " # Try to open the URL and read the HTML\n", 553 | " try:\n", 554 | " # Open the URL and read the HTML\n", 555 | " with urllib.request.urlopen(url) as response:\n", 556 | "\n", 557 | " # If the response is not HTML, return an empty list\n", 558 | " if not response.info().get('Content-Type').startswith(\"text/html\"):\n", 559 | " return []\n", 560 | " \n", 561 | " # Decode the HTML\n", 562 | " html = response.read().decode('utf-8')\n", 563 | " except Exception as e:\n", 564 | " print(e)\n", 565 | " return []\n", 566 | "\n", 567 | " # Create the HTML Parser and then Parse the HTML to get hyperlinks\n", 568 | " parser = HyperlinkParser()\n", 569 | " parser.feed(html)\n", 570 | "\n", 571 | " return parser.hyperlinks\n", 572 | "\n", 573 | "# Function to get the hyperlinks from a URL that are within the same domain\n", 574 | "def get_domain_hyperlinks(local_domain, url):\n", 575 | " clean_links = []\n", 576 | " for link in set(get_hyperlinks(url)):\n", 577 | " clean_link = None\n", 578 | "\n", 579 | " # If the link is a URL, check if it is within the same domain\n", 580 | " if re.search(HTTP_URL_PATTERN, link):\n", 581 | " # Parse the URL and check if the domain is the same\n", 582 | " url_obj = urlparse(link)\n", 583 | " if url_obj.netloc == local_domain:\n", 584 | " clean_link = link\n", 585 | "\n", 586 | " # If the link is not a URL, check if it is a relative link\n", 587 | " 
else:\n", 588 | " if link.startswith(\"/\"):\n", 589 | " link = link[1:]\n", 590 | " elif link.startswith(\"#\") or link.startswith(\"mailto:\"):\n", 591 | " continue\n", 592 | " clean_link = \"https://\" + local_domain + \"/\" + link\n", 593 | "\n", 594 | " if clean_link is not None:\n", 595 | " if clean_link.endswith(\"/\"):\n", 596 | " clean_link = clean_link[:-1]\n", 597 | " clean_links.append(clean_link)\n", 598 | "\n", 599 | " # Return the list of hyperlinks that are within the same domain\n", 600 | " return list(set(clean_links))\n", 601 | "\n", 602 | "\n", 603 | "def crawl(url):\n", 604 | " # Parse the URL and get the domain\n", 605 | " local_domain = urlparse(url).netloc\n", 606 | "\n", 607 | " # Create a queue to store the URLs to crawl\n", 608 | " queue = deque([url])\n", 609 | "\n", 610 | " # Create a set to store the URLs that have already been seen (no duplicates)\n", 611 | " seen = set([url])\n", 612 | "\n", 613 | " # Create a directory to store the text files\n", 614 | " if not os.path.exists(\"text/\"):\n", 615 | " os.mkdir(\"text/\")\n", 616 | "\n", 617 | " if not os.path.exists(\"text/\"+local_domain+\"/\"):\n", 618 | " os.mkdir(\"text/\" + local_domain + \"/\")\n", 619 | "\n", 620 | " # Create a directory to store the csv files\n", 621 | " if not os.path.exists(\"processed\"):\n", 622 | " os.mkdir(\"processed\")\n", 623 | "\n", 624 | " # While the queue is not empty, continue crawling\n", 625 | " while queue:\n", 626 | "\n", 627 | " # Get the next URL from the queue\n", 628 | " url = queue.pop()\n", 629 | " print(url) # for debugging and to see the progress\n", 630 | "\n", 631 | " # Save text from the url to a .txt file\n", 632 | " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", 633 | "\n", 634 | " # Get the text from the URL using BeautifulSoup\n", 635 | " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n", 636 | "\n", 637 | " # Get the text but remove the tags\n", 638 | " text = soup.get_text()\n", 639 | "\n", 640 | " # If the crawler gets to a page that requires JavaScript, it will stop the crawl\n", 641 | " if (\"You need to enable JavaScript to run this app.\" in text):\n", 642 | " print(\"Unable to parse page \" + url + \" due to JavaScript being required\")\n", 643 | " \n", 644 | " # Otherwise, write the text to the file in the text directory\n", 645 | " f.write(text)\n", 646 | "\n", 647 | " # Get the hyperlinks from the URL and add them to the queue\n", 648 | " for link in get_domain_hyperlinks(local_domain, url):\n", 649 | " if link not in seen:\n", 650 | " queue.append(link)\n", 651 | " seen.add(link)\n", 652 | "\n", 653 | "crawl(full_url)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 1, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "def remove_newlines(serie):\n", 663 | " serie = serie.str.replace('\\n', ' ')\n", 664 | " serie = serie.str.replace('\\\\n', ' ')\n", 665 | " serie = serie.str.replace(' ', ' ')\n", 666 | " serie = serie.str.replace(' ', ' ')\n", 667 | " return serie" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "import pandas as pd\n", 677 | "\n", 678 | "# Create a list to store the text files\n", 679 | "texts=[]\n", 680 | "\n", 681 | "# Get all the text files in the text directory\n", 682 | "for file in os.listdir(\"text/\" + domain + \"/\"):\n", 683 | "\n", 684 | " # Open the file and read the text\n", 685 | " with open(\"text/\" 
+ domain + \"/\" + file, \"r\") as f:\n", 686 | " text = f.read()\n", 687 | "\n", 688 | " # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.\n", 689 | " texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))\n", 690 | "\n", 691 | "# Create a dataframe from the list of texts\n", 692 | "df = pd.DataFrame(texts, columns = ['fname', 'text'])\n", 693 | "\n", 694 | "# Set the text column to be the raw text with the newlines removed\n", 695 | "df['text'] = df.fname + \". \" + remove_newlines(df.text)\n", 696 | "df.to_csv('processed/scraped.csv')\n", 697 | "df.head()" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 26, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "" 709 | ] 710 | }, 711 | "execution_count": 26, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | }, 715 | { 716 | "data": { 717 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAikAAAGdCAYAAADXIOPgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlRElEQVR4nO3df3RU9Z3/8VcSJhMCTELATEhJEIsFIyAKNcy2da2EBJrjas05iy3HTS0Ht2zwVNOlmi4iP9oTDtuv2tqIPbsW3LOlbOkpuiJiYhRYa/iVmkrAZsWlG3dxkhaaBIhMhuTz/cPv3K9jgjB4yXwmeT7OyTncez/zmc/7PdfJy5m5mSRjjBEAAIBlkuO9AAAAgIEQUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAVhoR7wVcjr6+Pp04cUJjxoxRUlJSvJcDAAAugTFGp0+fVm5urpKTL/46SUKGlBMnTigvLy/eywAAAJfhvffe08SJEy86LiFDypgxYyR9WKTP53Nt3nA4rNraWhUXF8vj8bg2byKhB/RAogcSPRju9Uv0QHK/B11dXcrLy3N+j19MQoaUyFs8Pp/P9ZCSnp4un883rE9IekAP6AE9GO71S/RAunI9uNSPavDBWQAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArjYj3Amw0ffXLCvVe2tdI2+IP60vjvQQAAFzFKykAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABY6VOFlPXr1yspKUkPPPCAs+/cuXOqqKjQuHHjNHr0aJWVlamtrS3qdq2trSotLVV6erqys7O1YsUKnT9//tMsBQAADDGXHVIOHjyon/70p5o5c2bU/gcffFAvvPCCtm3bpj179ujEiRO66667nOO9vb0qLS1VT0+P3njjDT377LPavHmzVq1adflVAACAIeeyQsqZM2e0ePFi/dM//ZPGjh3r7O/s7NQzzzyjxx57TLfddptmz56tTZs26Y033tC+ffskSbW1tTp69Kj+9V//VbNmzdLChQu1bt061dTUqKenx52qAABAwhtxOTeqqKhQaWmpioqK9P3vf9/Z39jYqHA4rKKiImfftGnTlJ+fr4aGBs2dO1cNDQ2aMWOG/H6/M6akpETLli3TkSNHdOONN/a7v1AopFAo5Gx3dXVJksLhsMLh8OWUMKDIXN5k49qcg8WtPkTmcbOviYYe0AOJHgz3+iV6ILnfg1jniTmkbN26Vb/97W918ODBfseCwaBSU1OVmZkZtd/v9ysYDDpjPhpQIscjxwZSXV2tNWvW9NtfW1ur9PT0WEu4qHVz+lyf80rbuXOnq/PV1dW5Ol8iogf0QKIHw71+iR5I7vWgu7s7pvExhZT33ntP3/72t1VXV6e0tLSY7ujTqKqqUmVlpbPd1dWlvLw8FRcXy+fzuXY/4XBYdXV1euRQskJ9Sa7NOxiaV5e4Mk+kB/Pnz5fH43FlzkRDD+iBRA+Ge/0SPZDc70HknZBLFVNIaWxsVHt7u2666SZnX29vr/bu3auf/OQnevnll9XT06OOjo6oV1Pa2tqUk5MjScrJydGBAwei5o1c/RMZ83Fer1der7fffo/Hc0VOnFBfkkK9iRVS3O7DleptIqEH9ECiB8O9fokeSO71INY5Yvrg7Lx583T48GE1NTU5P3PmzNHixYudf3s8HtXX1zu3aWlpUWtrqwKBgCQpEAjo8OHDam9vd8bU1dXJ5/OpoKAgpsUDAIChK6ZXUsaMGaPp06dH7Rs1apTGjRvn7F+yZIkqKyuVlZUln8+n+++/X4FAQHPnzpUkFRcXq6CgQPfcc482bNigYDColStXqqKiYsBXSwAAwPB0WVf3fJLHH39cycnJKisrUygUUklJiZ566inneEpKinbs2KFly5YpEAho1KhRKi8v19q1a91eCgAASGCfOqTs3r07ajstLU01NTWqqam54G0mTZrk+tUoAABgaOG7ewAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEq
[figure: histogram of n_tokens per row]", 718 | "text/plain": [ 719 |
" 720 | ] 721 | }, 722 | "metadata": {}, 723 | "output_type": "display_data" 724 | } 725 | ], 726 | "source": [ 727 | "import tiktoken\n", 728 | "\n", 729 | "# Load the cl100k_base tokenizer which is designed to work with the ada-002 model\n", 730 | "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", 731 | "\n", 732 | "df = pd.read_csv('processed/scraped.csv', index_col=0)\n", 733 | "df.columns = ['title', 'text']\n", 734 | "\n", 735 | "# Tokenize the text and save the number of tokens to a new column\n", 736 | "df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n", 737 | "\n", 738 | "# Visualize the distribution of the number of tokens per row using a histogram\n", 739 | "df.n_tokens.hist()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 27, 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [ 748 | "max_tokens = 500\n", 749 | "\n", 750 | "# Function to split the text into chunks of a maximum number of tokens\n", 751 | "def split_into_many(text, max_tokens = max_tokens):\n", 752 | "\n", 753 | " # Split the text into sentences\n", 754 | " sentences = text.split('. ')\n", 755 | "\n", 756 | " # Get the number of tokens for each sentence\n", 757 | " n_tokens = [len(tokenizer.encode(\" \" + sentence)) for sentence in sentences]\n", 758 | " \n", 759 | " chunks = []\n", 760 | " tokens_so_far = 0\n", 761 | " chunk = []\n", 762 | "\n", 763 | " # Loop through the sentences and tokens joined together in a tuple\n", 764 | " for sentence, token in zip(sentences, n_tokens):\n", 765 | "\n", 766 | " # If the number of tokens so far plus the number of tokens in the current sentence is greater \n", 767 | " # than the max number of tokens, then add the chunk to the list of chunks and reset\n", 768 | " # the chunk and tokens so far\n", 769 | " if tokens_so_far + token > max_tokens:\n", 770 | " chunks.append(\". \".join(chunk) + \".\")\n", 771 | " chunk = []\n", 772 | " tokens_so_far = 0\n", 773 | "\n", 774 | " # If the number of tokens in the current sentence is greater than the max number of \n", 775 | " # tokens, go to the next sentence\n", 776 | " if token > max_tokens:\n", 777 | " continue\n", 778 | "\n", 779 | " # Otherwise, add the sentence to the chunk and add the number of tokens to the total\n", 780 | " chunk.append(sentence)\n", 781 | " tokens_so_far += token + 1\n", 782 | "\n", 783 | " # Add the last chunk to the list of chunks\n", 784 | " if chunk:\n", 785 | " chunks.append(\". 
\".join(chunk) + \".\")\n", 786 | "\n", 787 | " return chunks\n", 788 | " \n", 789 | "\n", 790 | "shortened = []\n", 791 | "\n", 792 | "# Loop through the dataframe\n", 793 | "for row in df.iterrows():\n", 794 | "\n", 795 | " # If the text is None, go to the next row\n", 796 | " if row[1]['text'] is None:\n", 797 | " continue\n", 798 | "\n", 799 | " # If the number of tokens is greater than the max number of tokens, split the text into chunks\n", 800 | " if row[1]['n_tokens'] > max_tokens:\n", 801 | " shortened += split_into_many(row[1]['text'])\n", 802 | " \n", 803 | " # Otherwise, add the text to the list of shortened texts\n", 804 | " else:\n", 805 | " shortened.append( row[1]['text'] )" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 28, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 814 | "data": { 815 | "text/plain": [ 816 | "" 817 | ] 818 | }, 819 | "execution_count": 28, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | }, 823 | { 824 | "data": { 825 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGgCAYAAACABpytAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsTklEQVR4nO3df3TU1Z3/8Vd+TCYEmMSgmSEVIlYrpIhQUDLVbq2ERIzWHzm7/mA1bTl6SoMrxKWaLkIAKy7tFsWNsN1FsGebZUtPoYqIhKBx1fArypYfNtWWNrQyyVYM4UcZJsn9/uE3nzomagYmM3fC83HO58Dn3jt37n07J7z8zHwyScYYIwAAAIskx3sBAAAAH0dAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWiSigXHTRRUpKSupxlJeXS5JOnTql8vJyDRs2TEOGDFFpaalaWlrC5mhublZJSYkyMjKUk5OjuXPnqqOjI3o7AgAACS81ksG7du1SZ2enc75v3z5NnTpVf/u3fytJmjNnjl544QWtW7dOmZmZmjVrlm677Ta9/vrrkqTOzk6VlJTI5/PpjTfe0OHDh3XPPffI5XLpscce6/M6urq69N5772no0KFKSkqKZAsAACBOjDE6duyYcnNzlZz8GddIzFl44IEHzOc//3nT1dVl2trajMvlMuvWrXP63377bSPJNDQ0GGOM2bRpk0lOTjaBQMAZs2LFCuPxeEwwGOzz8x46dMhI4uDg4ODg4EjA49ChQ5/5b31EV1A+6vTp0/rP//xPVVRUKCkpSY2NjQqFQiosLHTGjB49WiNHjlRDQ4MKCgrU0NCgyy+/XF6v1xlTXFysmTNnav/+/ZowYUKvzxUMBhUMBp1z8/+/gPngwYMaOnTomW4hTCgU0ssvv6yvfe1rcrlcUZkTPVHn2KDOsUGdY4dax0Z/1/nYsWMaNWpUn/7tPuOAsmHDBrW1tekb3/iGJCkQCCgtLU1ZWVlh47xerwKBgDPmo+Gku7+775MsWbJECxcu7NHe0NCgjIyMM91CDxkZGdqxY0fU5kPvqHNsUOfYoM6xQ61joz/rfPLkSUnq08czzjigrFq1StOmTVNubu6ZTtFnlZWVqqiocM7b29s1YsQIFRUVyePxROU5QqGQamtrNXXqVNJ5P6LOsUGdY4M6xw61jo3+rnN7e3ufx55RQPnDH/6grVu36he/+IXT5vP5dPr0abW1tYVdRWlpaZHP53PG7Ny5M2yu7rt8usf0xu12y+1292h3uVxRL2B/zImeqHNsUOfYoM6xQ61jo7/qHMmcZ/R7UFavXq2cnByVlJQ4bRMnTpTL5VJdXZ3T1tTUpObmZvn9fkmS3+/X3r171dra6oypra2Vx+NRfn7+mSwFAAAMQBFfQenq6tLq1atVVlam1NS/PjwzM1MzZsxQRUWFsrOz5fF4dP/998vv96ugoECSVFRUpPz8fN19991aunSpAoGA5s2bp/Ly8l6vkAAAgHNTxAFl69atam5u1re+9a0efcuWLVNycrJKS0sVDAZVXFysp59+2ulPSUnRxo0bNXPmTPn9fg0ePFhlZWVatGjR2e0CAAAMKBEHlKKiIuc2349LT09XdXW1qqurP/HxeXl52rRpU6RPCwAAziF8Fw8AALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYJ0z/jZjAADw2S56+IV4L6HP3ClGS6+Sxla9pKbv3xjXtXAFBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYJ+KA8qc//Ul///d/r2HDhmnQoEG6/PLLtXv3bqffGKP58+dr+PDhGjRokAoLC/XOO++EzXHkyBFNnz5dHo9HWVlZmjFjho4fP372uwEAAANCRAHlgw8+0NVXXy2Xy6UXX3xRBw4c0L/8y7/ovPPOc8YsXbpUy5cv18qVK7Vjxw4NHjxYxcXFOnXqlDNm+vTp2r9/v2pra7Vx40a9+uqruu+++6K3KwAAkNBSIxn8z//8zxoxYoRWr17ttI0aNcr5uzFGTzzxhObNm6ebb75ZkvSTn/xEXq9XGzZs0B133KG3335bmzdv1q5duzRp0iRJ0lNPPaUbbrhBP/zhD5Wbm9vjeYPBoILBoHPe3t4uSQqFQgqFQpFs4RN1zxOt+dA76hwb1Dk2qHPsJHKt3Skm3kvoM3eycf7sj1pHMmeSMabPlcvPz1dxcbH++Mc/qr6+Xp/73Of0ne98R/fee68k6Xe/+50
[figure: PNG image output]
2fL4/Ho/vvvl9/vV0FBgSSpqKhI+fn5uvvuu7V06VIFAgHNmzdP5eXlvb6NAwAAzj0RBZQVK1ZIkq699tqw9tWrV+sb3/iGJGnZsmVKTk5WaWmpgsGgiouL9fTTTztjU1JStHHjRs2cOVN+v1+DBw9WWVmZFi1adHY7AQAAA0ZEAaUvdySnp6erurpa1dXVnzgmLy9PmzZtiuSpAQDAOYTv4gEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHVS470AIJFc9PAL8V5Cn7hTjJZeJY2teklN378x3ssBgIhxBQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdSIOKK+++qpuuukm5ebmKikpSRs2bAjrN8Zo/vz5Gj58uAYNGqTCwkK98847YWOOHDmi6dOny+PxKCsrSzNmzNDx48fPaiMAAGDgiDignDhxQldccYWqq6t77V+6dKmWL1+ulStXaseOHRo8eLCKi4t16tQpZ8z06dO1f/9+1dbWauPGjXr11Vd13333nfkuAADAgJIa6QOmTZumadOm9dpnjNETTzyhefPm6eabb5Yk/eQnP5HX69WGDRt0xx136O2339bmzZu1a9cuTZo0SZL01FNP6YYbbtAPf/hD5ebmnsV2AADAQBBxQPk0Bw8eVCAQUGFhodOWmZmpyZMnq6GhQXfccYcaGhqUlZXlhBNJKiwsVHJysnbs2KFbb721x7zBYFDBYNA5b29vlySFQiGFQqGorL17nmjNh94lep3dKSbeS+gTd7Jx/kzUWieCRH89J5JErnWi/NyQ+v9nRyRzRjWgBAIBSZLX6w1r93q9Tl8gEFBOTk74IlJTlZ2d7Yz5uCVLlmjhwoU92rds2aKMjIxoLN1RW1sb1fnQu0St89Kr4r2CyCye1KVNmzbFexkDXqK+nhNRItY60X5uSP33s+PkyZN9HhvVgNJfKisrVVFR4Zy3t7drxIgRKioqksfjicpzhEIh1dbWaurUqXK5XFGZEz0lep3HVr0U7yX0iTvZaPGkLj2yO1mN86+P93IGrER/PSeSRK51ovzckPr/Z0f3OyB9EdWA4vP5JEktLS0aPny4097S0qLx48c7Y1pbW8Me19HRoSNHjjiP/zi32y23292j3eVyRf2F2h9zoqdErXOwMyneS4hIsCspIeucaBL19ZyIErHWifZzQ+q/nx2RzBnV34MyatQo+Xw+1dXVOW3t7e3asWOH/H6/JMnv96utrU2NjY3OmG3btqmrq0uTJ0+O5nIAAECCivgKyvHjx/Xuu+865wcPHtSePXuUnZ2tkSNHavbs2Xr00Ud16aWXatSoUXrkkUeUm5urW265RZI0ZswYXX/99br33nu1cuVKhUIhzZo1S3fccQd38AAAAElnEFB2796tr33ta85592dDysrKtGbNGn33u9/ViRMndN9996mtrU3XXHONNm/erPT0dOcxP/3pTzVr1ixNmTJFycnJKi0t1fLly6OwHQAAMBBEHFCuvfZaGfPJt0wlJSVp0aJFWrRo0SeOyc7OVk1NTaRPDQAAzhF8Fw8AALAOAQUAAFgnIX4PCgAA0oe/UyQRb9tF5LiCAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADr8F08wAB30cMvxHsJEfv94yXxXgKAOOMKCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA63AXDwCcoxLpDi93itHSq+K9CsQSV1AAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA5fFoi4SaQvKkNsJcpro/sL7MZWvaSm798Y7+UAAwpXUAAAgHW4gtKLRPm/t4/6/eMl8V4CAABRwxUUAABgHQIKAACwDgEFAABYh4ACAACsw4dkASAKEvHD9YDNuIICAACsE9eAUl1drYsuukjp6emaPHmydu7cGc/lAAAAS8QtoPz3f/+3KioqtGDBAr355pu64oorVFxcrNbW1ngtCQAAWCJun0H50Y9+pHvvvVff/OY3JUkrV67UCy+8oGeeeUYPP/xw2NhgMKhgMOicHz16VJJ05MgRhUKhqKwnFArp5MmTev/995XacSIqc8bS+++/H+8l9Emi1zlRpHYZnTzZpdRQsjq7kuK9nAGLOscOtY6Nj9a5P/5dOXbsmCTJGPPZg00cBINBk5KSYtavXx/Wfs8995ivf/3rPcYvWLDASOLg4ODg4OAYAMehQ4c+MyvE5QrKn//8Z3V2dsrr9Ya1e71e/frXv+4xvrKyUhUVFc55V1eXjhw5omHDhikpKTpJur29XSNGjNChQ4fk8XiiMid6os6xQZ1jgzrHDrWOjf6uszFGx44dU25u7meOTYjbjN1ut9xud1hbVlZWvzyXx+PhxR8D1Dk2qHNsUOfYodax0Z91zszM7NO4uHxI9vzzz1dKSopaWlrC2ltaWuTz+eKxJAAAYJG4BJS0tDRNnDhRdXV1TltXV5fq6urk9/vjsSQAAGCRuL3FU1FRobKyMk2aNElXXXWVnnjiCZ04ccK5qyfW3G63FixY0OOtJEQXdY4N6hwb1Dl2qHVs2FTnJGP6cq9P//jXf/1X/eAHP1AgEND48eO1fPlyTZ48OV7LAQAAlohrQAEAAOgN38UDAACsQ0ABAADWIaAAAADrEFAAAIB1CCiSqqurddFFFyk9PV2TJ0/Wzp07472khPLqq6/qpptuUm5urpKSkrRhw4awfmOM5s+fr+HDh2vQoEEqLCzUO++8EzbmyJEjmj59ujwej7KysjRjxgwdP348hruw35IlS3TllVdq6NChysnJ0S233KKmpqawMadOnVJ5ebmGDRumIUOGqLS0tMcvRGxublZJSYkyMjKUk5OjuXPnqqOjI5ZbsdqKFSs0btw45zdp+v1+vfjii04/Ne4fjz/+uJKSkjR79mynjVpHR1VVlZKSksKO0aNHO/3W1vnsv/ovsa1du9akpaWZZ555xuzfv9/ce++9Jisry7S0tMR7aQlj06ZN5p/+6Z/ML37xCyOpx5dAPv744yYzM9Ns2LDB/O///q/5+te/bkaNGmX+8pe/OGOuv/56c8UVV5jt27eb//mf/zGXXHKJufPOO2O8E7sVFxeb1atXm3379pk9e/aYG264wYwcOdIcP37cGfPtb3/bjBgxwtTV1Zndu3ebgoIC8+Uvf9np7+joMGPHjjWFhYXmrbfeMps2bTLnn3++qaysjMeWrPTcc8+ZF154
wfzmN78xTU1N5nvf+55xuVxm3759xhhq3B927txpLrroIjNu3DjzwAMPOO3UOjoWLFhgvvjFL5rDhw87x//93/85/bbW+ZwPKFdddZUpLy93zjs7O01ubq5ZsmRJHFeVuD4eULq6uozP5zM/+MEPnLa2tjbjdrvNf/3XfxljjDlw4ICRZHbt2uWMefHFF01SUpL505/+FLO1J5rW1lYjydTX1xtjPqyry+Uy69atc8a8/fbbRpJpaGgwxnwYJpOTk00gEHDGrFixwng8HhMMBmO7gQRy3nnnmf/4j/+gxv3g2LFj5tJLLzW1tbXmq1/9qhNQqHX0LFiwwFxxxRW99tlc53P6LZ7Tp0+rsbFRhYWFTltycrIKCwvV0NAQx5UNHAcPHlQgEAircWZmpiZPnuzUuKGhQVlZWZo0aZIzprCwUMnJydqxY0fM15wojh49KknKzs6WJDU2NioUCoXVevTo0Ro5cmRYrS+//PKwbxIvLi5We3u79u/fH8PVJ4bOzk6tXbtWJ06ckN/vp8b9oLy8XCUlJWE1lXg9R9s777yj3NxcXXzxxZo+fbqam5sl2V3nhPg24/7y5z//WZ2dnWFFlySv16tf//rXcVrVwBIIBCSp1xp39wUCAeXk5IT1p6amKjs72xmDcF1dXZo9e7auvvpqjR07VtKHdUxLS+vxTd8fr3Vv/y26+/ChvXv3yu/369SpUxoyZIjWr1+v/Px87dmzhxpH0dq1a/Xmm29q165dPfp4PUfP5MmTtWbNGl122WU6fPiwFi5cqK985Svat2+f1XU+pwMKkKjKy8u1b98+vfbaa/FeyoB02WWXac+ePTp69Kh+/vOfq6ysTPX19fFe1oBy6NAhPfDAA6qtrVV6enq8lzOgTZs2zfn7uHHjNHnyZOXl5elnP/uZBg0aFMeVfbpz+i2e888/XykpKT0+rdzS0iKfzxenVQ0s3XX8tBr7fD61traG9Xd0dOjIkSP8d+jFrFmztHHjRr388su68MILnXafz6fTp0+rra0tbPzHa93bf4vuPnwoLS1Nl1xyiSZOnKglS5boiiuu0JNPPkmNo6ixsVGtra360pe+pNTUVKWmpqq+vl7Lly9XamqqvF4vte4nWVlZ+sIXvqB3333X6tf0OR1Q0tLSNHHiRNXV1TltXV1dqqurk9/vj+PKBo5Ro0bJ5/OF1bi9vV07duxwauz3+9XW1qbGxkZnzLZt29TV1cWXR36EMUazZs3S+vXrtW3bNo0aNSqsf+LEiXK5XGG1bmpqUnNzc1it9+7dGxYIa2tr5fF4lJ+fH5uNJKCuri4Fg0FqHEVTpkzR3r17tWfPHueYNGmSpk+f7vydWveP48eP67e//a2GDx9u92u63z5+myDWrl1r3G63WbNmjTlw4IC57777TFZWVtinlfHpjh07Zt566y3z1ltvGUnmRz/6kXnrrbfMH/7wB2PMh7cZZ2VlmV/+8pfmV7/6lbn55pt7vc14woQJZseOHea1114zl156KbcZf8zMmTNNZmameeWVV8JuFzx58qQz5tvf/rYZOXKk2bZtm9m9e7fx+/3G7/c7/d23CxYVFZk9e/aYzZs3mwsuuIDbMj/i4YcfNvX19ebgwYPmV7/6lXn44YdNUlKS2bJlizGGGvenj97FYwy1jpYHH3zQvPLKK+bgwYPm9ddfN4WFheb88883ra2txhh763zOBxRjjHnqqafMyJEjTVpamrnqqqvM9u3b472khPLyyy8bST2OsrIyY8yHtxo/8sgjxuv1GrfbbaZMmWKamprC5nj//ffNnXfeaYYMGWI8Ho/55je/aY4dOxaH3dirtxpLMqtXr3bG/OUvfzHf+c53zHnnnWcyMjLMrbfeag4fPhw2z+9//3szbdo0M2jQIHP++eebBx980IRCoRjvxl7f+ta3TF5enklLSzMXXHCBmTJlihNOjKHG/enjAYVaR8ftt99uhg8fbtLS0sznPvc5c/vtt5t3333X6be1zknGGNN/12cAAAAid05/BgUAANiJgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1vl/gvKK3Dyq3sYAAAAASUVORK5CYII=", 826 | "text/plain": [ 827 | "
" 828 | ] 829 | }, 830 | "metadata": {}, 831 | "output_type": "display_data" 832 | } 833 | ], 834 | "source": [ 835 | "df = pd.DataFrame(shortened, columns = ['text'])\n", 836 | "df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n", 837 | "df.n_tokens.hist()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 29, 843 | "metadata": {}, 844 | "outputs": [ 845 | { 846 | "data": { 847 | "text/html": [ 848 | "
[text/html DataFrame preview omitted: HTML tags were stripped during extraction; the same five rows appear in the text/plain output below]
" 905 | ], 906 | "text/plain": [ 907 | " text n_tokens \\\n", 908 | "0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n", 909 | "1 blog authors tom. Tom Brown - OpenAI ... 228 \n", 910 | "2 blog openai scholars 2019 final projects. Op... 492 \n", 911 | "3 In this project, I used curiosity-driven explo... 478 \n", 912 | "4 Results revealed that the optimal RL policies ... 499 \n", 913 | "\n", 914 | " embeddings \n", 915 | "0 [-0.012958061881363392, -0.006103983614593744,... \n", 916 | "1 [-0.0053874170407652855, -0.009962032549083233... \n", 917 | "2 [0.0019150723237544298, -0.0070442273281514645... \n", 918 | "3 [-0.0067560747265815735, 0.0004431474662851542... \n", 919 | "4 [-0.012868616729974747, 0.0029640409629791975,... " 920 | ] 921 | }, 922 | "execution_count": 29, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "import openai\n", 929 | "\n", 930 | "df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])\n", 931 | "df.to_csv('processed/embeddings.csv')\n", 932 | "df.head()" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 31, 938 | "metadata": {}, 939 | "outputs": [ 940 | { 941 | "data": { 942 | "text/html": [ 943 | "
[text/html DataFrame preview omitted: HTML tags were stripped during extraction; the same five rows appear in the text/plain output below]
" 1000 | ], 1001 | "text/plain": [ 1002 | " text n_tokens \\\n", 1003 | "0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n", 1004 | "1 blog authors tom. Tom Brown - OpenAI ... 228 \n", 1005 | "2 blog openai scholars 2019 final projects. Op... 492 \n", 1006 | "3 In this project, I used curiosity-driven explo... 478 \n", 1007 | "4 Results revealed that the optimal RL policies ... 499 \n", 1008 | "\n", 1009 | " embeddings \n", 1010 | "0 [-0.012958061881363392, -0.006103983614593744,... \n", 1011 | "1 [-0.0053874170407652855, -0.009962032549083233... \n", 1012 | "2 [0.0019150723237544298, -0.0070442273281514645... \n", 1013 | "3 [-0.0067560747265815735, 0.0004431474662851542... \n", 1014 | "4 [-0.012868616729974747, 0.0029640409629791975,... " 1015 | ] 1016 | }, 1017 | "execution_count": 31, 1018 | "metadata": {}, 1019 | "output_type": "execute_result" 1020 | } 1021 | ], 1022 | "source": [ 1023 | "import pandas as pd\n", 1024 | "import numpy as np\n", 1025 | "from ast import literal_eval\n", 1026 | "from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n", 1027 | "\n", 1028 | "df=pd.read_csv('processed/embeddings.csv', index_col=0)\n", 1029 | "df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)\n", 1030 | "\n", 1031 | "df.head()" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 32, 1037 | "metadata": {}, 1038 | "outputs": [ 1039 | { 1040 | "data": { 1041 | "text/plain": [ 1042 | "'No, you are not allowed to publish model outputs to Twitter without a human review. You must manually review each generation before sharing or while streaming, and indicate that the content is AI-generated in a way no user could reasonably miss or misunderstand.'" 1043 | ] 1044 | }, 1045 | "execution_count": 32, 1046 | "metadata": {}, 1047 | "output_type": "execute_result" 1048 | } 1049 | ], 1050 | "source": [ 1051 | "def create_context(\n", 1052 | " question, df, max_len=1800, size=\"ada\"\n", 1053 | "):\n", 1054 | " \"\"\"\n", 1055 | " Create a context for a question by finding the most similar context from the dataframe\n", 1056 | " \"\"\"\n", 1057 | "\n", 1058 | " # Get the embeddings for the question\n", 1059 | " q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']\n", 1060 | "\n", 1061 | " # Get the distances from the embeddings\n", 1062 | " df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')\n", 1063 | "\n", 1064 | "\n", 1065 | " returns = []\n", 1066 | " cur_len = 0\n", 1067 | "\n", 1068 | " # Sort by distance and add the text to the context until the context is too long\n", 1069 | " for i, row in df.sort_values('distances', ascending=True).iterrows():\n", 1070 | " \n", 1071 | " # Add the length of the text to the current length\n", 1072 | " cur_len += row['n_tokens'] + 4\n", 1073 | " \n", 1074 | " # If the context is too long, break\n", 1075 | " if cur_len > max_len:\n", 1076 | " break\n", 1077 | " \n", 1078 | " # Else add it to the text that is being returned\n", 1079 | " returns.append(row[\"text\"])\n", 1080 | "\n", 1081 | " # Return the context\n", 1082 | " return \"\\n\\n###\\n\\n\".join(returns)\n", 1083 | "\n", 1084 | "def answer_question(\n", 1085 | " df,\n", 1086 | " model=\"text-davinci-003\",\n", 1087 | " question=\"Am I allowed to publish model outputs to Twitter, without a human review?\",\n", 1088 | " max_len=1800,\n", 1089 | " size=\"ada\",\n", 1090 | " debug=False,\n", 1091 | " 
max_tokens=150,\n", 1092 | " stop_sequence=None\n", 1093 | "):\n", 1094 | " \"\"\"\n", 1095 | " Answer a question based on the most similar context from the dataframe texts\n", 1096 | " \"\"\"\n", 1097 | " context = create_context(\n", 1098 | " question,\n", 1099 | " df,\n", 1100 | " max_len=max_len,\n", 1101 | " size=size,\n", 1102 | " )\n", 1103 | " # If debug, print the raw model response\n", 1104 | " if debug:\n", 1105 | " print(\"Context:\\n\" + context)\n", 1106 | " print(\"\\n\\n\")\n", 1107 | "\n", 1108 | " try:\n", 1109 | " # Create a completions using the question and context\n", 1110 | " response = openai.Completion.create(\n", 1111 | " prompt=f\"Answer the question based on the context below, and if the question can't be answered based on the context, say \\\"I don't know\\\"\\n\\nContext: {context}\\n\\n---\\n\\nQuestion: {question}\\nAnswer:\",\n", 1112 | " temperature=0,\n", 1113 | " max_tokens=max_tokens,\n", 1114 | " top_p=1,\n", 1115 | " frequency_penalty=0,\n", 1116 | " presence_penalty=0,\n", 1117 | " stop=stop_sequence,\n", 1118 | " model=model,\n", 1119 | " )\n", 1120 | " return response[\"choices\"][0][\"text\"].strip()\n", 1121 | " except Exception as e:\n", 1122 | " print(e)\n", 1123 | " return \"\"" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": 33, 1129 | "metadata": {}, 1130 | "outputs": [ 1131 | { 1132 | "data": { 1133 | "text/plain": [ 1134 | "\"I don't know.\"" 1135 | ] 1136 | }, 1137 | "execution_count": 33, 1138 | "metadata": {}, 1139 | "output_type": "execute_result" 1140 | } 1141 | ], 1142 | "source": [ 1143 | "answer_question(df, question=\"What day is it?\", debug=False)" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 34, 1149 | "metadata": {}, 1150 | "outputs": [ 1151 | { 1152 | "data": { 1153 | "text/plain": [ 1154 | "'The newest embeddings model is text-embedding-ada-002.'" 1155 | ] 1156 | }, 1157 | "execution_count": 34, 1158 | "metadata": {}, 1159 | "output_type": "execute_result" 1160 | } 1161 | ], 1162 | "source": [ 1163 | "answer_question(df, question=\"What is our newest embeddings model?\")" 1164 | ] 1165 | } 1166 | ], 1167 | "metadata": { 1168 | "kernelspec": { 1169 | "display_name": "env", 1170 | "language": "python", 1171 | "name": "python3" 1172 | }, 1173 | "language_info": { 1174 | "codemirror_mode": { 1175 | "name": "ipython", 1176 | "version": 3 1177 | }, 1178 | "file_extension": ".py", 1179 | "mimetype": "text/x-python", 1180 | "name": "python", 1181 | "nbconvert_exporter": "python", 1182 | "pygments_lexer": "ipython3", 1183 | "version": "3.9.6" 1184 | }, 1185 | "orig_nbformat": 4, 1186 | "vscode": { 1187 | "interpreter": { 1188 | "hash": "05f34a34d73b71652304030c1097be3a5720ea2447153dd6542d145a26b73181" 1189 | } 1190 | } 1191 | }, 1192 | "nbformat": 4, 1193 | "nbformat_minor": 2 1194 | } 1195 | -------------------------------------------------------------------------------- /web-qa.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### Step 1 3 | ################################################################################ 4 | 5 | import requests 6 | import re 7 | import urllib.request 8 | from bs4 import BeautifulSoup 9 | from collections import deque 10 | from html.parser import HTMLParser 11 | from urllib.parse import urlparse 12 | import os 13 | import pandas as pd 14 | import tiktoken 15 | import openai 16 | import numpy as np 17 | from 
openai.embeddings_utils import distances_from_embeddings, cosine_similarity 18 | from ast import literal_eval 19 | 20 | # Regex pattern to match a URL 21 | HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$' 22 | 23 | # Define OpenAI api_key 24 | # openai.api_key = '' 25 | 26 | # Define root domain to crawl 27 | domain = "openai.com" 28 | full_url = "https://openai.com/" 29 | 30 | # Create a class to parse the HTML and get the hyperlinks 31 | class HyperlinkParser(HTMLParser): 32 | def __init__(self): 33 | super().__init__() 34 | # Create a list to store the hyperlinks 35 | self.hyperlinks = [] 36 | 37 | # Override the HTMLParser's handle_starttag method to get the hyperlinks 38 | def handle_starttag(self, tag, attrs): 39 | attrs = dict(attrs) 40 | 41 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks 42 | if tag == "a" and "href" in attrs: 43 | self.hyperlinks.append(attrs["href"]) 44 | 45 | ################################################################################ 46 | ### Step 2 47 | ################################################################################ 48 | 49 | # Function to get the hyperlinks from a URL 50 | def get_hyperlinks(url): 51 | 52 | # Try to open the URL and read the HTML 53 | try: 54 | # Open the URL and read the HTML 55 | with urllib.request.urlopen(url) as response: 56 | 57 | # If the response is not HTML, return an empty list 58 | if not response.info().get('Content-Type').startswith("text/html"): 59 | return [] 60 | 61 | # Decode the HTML 62 | html = response.read().decode('utf-8') 63 | except Exception as e: 64 | print(e) 65 | return [] 66 | 67 | # Create the HTML Parser and then Parse the HTML to get hyperlinks 68 | parser = HyperlinkParser() 69 | parser.feed(html) 70 | 71 | return parser.hyperlinks 72 | 73 | ################################################################################ 74 | ### Step 3 75 | ################################################################################ 76 | 77 | # Function to get the hyperlinks from a URL that are within the same domain 78 | def get_domain_hyperlinks(local_domain, url): 79 | clean_links = [] 80 | for link in set(get_hyperlinks(url)): 81 | clean_link = None 82 | 83 | # If the link is a URL, check if it is within the same domain 84 | if re.search(HTTP_URL_PATTERN, link): 85 | # Parse the URL and check if the domain is the same 86 | url_obj = urlparse(link) 87 | if url_obj.netloc == local_domain: 88 | clean_link = link 89 | 90 | # If the link is not a URL, check if it is a relative link 91 | else: 92 | if link.startswith("/"): 93 | link = link[1:] 94 | elif ( 95 | link.startswith("#") 96 | or link.startswith("mailto:") 97 | or link.startswith("tel:") 98 | ): 99 | continue 100 | clean_link = "https://" + local_domain + "/" + link 101 | 102 | if clean_link is not None: 103 | if clean_link.endswith("/"): 104 | clean_link = clean_link[:-1] 105 | clean_links.append(clean_link) 106 | 107 | # Return the list of hyperlinks that are within the same domain 108 | return list(set(clean_links)) 109 | 110 | 111 | ################################################################################ 112 | ### Step 4 113 | ################################################################################ 114 | 115 | def crawl(url): 116 | # Parse the URL and get the domain 117 | local_domain = urlparse(url).netloc 118 | 119 | # Create a queue to store the URLs to crawl 120 | queue = deque([url]) 121 | 122 | # Create a set to store the URLs that have already been seen (no 
duplicates) 123 | seen = set([url]) 124 | 125 | # Create a directory to store the text files 126 | if not os.path.exists("text/"): 127 | os.mkdir("text/") 128 | 129 | if not os.path.exists("text/"+local_domain+"/"): 130 | os.mkdir("text/" + local_domain + "/") 131 | 132 | # Create a directory to store the csv files 133 | if not os.path.exists("processed"): 134 | os.mkdir("processed") 135 | 136 | # While the queue is not empty, continue crawling 137 | while queue: 138 | 139 | # Get the next URL from the queue 140 | url = queue.pop() 141 | print(url) # for debugging and to see the progress 142 | 143 | # Try extracting the text from the link, if failed proceed with the next item in the queue 144 | try: 145 | # Save text from the url to a .txt file 146 | with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f: 147 | 148 | # Get the text from the URL using BeautifulSoup 149 | soup = BeautifulSoup(requests.get(url).text, "html.parser") 150 | 151 | # Get the text but remove the tags 152 | text = soup.get_text() 153 | 154 | # If the crawler gets to a page that requires JavaScript, it will stop the crawl 155 | if ("You need to enable JavaScript to run this app." in text): 156 | print("Unable to parse page " + url + " due to JavaScript being required") 157 | 158 | # Otherwise, write the text to the file in the text directory 159 | f.write(text) 160 | except Exception as e: 161 | print("Unable to parse page " + url) 162 | 163 | # Get the hyperlinks from the URL and add them to the queue 164 | for link in get_domain_hyperlinks(local_domain, url): 165 | if link not in seen: 166 | queue.append(link) 167 | seen.add(link) 168 | 169 | crawl(full_url) 170 | 171 | ################################################################################ 172 | ### Step 5 173 | ################################################################################ 174 | 175 | def remove_newlines(serie): 176 | serie = serie.str.replace('\n', ' ') 177 | serie = serie.str.replace('\\n', ' ') 178 | serie = serie.str.replace(' ', ' ') 179 | serie = serie.str.replace(' ', ' ') 180 | return serie 181 | 182 | 183 | ################################################################################ 184 | ### Step 6 185 | ################################################################################ 186 | 187 | # Create a list to store the text files 188 | texts=[] 189 | 190 | # Get all the text files in the text directory 191 | for file in os.listdir("text/" + domain + "/"): 192 | 193 | # Open the file and read the text 194 | with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f: 195 | text = f.read() 196 | 197 | # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces. 198 | texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text)) 199 | 200 | # Create a dataframe from the list of texts 201 | df = pd.DataFrame(texts, columns = ['fname', 'text']) 202 | 203 | # Set the text column to be the raw text with the newlines removed 204 | df['text'] = df.fname + ". 
" + remove_newlines(df.text) 205 | df.to_csv('processed/scraped.csv') 206 | df.head() 207 | 208 | ################################################################################ 209 | ### Step 7 210 | ################################################################################ 211 | 212 | # Load the cl100k_base tokenizer which is designed to work with the ada-002 model 213 | tokenizer = tiktoken.get_encoding("cl100k_base") 214 | 215 | df = pd.read_csv('processed/scraped.csv', index_col=0) 216 | df.columns = ['title', 'text'] 217 | 218 | # Tokenize the text and save the number of tokens to a new column 219 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 220 | 221 | # Visualize the distribution of the number of tokens per row using a histogram 222 | df.n_tokens.hist() 223 | 224 | ################################################################################ 225 | ### Step 8 226 | ################################################################################ 227 | 228 | max_tokens = 500 229 | 230 | # Function to split the text into chunks of a maximum number of tokens 231 | def split_into_many(text, max_tokens = max_tokens): 232 | 233 | # Split the text into sentences 234 | sentences = text.split('. ') 235 | 236 | # Get the number of tokens for each sentence 237 | n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences] 238 | 239 | chunks = [] 240 | tokens_so_far = 0 241 | chunk = [] 242 | 243 | # Loop through the sentences and tokens joined together in a tuple 244 | for sentence, token in zip(sentences, n_tokens): 245 | 246 | # If the number of tokens so far plus the number of tokens in the current sentence is greater 247 | # than the max number of tokens, then add the chunk to the list of chunks and reset 248 | # the chunk and tokens so far 249 | if tokens_so_far + token > max_tokens: 250 | chunks.append(". ".join(chunk) + ".") 251 | chunk = [] 252 | tokens_so_far = 0 253 | 254 | # If the number of tokens in the current sentence is greater than the max number of 255 | # tokens, go to the next sentence 256 | if token > max_tokens: 257 | continue 258 | 259 | # Otherwise, add the sentence to the chunk and add the number of tokens to the total 260 | chunk.append(sentence) 261 | tokens_so_far += token + 1 262 | 263 | # Add the last chunk to the list of chunks 264 | if chunk: 265 | chunks.append(". 
".join(chunk) + ".") 266 | 267 | return chunks 268 | 269 | 270 | shortened = [] 271 | 272 | # Loop through the dataframe 273 | for row in df.iterrows(): 274 | 275 | # If the text is None, go to the next row 276 | if row[1]['text'] is None: 277 | continue 278 | 279 | # If the number of tokens is greater than the max number of tokens, split the text into chunks 280 | if row[1]['n_tokens'] > max_tokens: 281 | shortened += split_into_many(row[1]['text']) 282 | 283 | # Otherwise, add the text to the list of shortened texts 284 | else: 285 | shortened.append( row[1]['text'] ) 286 | 287 | ################################################################################ 288 | ### Step 9 289 | ################################################################################ 290 | 291 | df = pd.DataFrame(shortened, columns = ['text']) 292 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 293 | df.n_tokens.hist() 294 | 295 | ################################################################################ 296 | ### Step 10 297 | ################################################################################ 298 | 299 | # Note that you may run into rate limit issues depending on how many files you try to embed 300 | # Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits 301 | 302 | df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']) 303 | df.to_csv('processed/embeddings.csv') 304 | df.head() 305 | 306 | ################################################################################ 307 | ### Step 11 308 | ################################################################################ 309 | 310 | df=pd.read_csv('processed/embeddings.csv', index_col=0) 311 | df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array) 312 | 313 | df.head() 314 | 315 | ################################################################################ 316 | ### Step 12 317 | ################################################################################ 318 | 319 | def create_context( 320 | question, df, max_len=1800, size="ada" 321 | ): 322 | """ 323 | Create a context for a question by finding the most similar context from the dataframe 324 | """ 325 | 326 | # Get the embeddings for the question 327 | q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding'] 328 | 329 | # Get the distances from the embeddings 330 | df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine') 331 | 332 | 333 | returns = [] 334 | cur_len = 0 335 | 336 | # Sort by distance and add the text to the context until the context is too long 337 | for i, row in df.sort_values('distances', ascending=True).iterrows(): 338 | 339 | # Add the length of the text to the current length 340 | cur_len += row['n_tokens'] + 4 341 | 342 | # If the context is too long, break 343 | if cur_len > max_len: 344 | break 345 | 346 | # Else add it to the text that is being returned 347 | returns.append(row["text"]) 348 | 349 | # Return the context 350 | return "\n\n###\n\n".join(returns) 351 | 352 | def answer_question( 353 | df, 354 | model="text-davinci-003", 355 | question="Am I allowed to publish model outputs to Twitter, without a human review?", 356 | max_len=1800, 357 | size="ada", 358 | debug=False, 359 | max_tokens=150, 360 | stop_sequence=None 361 | ): 362 | """ 363 | Answer a 
question based on the most similar context from the dataframe texts 364 | """ 365 | context = create_context( 366 | question, 367 | df, 368 | max_len=max_len, 369 | size=size, 370 | ) 371 | # If debug, print the raw model response 372 | if debug: 373 | print("Context:\n" + context) 374 | print("\n\n") 375 | 376 | try: 377 | # Create a completions using the questin and context 378 | response = openai.Completion.create( 379 | prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:", 380 | temperature=0, 381 | max_tokens=max_tokens, 382 | top_p=1, 383 | frequency_penalty=0, 384 | presence_penalty=0, 385 | stop=stop_sequence, 386 | model=model, 387 | ) 388 | return response["choices"][0]["text"].strip() 389 | except Exception as e: 390 | print(e) 391 | return "" 392 | 393 | ################################################################################ 394 | ### Step 13 395 | ################################################################################ 396 | 397 | print(answer_question(df, question="What day is it?", debug=False)) 398 | 399 | print(answer_question(df, question="What is our newest embeddings model?")) 400 | --------------------------------------------------------------------------------
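create_context above relies on distances_from_embeddings from openai.embeddings_utils (called with distance_metric='cosine') to rank every stored chunk by cosine distance to the question embedding, then walks that ranking and appends chunks until the max_len token budget is reached. The snippet below is a minimal sketch of that cosine-distance ranking using only NumPy; question_embedding and chunk_embeddings are hypothetical stand-ins for the vectors returned by openai.Embedding.create, and the helper illustrates the metric rather than replacing the library call.

import numpy as np

def cosine_distances(query, matrix):
    """Cosine distance (1 - cosine similarity) between one query vector
    and each row of a matrix of stored embeddings."""
    query = np.asarray(query, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    # Normalise both sides, then a dot product per row gives the cosine similarity.
    query = query / np.linalg.norm(query)
    rows = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    return 1.0 - rows @ query

# Hypothetical stand-ins for the question embedding and three stored chunk embeddings.
question_embedding = [0.1, 0.3, 0.5]
chunk_embeddings = [
    [0.1, 0.3, 0.5],     # same direction as the question -> distance near 0
    [0.5, 0.1, 0.0],     # partially related
    [-0.1, -0.3, -0.5],  # opposite direction -> distance near 2
]

distances = cosine_distances(question_embedding, chunk_embeddings)
ranking = np.argsort(distances)  # closest chunks first
print(distances, ranking)

Sorting these distances in ascending order reproduces the df.sort_values('distances', ascending=True) ordering that create_context uses before accumulating chunks up to the token limit.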