├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── data ├── bbc-news-data-embedding.csv └── bbc-news-data.csv ├── notebooks ├── 00-explore-data.ipynb ├── 01-get-embeddings.ipynb ├── 02-visualise-embeddings.ipynb ├── 03-classify-documents.ipynb ├── 04-summarize-documents.ipynb ├── 05-extract-key-information.ipynb ├── 06-extract-key-words.ipynb ├── 07-semantic-search.ipynb ├── 08-retrieve-information.ipynb └── 09-unstructure-data-to-structured-data.ipynb └── output ├── key_info.csv ├── keywords.csv ├── predictions.csv └── summaries.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | bbc-news-data-embedding.csv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # git 2 | .gitattributes 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Dragon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Document Analysis using OpenAI GPT-3 2 | This repository provides a set of examples for performing document analysis using OpenAI's GPT-3 language model. They are: 3 | 4 | 1. **Data Exploration** - Explore data used in this repo. 5 | 2. **Get Embeddings** - Generate embeddings from documents using GPT-3. 6 | 3. **Visualise Embeddings** - Visualise embeddings in a 3D plot. 7 | 4. **Classify Documents** - Use GPT-3 to classify documents into different categories. 8 | 5. **Summarize Documents** - Automatically generate summaries of documents using GPT-3. 9 | 6. **Extract Key Information** - Identify key information in documents using GPT-3. 10 | 7. **Extract Key Words** - Extract important words from documents using GPT-3. 11 | 8. **Semantic Search** - Retrieve relevant documents. 12 | 9. **Retrieve Information based on Context** - Answer a query based on given context. 13 | 10. **Unstructured Data to Structured Data** - Extract specified entities and put into a table. 14 | 15 | # BBC News Articles Analysis 16 | This project is a data analysis of the BBC news dataset. The goal of this project is to explore the data, classify documents into categories, summarize documents, extract key information from documents and extract keywords from documents. 17 | 18 | ## Dataset 19 | The dataset used in this project is the [BBC News Archive](https://www.kaggle.com/datasets/hgultekin/bbcnewsarchive) available from [kaggle](https://www.kaggle.com). It contains 2225 articles from the BBC news website with 5 different categories: business, entertainment, politics, sport and tech. Each article has a category, filename, title and text. 20 | 21 | ## Language Model 22 | Examples in this repo use `text-davinci-003`. 
23 | ## Notebooks 24 | This project consists of notebooks that perform the following tasks: 25 | 26 | 1. [00-explore-data.ipynb](./notebooks/00-explore-data.ipynb) - This notebook explores the data by looking at the distribution of classes, number of words per document, etc. 27 | 2. [01-get-embeddings.ipynb](./notebooks/01-get-embeddings.ipynb) - This notebook uses pre-trained word embeddings to create vector representations for each document. 28 | 3. [02-visualise-embeddings.ipynb](./notebooks/02-visualise-embeddings.ipynb) - This notebook visualises word embeddings in a 3D plot. 29 | 4. [03-classify-documents.ipynb](./notebooks/03-classify-documents.ipynb) - This notebook builds classification models using Random Forest and XGBoost to predict the class of each document. 30 | 5. [04-summarize-documents.ipynb](./notebooks/04-summarize-documents.ipynb) - This notebook uses GPT-3 to generate summaries for each document. 31 | 6. [05-extract-key-information.ipynb](./notebooks/05-extract-key-information.ipynb) - This notebook extracts key information from each document such as people, organizations, locations, etc. 32 | 7. [06-extract-key-words.ipynb](./notebooks/06-extract-key-words.ipynb) - This notebook extracts important keywords from each document. 33 | 8. [07-semantic-search.ipynb](./notebooks/07-semantic-search.ipynb) - This notebook performs semantic search to retrieve the most relevant news from a specific corpus, by comparing the similarity of the embeddings of the query to that of the text corpus. 34 | 9. [08-retrieve-information.ipynb](./notebooks/08-retrieve-information.ipynb) - This notebook retrieves information based on a given context. This is achieved by constructing the prompt with context. 35 | 10. [09-unstructure-data-to-structured-data.ipynb](./notebooks/09-unstructure-data-to-structured-data.ipynb) - This notebook extracts specified entities and arranges them into a table. 
36 | 37 | ## Output 38 | The output of this project is stored in the `output` directory. It contains the following files: 39 | 40 | * [predictions.csv](./output/predictions.csv) - Predicted classes for each document 41 | * [summaries.csv](./output/summaries.csv) - Generated summaries for each document 42 | * [key_info.csv](./output/key_info.csv) - Extracted key information from each document 43 | * [keywords.csv](./output/keywords.csv) - Extracted keywords from each document 44 | * `models/rf.pkl` - Trained Random Forest model 45 | * `models/xgb.pkl` - Trained XGBoost model 46 | 47 | *Note: This README.md is co-authored with `text-davinci-003`.* 48 | 49 | ## References 50 | - OpenAI repo: https://github.com/openai/openai-cookbook/ 51 | - Which embedding model to use? https://openai.com/blog/new-and-improved-embedding-model/ 52 | 53 | -------------------------------------------------------------------------------- /data/bbc-news-data-embedding.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:969f464442857ad6f2e31fdb652a434fad25ef208697ab22eb8605e77a69792f 3 | size 622883109 4 | -------------------------------------------------------------------------------- /notebooks/01-get-embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Get Embeddings" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "attachments": {}, 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Set up Azure OpenAI" 30 
| ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "True" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "import os\n", 50 | "import openai\n", 51 | "from dotenv import load_dotenv\n", 52 | "\n", 53 | "# Set up Azure OpenAI\n", 54 | "load_dotenv()\n", 55 | "openai.api_type = \"azure\"\n", 56 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 57 | "openai.api_version = \"2022-12-01\"\n", 58 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 59 | ] 60 | }, 61 | { 62 | "attachments": {}, 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Load Data" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/html": [ 77 | "
\n", 78 | "\n", 91 | "\n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | "
categoryfilenametitlecontent
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...
...............
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...
2222tech399.txtBe careful how you codeA new European directive could put software w...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...
\n", 181 | "

2225 rows × 4 columns

\n", 182 | "
" 183 | ], 184 | "text/plain": [ 185 | " category filename title \\\n", 186 | "0 business 001.txt Ad sales boost Time Warner profit \n", 187 | "1 business 002.txt Dollar gains on Greenspan speech \n", 188 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 189 | "3 business 004.txt High fuel prices hit BA's profits \n", 190 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 191 | "... ... ... ... \n", 192 | "2220 tech 397.txt BT program to beat dialler scams \n", 193 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 194 | "2222 tech 399.txt Be careful how you code \n", 195 | "2223 tech 400.txt US cyber security chief resigns \n", 196 | "2224 tech 401.txt Losing yourself in online gaming \n", 197 | "\n", 198 | " content \n", 199 | "0 Quarterly profits at US media giant TimeWarne... \n", 200 | "1 The dollar has hit its highest level against ... \n", 201 | "2 The owners of embattled Russian oil giant Yuk... \n", 202 | "3 British Airways has blamed high fuel prices f... \n", 203 | "4 Shares in UK drinks and food firm Allied Dome... \n", 204 | "... ... \n", 205 | "2220 BT is introducing two initiatives to help bea... \n", 206 | "2221 Computer users across the world continue to i... \n", 207 | "2222 A new European directive could put software w... \n", 208 | "2223 The man making sure US computer networks are ... \n", 209 | "2224 Online role playing games are time-consuming,... 
\n", 210 | "\n", 211 | "[2225 rows x 4 columns]" 212 | ] 213 | }, 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "import pandas as pd\n", 221 | "\n", 222 | "df_orig = pd.read_csv(\"../data/bbc-news-data.csv\", delimiter='\\t')\n", 223 | "df = df_orig.copy()\n", 224 | "df" 225 | ] 226 | }, 227 | { 228 | "attachments": {}, 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Deploy a model" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 4, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Found a succeeded deployment that supports embeddings with id: deployment-89153abdfa934e1580296dbee586239b.\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "# list models deployed with embeddings capability\n", 250 | "deployment_id = None\n", 251 | "result = openai.Deployment.list()\n", 252 | "\n", 253 | "for deployment in result.data:\n", 254 | " if deployment[\"status\"] != \"succeeded\":\n", 255 | " continue\n", 256 | " \n", 257 | " model = openai.Model.retrieve(deployment[\"model\"])\n", 258 | " if model[\"capabilities\"][\"embeddings\"] != True:\n", 259 | " continue\n", 260 | " \n", 261 | " deployment_id = deployment[\"id\"]\n", 262 | " break\n", 263 | "\n", 264 | "# if not model deployed, deploy one\n", 265 | "if not deployment_id:\n", 266 | " print('No deployment with status: succeeded found.')\n", 267 | " model = \"text-similarity-davinci-001\"\n", 268 | "\n", 269 | " # Now let's create the deployment\n", 270 | " print(f'Creating a new deployment with model: {model}')\n", 271 | " result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 272 | " deployment_id = result[\"id\"]\n", 273 | " print(f'Successfully created {model} with deployment_id {deployment_id}')\n", 274 | "else:\n", 275 | " print(f'Found a succeeded 
deployment that supports embeddings with id: {deployment_id}.')" 276 | ] 277 | }, 278 | { 279 | "attachments": {}, 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## Get Embeddings\n", 284 | "ref: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings?tabs=bash" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 5, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "12288" 296 | ] 297 | }, 298 | "execution_count": 5, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "embedding = openai.Embedding.create(\n", 305 | " input=\"Your text goes here\",\n", 306 | " deployment_id=deployment_id)\n", 307 | "\n", 308 | "# embedding = openai.Embedding.create(\n", 309 | "# input=\"Your text goes here\",\n", 310 | "# model=\"text-similarity-davinci-001\") # <-- this will fail\n", 311 | "\n", 312 | "### embedding\n", 313 | "# embedding[\"data\"][0][\"embedding\"]\n", 314 | "len(embedding[\"data\"][0][\"embedding\"])" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 6, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "762" 326 | ] 327 | }, 328 | "execution_count": 6, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | }, 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 4300 tokens (4300 in your prompt; 0 for the completion). 
Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 337 | ] 338 | }, 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "765" 343 | ] 344 | }, 345 | "execution_count": 6, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | }, 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 2940 tokens (2940 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 354 | ] 355 | }, 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "862" 360 | ] 361 | }, 362 | "execution_count": 6, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 3046 tokens (3046 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 371 | ] 372 | }, 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "1185" 377 | ] 378 | }, 379 | "execution_count": 6, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | }, 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 5267 tokens (5267 in your prompt; 0 for the completion). 
Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 388 | ] 389 | }, 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "1188" 394 | ] 395 | }, 396 | "execution_count": 6, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | }, 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 3040 tokens (3040 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 405 | ] 406 | }, 407 | { 408 | "data": { 409 | "text/plain": [ 410 | "1275" 411 | ] 412 | }, 413 | "execution_count": 6, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | }, 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 3842 tokens (3842 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 422 | ] 423 | }, 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "1683" 428 | ] 429 | }, 430 | "execution_count": 6, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | }, 434 | { 435 | "name": "stdout", 436 | "output_type": "stream", 437 | "text": [ 438 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 2059 tokens (2059 in your prompt; 0 for the completion). 
Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 439 | ] 440 | }, 441 | { 442 | "data": { 443 | "text/plain": [ 444 | "2224" 445 | ] 446 | }, 447 | "execution_count": 6, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | }, 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 2046 tokens, however you requested 3545 tokens (3545 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "df['embedding'] = ''\n", 461 | "\n", 462 | "for i in range(len(df)): \n", 463 | "#for i in range(760,765):\n", 464 | " try:\n", 465 | " embedding = openai.Embedding.create(input=df['content'][i], deployment_id=deployment_id)\n", 466 | " #len(embedding[\"data\"][0][\"embedding\"])\n", 467 | " df['embedding'][i] = embedding['data'][0]['embedding']\n", 468 | " except Exception as err:\n", 469 | " i\n", 470 | " print(f\"Unexpected {err=}, {type(err)=}\")\n", 471 | " #raise" 472 | ] 473 | }, 474 | { 475 | "attachments": {}, 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "## Save data" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 7, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "#df.to_csv(\"../data/bbc-news-data-embedding.csv\", sep='\\t', index=False)" 489 | ] 490 | } 491 | ], 492 | "metadata": { 493 | "kernelspec": { 494 | "display_name": "azureml_py38", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | 
"pygments_lexer": "ipython3", 508 | "version": "3.8.5" 509 | }, 510 | "orig_nbformat": 4, 511 | "vscode": { 512 | "interpreter": { 513 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 514 | } 515 | } 516 | }, 517 | "nbformat": 4, 518 | "nbformat_minor": 2 519 | } 520 | -------------------------------------------------------------------------------- /notebooks/04-summarize-documents.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Summarise Documents" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 8, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "The autoreload extension is already loaded. To reload it, use:\n", 21 | " %reload_ext autoreload\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%load_ext autoreload\n", 27 | "%autoreload 2\n", 28 | "\n", 29 | "from IPython.core.interactiveshell import InteractiveShell\n", 30 | "InteractiveShell.ast_node_interactivity = \"all\"" 31 | ] 32 | }, 33 | { 34 | "attachments": {}, 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Set up Azure OpenAI" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "True" 50 | ] 51 | }, 52 | "execution_count": 9, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "import os\n", 59 | "import openai\n", 60 | "from dotenv import load_dotenv\n", 61 | "\n", 62 | "# Set up Azure OpenAI\n", 63 | "load_dotenv()\n", 64 | "openai.api_type = \"azure\"\n", 65 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 66 | "openai.api_version = \"2022-12-01\"\n", 67 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 68 | ] 69 | 
}, 70 | { 71 | "attachments": {}, 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Load Data" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 10, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import pandas as pd\n", 85 | "\n", 86 | "df_orig = pd.read_csv(\"../data/bbc-news-data.csv\", delimiter='\\t', index_col=False)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 11, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
\n", 98 | "\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
categoryfilenametitlecontent
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...
...............
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...
2222tech399.txtBe careful how you codeA new European directive could put software w...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...
\n", 201 | "

2225 rows × 4 columns

\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " category filename title \\\n", 206 | "0 business 001.txt Ad sales boost Time Warner profit \n", 207 | "1 business 002.txt Dollar gains on Greenspan speech \n", 208 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 209 | "3 business 004.txt High fuel prices hit BA's profits \n", 210 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 211 | "... ... ... ... \n", 212 | "2220 tech 397.txt BT program to beat dialler scams \n", 213 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 214 | "2222 tech 399.txt Be careful how you code \n", 215 | "2223 tech 400.txt US cyber security chief resigns \n", 216 | "2224 tech 401.txt Losing yourself in online gaming \n", 217 | "\n", 218 | " content \n", 219 | "0 Quarterly profits at US media giant TimeWarne... \n", 220 | "1 The dollar has hit its highest level against ... \n", 221 | "2 The owners of embattled Russian oil giant Yuk... \n", 222 | "3 British Airways has blamed high fuel prices f... \n", 223 | "4 Shares in UK drinks and food firm Allied Dome... \n", 224 | "... ... \n", 225 | "2220 BT is introducing two initiatives to help bea... \n", 226 | "2221 Computer users across the world continue to i... \n", 227 | "2222 A new European directive could put software w... \n", 228 | "2223 The man making sure US computer networks are ... \n", 229 | "2224 Online role playing games are time-consuming,... 
\n", 230 | "\n", 231 | "[2225 rows x 4 columns]" 232 | ] 233 | }, 234 | "execution_count": 11, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "df = df_orig.copy()\n", 241 | "df" 242 | ] 243 | }, 244 | { 245 | "attachments": {}, 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Request to API" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 12, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "'Ad sales boost Time Warner profit\\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\\'s existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding. Time Warner\\'s fourth quarter profits were slightly better than analysts\\' expectations. 
But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. \"Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility,\" chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann\\'s purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake. \\n \\n\\nTl;dr\\n'" 261 | ] 262 | }, 263 | "execution_count": 12, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "# create prompt\n", 270 | "prompt_postfix = \"\"\" \n", 271 | " \\n\\nTl;dr\n", 272 | "\"\"\"\n", 273 | "\n", 274 | "prompt = df['title'].loc[0] + \"\\n\" + df['content'].loc[0] + prompt_postfix\n", 275 | "prompt" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 13, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "\"Time Warner's quarterly profits jumped by 76% to $1.13 billion, benefiting from sales of high-speed internet connections and higher advertisement sales. 
Fourth quarter sales rose 2%, however AOL suffered a profit dip, but was offset by one-offs from the firm owning 8% of Google. Their film division took a hit from box office flops, but for the year their profits were still up 27% with revenue growth at 6.4%, offering 5% earnigs growth projections in the coming\"" 287 | ] 288 | }, 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "# Request API\n", 296 | "response = openai.Completion.create(\n", 297 | " deployment_id=\"text-davinci-003\", # has to be deployment_id\n", 298 | " prompt=prompt,\n", 299 | " temperature=1,\n", 300 | " max_tokens=100,\n", 301 | " top_p=1.0,\n", 302 | " frequency_penalty=0.0,\n", 303 | " presence_penalty=1\n", 304 | ")\n", 305 | "\n", 306 | "# print response\n", 307 | "response['choices'][0]['text']" 308 | ] 309 | }, 310 | { 311 | "attachments": {}, 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "----------------" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 14, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "762" 327 | ] 328 | }, 329 | "execution_count": 14, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | }, 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 4416 tokens (4316 in your prompt; 100 for the completion). 
Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 338 | ] 339 | }, 340 | { 341 | "data": { 342 | "text/plain": [ 343 | "1185" 344 | ] 345 | }, 346 | "execution_count": 14, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | }, 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 5383 tokens (5283 in your prompt; 100 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "results = pd.DataFrame(columns=['summary'], index=df.index)\n", 360 | "\n", 361 | "# prompt postfix\n", 362 | "prompt_postfix = \"\"\" \n", 363 | " \\n\\nTl;dr\n", 364 | "\"\"\"\n", 365 | "\n", 366 | "for idx, title, content in zip(df.index.values, df['title'].loc[df.index.values], df['content'].loc[df.index.values]):\n", 367 | " \n", 368 | " # build prompt\n", 369 | " prompt = title + \"\\n\" + content + prompt_postfix\n", 370 | "\n", 371 | " try:\n", 372 | " # Request API\n", 373 | " response = openai.Completion.create(\n", 374 | " deployment_id=\"text-davinci-003\", # has to be deployment_id\n", 375 | " prompt=prompt,\n", 376 | " temperature=1,\n", 377 | " max_tokens=100,\n", 378 | " top_p=1.0,\n", 379 | " frequency_penalty=0.0,\n", 380 | " presence_penalty=1\n", 381 | " )\n", 382 | "\n", 383 | " # response\n", 384 | " results['summary'].loc[idx] = response['choices'][0]['text']\n", 385 | " except Exception as err:\n", 386 | " idx\n", 387 | " print(f\"Unexpected {err=}, {type(err)=}\")" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 15, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/html": [ 398 | "
\n", 399 | "\n", 412 | "\n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | "
summary
0Time Warner's quarterly profits surged 76% to ...
1The dollar has recently reached its highest le...
2Yukos's owners are demanding repayment of a $9...
3British Airways reported a 40% drop in profits...
4Shares of Allied Domecq rose on speculation th...
......
2220BT is introducing two initiatives to protect ...
2221A new report shows that many computer users ac...
2222\\nIf the new European Directive on the Patenta...
2223Amit Yoran has resigned from his post as direc...
2224Online gaming can be an unhealthy obsession fo...
\n", 466 | "

2225 rows × 1 columns

\n", 467 | "
" 468 | ], 469 | "text/plain": [ 470 | " summary\n", 471 | "0 Time Warner's quarterly profits surged 76% to ...\n", 472 | "1 The dollar has recently reached its highest le...\n", 473 | "2 Yukos's owners are demanding repayment of a $9...\n", 474 | "3 British Airways reported a 40% drop in profits...\n", 475 | "4 Shares of Allied Domecq rose on speculation th...\n", 476 | "... ...\n", 477 | "2220 BT is introducing two initiatives to protect ...\n", 478 | "2221 A new report shows that many computer users ac...\n", 479 | "2222 \\nIf the new European Directive on the Patenta...\n", 480 | "2223 Amit Yoran has resigned from his post as direc...\n", 481 | "2224 Online gaming can be an unhealthy obsession fo...\n", 482 | "\n", 483 | "[2225 rows x 1 columns]" 484 | ] 485 | }, 486 | "execution_count": 15, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "results" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 16, 498 | "metadata": {}, 499 | "outputs": [ 500 | { 501 | "data": { 502 | "text/plain": [ 503 | "(2225, 5)" 504 | ] 505 | }, 506 | "execution_count": 16, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | }, 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 527 | "\n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | "
categoryfilenametitlecontentsummary
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...Time Warner's quarterly profits surged 76% to ...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...The dollar has recently reached its highest le...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...Yukos's owners are demanding repayment of a $9...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...British Airways reported a 40% drop in profits...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...Shares of Allied Domecq rose on speculation th...
..................
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...BT is introducing two initiatives to protect ...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...A new report shows that many computer users ac...
2222tech399.txtBe careful how you codeA new European directive could put software w...\\nIf the new European Directive on the Patenta...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...Amit Yoran has resigned from his post as direc...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...Online gaming can be an unhealthy obsession fo...
\n", 629 | "

2225 rows × 5 columns

\n", 630 | "
" 631 | ], 632 | "text/plain": [ 633 | " category filename title \\\n", 634 | "0 business 001.txt Ad sales boost Time Warner profit \n", 635 | "1 business 002.txt Dollar gains on Greenspan speech \n", 636 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 637 | "3 business 004.txt High fuel prices hit BA's profits \n", 638 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 639 | "... ... ... ... \n", 640 | "2220 tech 397.txt BT program to beat dialler scams \n", 641 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 642 | "2222 tech 399.txt Be careful how you code \n", 643 | "2223 tech 400.txt US cyber security chief resigns \n", 644 | "2224 tech 401.txt Losing yourself in online gaming \n", 645 | "\n", 646 | " content \\\n", 647 | "0 Quarterly profits at US media giant TimeWarne... \n", 648 | "1 The dollar has hit its highest level against ... \n", 649 | "2 The owners of embattled Russian oil giant Yuk... \n", 650 | "3 British Airways has blamed high fuel prices f... \n", 651 | "4 Shares in UK drinks and food firm Allied Dome... \n", 652 | "... ... \n", 653 | "2220 BT is introducing two initiatives to help bea... \n", 654 | "2221 Computer users across the world continue to i... \n", 655 | "2222 A new European directive could put software w... \n", 656 | "2223 The man making sure US computer networks are ... \n", 657 | "2224 Online role playing games are time-consuming,... \n", 658 | "\n", 659 | " summary \n", 660 | "0 Time Warner's quarterly profits surged 76% to ... \n", 661 | "1 The dollar has recently reached its highest le... \n", 662 | "2 Yukos's owners are demanding repayment of a $9... \n", 663 | "3 British Airways reported a 40% drop in profits... \n", 664 | "4 Shares of Allied Domecq rose on speculation th... \n", 665 | "... ... \n", 666 | "2220 BT is introducing two initiatives to protect ... \n", 667 | "2221 A new report shows that many computer users ac... \n", 668 | "2222 \\nIf the new European Directive on the Patenta... 
\n", 669 | "2223 Amit Yoran has resigned from his post as direc... \n", 670 | "2224 Online gaming can be an unhealthy obsession fo... \n", 671 | "\n", 672 | "[2225 rows x 5 columns]" 673 | ] 674 | }, 675 | "execution_count": 16, 676 | "metadata": {}, 677 | "output_type": "execute_result" 678 | } 679 | ], 680 | "source": [ 681 | "df_results = pd.concat([df, results], axis=1)\n", 682 | "df_results.shape\n", 683 | "df_results" 684 | ] 685 | }, 686 | { 687 | "attachments": {}, 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "## Save results" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 18, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "SAVE = False\n", 701 | "\n", 702 | "if SAVE:\n", 703 | " fname = '../output/summaries.csv'\n", 704 | " df_results.to_csv(fname, sep='\\t')" 705 | ] 706 | } 707 | ], 708 | "metadata": { 709 | "kernelspec": { 710 | "display_name": "azureml_py38", 711 | "language": "python", 712 | "name": "python3" 713 | }, 714 | "language_info": { 715 | "codemirror_mode": { 716 | "name": "ipython", 717 | "version": 3 718 | }, 719 | "file_extension": ".py", 720 | "mimetype": "text/x-python", 721 | "name": "python", 722 | "nbconvert_exporter": "python", 723 | "pygments_lexer": "ipython3", 724 | "version": "3.8.5" 725 | }, 726 | "orig_nbformat": 4, 727 | "vscode": { 728 | "interpreter": { 729 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 730 | } 731 | } 732 | }, 733 | "nbformat": 4, 734 | "nbformat_minor": 2 735 | } 736 | -------------------------------------------------------------------------------- /notebooks/05-extract-key-information.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Extract Key Information" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | 
"metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "attachments": {}, 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Set up Azure OpenAI" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "True" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "import os\n", 50 | "import openai\n", 51 | "from dotenv import load_dotenv\n", 52 | "\n", 53 | "# Set up Azure OpenAI\n", 54 | "load_dotenv()\n", 55 | "openai.api_type = \"azure\"\n", 56 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 57 | "openai.api_version = \"2022-12-01\"\n", 58 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 59 | ] 60 | }, 61 | { 62 | "attachments": {}, 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Load Data" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import pandas as pd\n", 76 | "\n", 77 | "df_orig = pd.read_csv(\"../data/bbc-news-data.csv\", delimiter='\\t', index_col=False)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | "
categoryfilenametitlecontent
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...
...............
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...
2222tech399.txtBe careful how you codeA new European directive could put software w...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...
\n", 192 | "

2225 rows × 4 columns

\n", 193 | "
" 194 | ], 195 | "text/plain": [ 196 | " category filename title \\\n", 197 | "0 business 001.txt Ad sales boost Time Warner profit \n", 198 | "1 business 002.txt Dollar gains on Greenspan speech \n", 199 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 200 | "3 business 004.txt High fuel prices hit BA's profits \n", 201 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 202 | "... ... ... ... \n", 203 | "2220 tech 397.txt BT program to beat dialler scams \n", 204 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 205 | "2222 tech 399.txt Be careful how you code \n", 206 | "2223 tech 400.txt US cyber security chief resigns \n", 207 | "2224 tech 401.txt Losing yourself in online gaming \n", 208 | "\n", 209 | " content \n", 210 | "0 Quarterly profits at US media giant TimeWarne... \n", 211 | "1 The dollar has hit its highest level against ... \n", 212 | "2 The owners of embattled Russian oil giant Yuk... \n", 213 | "3 British Airways has blamed high fuel prices f... \n", 214 | "4 Shares in UK drinks and food firm Allied Dome... \n", 215 | "... ... \n", 216 | "2220 BT is introducing two initiatives to help bea... \n", 217 | "2221 Computer users across the world continue to i... \n", 218 | "2222 A new European directive could put software w... \n", 219 | "2223 The man making sure US computer networks are ... \n", 220 | "2224 Online role playing games are time-consuming,... 
\n", 221 | "\n", 222 | "[2225 rows x 4 columns]" 223 | ] 224 | }, 225 | "execution_count": 4, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "df = df_orig.copy()\n", 232 | "df" 233 | ] 234 | }, 235 | { 236 | "attachments": {}, 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Request to API" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 5, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | " \n", 253 | " Extract key information from this text\n", 254 | "Ad sales boost Time Warner profit\n", 255 | " Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding. Time Warner's fourth quarter profits were slightly better than analysts' expectations. 
But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. \"Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility,\" chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake. 
\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "# create prompt\n", 261 | "prompt_prefix = \"\"\" \n", 262 | " Extract key information from this text\n", 263 | "\"\"\"\n", 264 | "\n", 265 | "prompt = prompt_prefix + df['title'].loc[0] + \"\\n\" + df['content'].loc[0]\n", 266 | "print(prompt)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "\n", 279 | "\n", 280 | "Key Information: \n", 281 | "- Quarterly profits at Time Warner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier\n", 282 | "- Sales of high-speed internet connections and higher advert sales boosted profits \n", 283 | "- AOL lost 464,000 subscribers in the fourth quarter but underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues \n", 284 | "- Time Warner now owns 8% of Google \n", 285 | "- Film division saw profits slump 27%, helped by box office flops Alexander and Catwoman \n", 286 | "- For full year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance; revenues grew 6.4% to $42.09bn \n", 287 | "- Projecting operating earnings growth of around 5%, with higher revenue and wider profit margins for 2005 \n", 288 | "- Intends to adjust way it accounts for deal with German music publisher Bertelsmann's purchase of stake in AOL Europe; will book sale as loss on value\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "# Request API\n", 294 | "response = openai.Completion.create(\n", 295 | " deployment_id=\"text-davinci-003\", \n", 296 | " prompt=prompt,\n", 297 | " temperature=0,\n", 298 | " max_tokens=1000,\n", 299 | " top_p=0.95,\n", 300 | " frequency_penalty=1,\n", 301 | " presence_penalty=1\n", 302 | ")\n", 303 | "\n", 304 | "# print response\n", 305 | "print(response['choices'][0]['text'])" 306 | ] 307 | }, 308 | { 309 | "cell_type": 
"code", 310 | "execution_count": 7, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "624" 317 | ] 318 | }, 319 | "execution_count": 7, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | }, 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 328 | ] 329 | }, 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "644" 334 | ] 335 | }, 336 | "execution_count": 7, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | }, 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 345 | ] 346 | }, 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "678" 351 | ] 352 | }, 353 | "execution_count": 7, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | }, 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. 
To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 362 | ] 363 | }, 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "762" 368 | ] 369 | }, 370 | "execution_count": 7, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | }, 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 5317 tokens (4317 in your prompt; 1000 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 379 | ] 380 | }, 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "884" 385 | ] 386 | }, 387 | "execution_count": 7, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | }, 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 396 | ] 397 | }, 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "913" 402 | ] 403 | }, 404 | "execution_count": 7, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | }, 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. 
To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 413 | ] 414 | }, 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "995" 419 | ] 420 | }, 421 | "execution_count": 7, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | }, 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 430 | ] 431 | }, 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "1015" 436 | ] 437 | }, 438 | "execution_count": 7, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 447 | ] 448 | }, 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "1020" 453 | ] 454 | }, 455 | "execution_count": 7, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | }, 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. 
Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 464 | ] 465 | }, 466 | { 467 | "data": { 468 | "text/plain": [ 469 | "1035" 470 | ] 471 | }, 472 | "execution_count": 7, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | }, 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 481 | ] 482 | }, 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "1039" 487 | ] 488 | }, 489 | "execution_count": 7, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | }, 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. 
To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 498 | ] 499 | }, 500 | { 501 | "data": { 502 | "text/plain": [ 503 | "1049" 504 | ] 505 | }, 506 | "execution_count": 7, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | }, 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 515 | ] 516 | }, 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "1052" 521 | ] 522 | }, 523 | "execution_count": 7, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | }, 527 | { 528 | "name": "stdout", 529 | "output_type": "stream", 530 | "text": [ 531 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 532 | ] 533 | }, 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "1109" 538 | ] 539 | }, 540 | "execution_count": 7, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | }, 544 | { 545 | "name": "stdout", 546 | "output_type": "stream", 547 | "text": [ 548 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. 
Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 549 | ] 550 | }, 551 | { 552 | "data": { 553 | "text/plain": [ 554 | "1127" 555 | ] 556 | }, 557 | "execution_count": 7, 558 | "metadata": {}, 559 | "output_type": "execute_result" 560 | }, 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 566 | ] 567 | }, 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "1154" 572 | ] 573 | }, 574 | "execution_count": 7, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | }, 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. 
To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 583 | ] 584 | }, 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "1181" 589 | ] 590 | }, 591 | "execution_count": 7, 592 | "metadata": {}, 593 | "output_type": "execute_result" 594 | }, 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 600 | ] 601 | }, 602 | { 603 | "data": { 604 | "text/plain": [ 605 | "1185" 606 | ] 607 | }, 608 | "execution_count": 7, 609 | "metadata": {}, 610 | "output_type": "execute_result" 611 | }, 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 6284 tokens (5284 in your prompt; 1000 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 617 | ] 618 | }, 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "1275" 623 | ] 624 | }, 625 | "execution_count": 7, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | }, 629 | { 630 | "name": "stdout", 631 | "output_type": "stream", 632 | "text": [ 633 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 4861 tokens (3861 in your prompt; 1000 for the completion). 
Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 634 | ] 635 | }, 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "1941" 640 | ] 641 | }, 642 | "execution_count": 7, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | }, 646 | { 647 | "name": "stdout", 648 | "output_type": "stream", 649 | "text": [ 650 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 651 | ] 652 | }, 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "2033" 657 | ] 658 | }, 659 | "execution_count": 7, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | }, 663 | { 664 | "name": "stdout", 665 | "output_type": "stream", 666 | "text": [ 667 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 668 | ] 669 | }, 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "2210" 674 | ] 675 | }, 676 | "execution_count": 7, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | }, 680 | { 681 | "name": "stdout", 682 | "output_type": "stream", 683 | "text": [ 684 | "Unexpected err=InvalidRequestError(message='The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. 
To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766', param='prompt', code='content_filter', http_status=400, request_id=None), type(err)=\n" 685 | ] 686 | }, 687 | { 688 | "data": { 689 | "text/plain": [ 690 | "2224" 691 | ] 692 | }, 693 | "execution_count": 7, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | }, 697 | { 698 | "name": "stdout", 699 | "output_type": "stream", 700 | "text": [ 701 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 4560 tokens (3560 in your prompt; 1000 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 702 | ] 703 | } 704 | ], 705 | "source": [ 706 | "colname = 'key_info'\n", 707 | "results = pd.DataFrame(columns=[colname], index=df.index)\n", 708 | "\n", 709 | "prompt_prefix = \"\"\" \n", 710 | " Extract key information from this text\n", 711 | "\"\"\"\n", 712 | "\n", 713 | "for idx, title, content in zip(df.index.values, df['title'].loc[df.index.values], df['content'].loc[df.index.values]):\n", 714 | " \n", 715 | " # build prompt\n", 716 | " prompt = prompt_prefix + title + \"\\n\" + content\n", 717 | "\n", 718 | " try:\n", 719 | " # Request API\n", 720 | " response = openai.Completion.create(\n", 721 | " deployment_id=\"text-davinci-003\", \n", 722 | " prompt=prompt,\n", 723 | " temperature=0,\n", 724 | " max_tokens=1000,\n", 725 | " top_p=0.95,\n", 726 | " frequency_penalty=1,\n", 727 | " presence_penalty=1\n", 728 | " )\n", 729 | "\n", 730 | " # response\n", 731 | " results[colname].loc[idx] = response['choices'][0]['text']\n", 732 | " except Exception as err:\n", 733 | " idx\n", 734 | " print(f\"Unexpected {err=}, {type(err)=}\")" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 8, 740 | "metadata": {}, 741 | "outputs": [ 742 | { 743 | "data": { 
744 | "text/html": [ 745 | "
\n", 746 | "\n", 759 | "\n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | "
key_info
0\\n\\nKey Information: \\n- Quarterly profits at ...
1\\n\\nKey Information: \\n- Dollar has hit highes...
2\\n\\nKey Information: \\n- Menatep Group is aski...
3\\n\\n- British Airways reported a 40% drop in p...
4\\n\\nKey Information: \\n- Pernod Ricard is cons...
......
2220\\n\\nKey Information: \\n- BT is introducing two...
2221\\n\\nKey Information: \\n- More than a quarter o...
2222\\n\\nKey Information: \\n- Former programmer and...
2223\\n\\nKey Information: \\n- Amit Yoran resigned a...
2224NaN
\n", 813 | "

2225 rows × 1 columns

\n", 814 | "
" 815 | ], 816 | "text/plain": [ 817 | " key_info\n", 818 | "0 \\n\\nKey Information: \\n- Quarterly profits at ...\n", 819 | "1 \\n\\nKey Information: \\n- Dollar has hit highes...\n", 820 | "2 \\n\\nKey Information: \\n- Menatep Group is aski...\n", 821 | "3 \\n\\n- British Airways reported a 40% drop in p...\n", 822 | "4 \\n\\nKey Information: \\n- Pernod Ricard is cons...\n", 823 | "... ...\n", 824 | "2220 \\n\\nKey Information: \\n- BT is introducing two...\n", 825 | "2221 \\n\\nKey Information: \\n- More than a quarter o...\n", 826 | "2222 \\n\\nKey Information: \\n- Former programmer and...\n", 827 | "2223 \\n\\nKey Information: \\n- Amit Yoran resigned a...\n", 828 | "2224 NaN\n", 829 | "\n", 830 | "[2225 rows x 1 columns]" 831 | ] 832 | }, 833 | "execution_count": 8, 834 | "metadata": {}, 835 | "output_type": "execute_result" 836 | } 837 | ], 838 | "source": [ 839 | "results" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 9, 845 | "metadata": {}, 846 | "outputs": [ 847 | { 848 | "data": { 849 | "text/plain": [ 850 | "(2225, 5)" 851 | ] 852 | }, 853 | "execution_count": 9, 854 | "metadata": {}, 855 | "output_type": "execute_result" 856 | }, 857 | { 858 | "data": { 859 | "text/html": [ 860 | "
\n", 861 | "\n", 874 | "\n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | "
categoryfilenametitlecontentkey_info
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...\\n\\nKey Information: \\n- Quarterly profits at ...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...\\n\\nKey Information: \\n- Dollar has hit highes...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...\\n\\nKey Information: \\n- Menatep Group is aski...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...\\n\\n- British Airways reported a 40% drop in p...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...\\n\\nKey Information: \\n- Pernod Ricard is cons...
..................
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...\\n\\nKey Information: \\n- BT is introducing two...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...\\n\\nKey Information: \\n- More than a quarter o...
2222tech399.txtBe careful how you codeA new European directive could put software w...\\n\\nKey Information: \\n- Former programmer and...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...\\n\\nKey Information: \\n- Amit Yoran resigned a...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...NaN
\n", 976 | "

2225 rows × 5 columns

\n", 977 | "
" 978 | ], 979 | "text/plain": [ 980 | " category filename title \\\n", 981 | "0 business 001.txt Ad sales boost Time Warner profit \n", 982 | "1 business 002.txt Dollar gains on Greenspan speech \n", 983 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 984 | "3 business 004.txt High fuel prices hit BA's profits \n", 985 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 986 | "... ... ... ... \n", 987 | "2220 tech 397.txt BT program to beat dialler scams \n", 988 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 989 | "2222 tech 399.txt Be careful how you code \n", 990 | "2223 tech 400.txt US cyber security chief resigns \n", 991 | "2224 tech 401.txt Losing yourself in online gaming \n", 992 | "\n", 993 | " content \\\n", 994 | "0 Quarterly profits at US media giant TimeWarne... \n", 995 | "1 The dollar has hit its highest level against ... \n", 996 | "2 The owners of embattled Russian oil giant Yuk... \n", 997 | "3 British Airways has blamed high fuel prices f... \n", 998 | "4 Shares in UK drinks and food firm Allied Dome... \n", 999 | "... ... \n", 1000 | "2220 BT is introducing two initiatives to help bea... \n", 1001 | "2221 Computer users across the world continue to i... \n", 1002 | "2222 A new European directive could put software w... \n", 1003 | "2223 The man making sure US computer networks are ... \n", 1004 | "2224 Online role playing games are time-consuming,... \n", 1005 | "\n", 1006 | " key_info \n", 1007 | "0 \\n\\nKey Information: \\n- Quarterly profits at ... \n", 1008 | "1 \\n\\nKey Information: \\n- Dollar has hit highes... \n", 1009 | "2 \\n\\nKey Information: \\n- Menatep Group is aski... \n", 1010 | "3 \\n\\n- British Airways reported a 40% drop in p... \n", 1011 | "4 \\n\\nKey Information: \\n- Pernod Ricard is cons... \n", 1012 | "... ... \n", 1013 | "2220 \\n\\nKey Information: \\n- BT is introducing two... \n", 1014 | "2221 \\n\\nKey Information: \\n- More than a quarter o... 
\n", 1015 | "2222 \\n\\nKey Information: \\n- Former programmer and... \n", 1016 | "2223 \\n\\nKey Information: \\n- Amit Yoran resigned a... \n", 1017 | "2224 NaN \n", 1018 | "\n", 1019 | "[2225 rows x 5 columns]" 1020 | ] 1021 | }, 1022 | "execution_count": 9, 1023 | "metadata": {}, 1024 | "output_type": "execute_result" 1025 | } 1026 | ], 1027 | "source": [ 1028 | "df_results = pd.concat([df, results], axis=1)\n", 1029 | "df_results.shape\n", 1030 | "df_results" 1031 | ] 1032 | }, 1033 | { 1034 | "attachments": {}, 1035 | "cell_type": "markdown", 1036 | "metadata": {}, 1037 | "source": [ 1038 | "## Save results" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "code", 1043 | "execution_count": 11, 1044 | "metadata": {}, 1045 | "outputs": [], 1046 | "source": [ 1047 | "if False:\n", 1048 | " fname = '../output/key_info.csv'\n", 1049 | " df_results.to_csv(fname, sep='\\t')" 1050 | ] 1051 | } 1052 | ], 1053 | "metadata": { 1054 | "kernelspec": { 1055 | "display_name": "azureml_py38", 1056 | "language": "python", 1057 | "name": "python3" 1058 | }, 1059 | "language_info": { 1060 | "codemirror_mode": { 1061 | "name": "ipython", 1062 | "version": 3 1063 | }, 1064 | "file_extension": ".py", 1065 | "mimetype": "text/x-python", 1066 | "name": "python", 1067 | "nbconvert_exporter": "python", 1068 | "pygments_lexer": "ipython3", 1069 | "version": "3.8.5" 1070 | }, 1071 | "orig_nbformat": 4, 1072 | "vscode": { 1073 | "interpreter": { 1074 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 1075 | } 1076 | } 1077 | }, 1078 | "nbformat": 4, 1079 | "nbformat_minor": 2 1080 | } 1081 | -------------------------------------------------------------------------------- /notebooks/06-extract-key-words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Extract Key Words" 9 | ] 10 | }, 11 | { 12 | 
"cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "attachments": {}, 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Set up Azure OpenAI" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "True" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "import os\n", 50 | "import openai\n", 51 | "from dotenv import load_dotenv\n", 52 | "\n", 53 | "# Set up Azure OpenAI\n", 54 | "load_dotenv()\n", 55 | "openai.api_type = \"azure\"\n", 56 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 57 | "openai.api_version = \"2022-12-01\"\n", 58 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 59 | ] 60 | }, 61 | { 62 | "attachments": {}, 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Load Data" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import pandas as pd\n", 76 | "\n", 77 | "df_orig = pd.read_csv(\"../data/bbc-news-data.csv\", delimiter='\\t', index_col=False)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | "
categoryfilenametitlecontent
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...
...............
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...
2222tech399.txtBe careful how you codeA new European directive could put software w...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...
\n", 192 | "

2225 rows × 4 columns

\n", 193 | "
" 194 | ], 195 | "text/plain": [ 196 | " category filename title \\\n", 197 | "0 business 001.txt Ad sales boost Time Warner profit \n", 198 | "1 business 002.txt Dollar gains on Greenspan speech \n", 199 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 200 | "3 business 004.txt High fuel prices hit BA's profits \n", 201 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 202 | "... ... ... ... \n", 203 | "2220 tech 397.txt BT program to beat dialler scams \n", 204 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 205 | "2222 tech 399.txt Be careful how you code \n", 206 | "2223 tech 400.txt US cyber security chief resigns \n", 207 | "2224 tech 401.txt Losing yourself in online gaming \n", 208 | "\n", 209 | " content \n", 210 | "0 Quarterly profits at US media giant TimeWarne... \n", 211 | "1 The dollar has hit its highest level against ... \n", 212 | "2 The owners of embattled Russian oil giant Yuk... \n", 213 | "3 British Airways has blamed high fuel prices f... \n", 214 | "4 Shares in UK drinks and food firm Allied Dome... \n", 215 | "... ... \n", 216 | "2220 BT is introducing two initiatives to help bea... \n", 217 | "2221 Computer users across the world continue to i... \n", 218 | "2222 A new European directive could put software w... \n", 219 | "2223 The man making sure US computer networks are ... \n", 220 | "2224 Online role playing games are time-consuming,... 
\n", 221 | "\n", 222 | "[2225 rows x 4 columns]" 223 | ] 224 | }, 225 | "execution_count": 4, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "df = df_orig.copy()\n", 232 | "df" 233 | ] 234 | }, 235 | { 236 | "attachments": {}, 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Request to API" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 5, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "' \\n Extract keywords from this text\\nAd sales boost Time Warner profit\\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\\'s existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding. Time Warner\\'s fourth quarter profits were slightly better than analysts\\' expectations. 
But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. \"Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility,\" chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann\\'s purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake. 
'" 252 | ] 253 | }, 254 | "execution_count": 5, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "# create prompt\n", 261 | "prompt_prefix = \"\"\" \n", 262 | " Extract keywords from this text\n", 263 | "\"\"\"\n", 264 | "\n", 265 | "prompt = prompt_prefix + df['title'].loc[0] + \"\\n\" + df['content'].loc[0]\n", 266 | "prompt" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "'\\n\\nKeywords: Time Warner, profit, quarterly, Google, AOL, internet, advertising, subscribers, Securities Exchange Commission, Lord of the Rings, box office, Richard Parsons, restate accounts, AOL Europe, Bertelsmann, advertising revenue.'" 278 | ] 279 | }, 280 | "execution_count": 6, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "# Request API\n", 287 | "response = openai.Completion.create(\n", 288 | " deployment_id=\"text-davinci-003\", # has to be deployment_id\n", 289 | " prompt=prompt,\n", 290 | " temperature=1,\n", 291 | " max_tokens=100,\n", 292 | " top_p=1.0,\n", 293 | " frequency_penalty=0.0,\n", 294 | " presence_penalty=0\n", 295 | ")\n", 296 | "\n", 297 | "# print response\n", 298 | "response['choices'][0]['text']" 299 | ] 300 | }, 301 | { 302 | "attachments": {}, 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "----------------" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 7, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "762" 318 | ] 319 | }, 320 | "execution_count": 7, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | }, 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 4417 
tokens (4317 in your prompt; 100 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 329 | ] 330 | }, 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "1185" 335 | ] 336 | }, 337 | "execution_count": 7, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "Unexpected err=InvalidRequestError(message=\"This model's maximum context length is 4097 tokens, however you requested 5384 tokens (5284 in your prompt; 100 for the completion). Please reduce your prompt; or completion length.\", param=None, code=None, http_status=400, request_id=None), type(err)=\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "colname = 'keywords'\n", 351 | "results = pd.DataFrame(columns=[colname], index=df.index)\n", 352 | "\n", 353 | "prompt_prefix = \"\"\" \n", 354 | " Extract key words from this text\n", 355 | "\"\"\"\n", 356 | "\n", 357 | "for idx, title, content in zip(df.index.values, df['title'].loc[df.index.values], df['content'].loc[df.index.values]):\n", 358 | " \n", 359 | " # build prompt\n", 360 | " prompt = prompt_prefix + title + \"\\n\" + content\n", 361 | "\n", 362 | " try:\n", 363 | " # Request API\n", 364 | " response = openai.Completion.create(\n", 365 | " deployment_id=\"text-davinci-003\", # has to be deployment_id\n", 366 | " prompt=prompt,\n", 367 | " temperature=1,\n", 368 | " max_tokens=100,\n", 369 | " top_p=1.0,\n", 370 | " frequency_penalty=0.0,\n", 371 | " presence_penalty=1\n", 372 | " )\n", 373 | "\n", 374 | " # response\n", 375 | " results[colname].loc[idx] = response['choices'][0]['text']\n", 376 | " except Exception as err:\n", 377 | " idx\n", 378 | " print(f\"Unexpected {err=}, {type(err)=}\")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 8, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/html": [ 389 
| "
\n", 390 | "\n", 403 | "\n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | "
keywords
0\\n\\nKey words: Time Warner, Quarterly Profit, ...
1\\n\\nKey Words: \\nDollar, Euro, Federal Reserve...
2\\n\\nKey words: \\nYukos, Rosneft, Yugansk, Mena...
3\\n\\nKey words: British Airways, Fuel Prices, P...
4\\n\\n•Pernod Ricard •Allied Domecq •Wall Street...
......
2220\\n\\n- BT Modem Protection Program \\n- rogue di...
2221\\n\\nKeywords: spam, e-mails, security warnings...
2222\\n\\nKey words: software, legal action, Europea...
2223\\n\\n1. US cyber security \\n2. Amit Yoran \\n3. ...
2224\\n\\nOnline gaming, addiction, MMORPGS, Pong, o...
\n", 457 | "

2225 rows × 1 columns

\n", 458 | "
" 459 | ], 460 | "text/plain": [ 461 | " keywords\n", 462 | "0 \\n\\nKey words: Time Warner, Quarterly Profit, ...\n", 463 | "1 \\n\\nKey Words: \\nDollar, Euro, Federal Reserve...\n", 464 | "2 \\n\\nKey words: \\nYukos, Rosneft, Yugansk, Mena...\n", 465 | "3 \\n\\nKey words: British Airways, Fuel Prices, P...\n", 466 | "4 \\n\\n•Pernod Ricard •Allied Domecq •Wall Street...\n", 467 | "... ...\n", 468 | "2220 \\n\\n- BT Modem Protection Program \\n- rogue di...\n", 469 | "2221 \\n\\nKeywords: spam, e-mails, security warnings...\n", 470 | "2222 \\n\\nKey words: software, legal action, Europea...\n", 471 | "2223 \\n\\n1. US cyber security \\n2. Amit Yoran \\n3. ...\n", 472 | "2224 \\n\\nOnline gaming, addiction, MMORPGS, Pong, o...\n", 473 | "\n", 474 | "[2225 rows x 1 columns]" 475 | ] 476 | }, 477 | "execution_count": 8, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "results" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 9, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "(2225, 5)" 495 | ] 496 | }, 497 | "execution_count": 9, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | }, 501 | { 502 | "data": { 503 | "text/html": [ 504 | "
\n", 505 | "\n", 518 | "\n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | "
categoryfilenametitlecontentkeywords
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...\\n\\nKey words: Time Warner, Quarterly Profit, ...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...\\n\\nKey Words: \\nDollar, Euro, Federal Reserve...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...\\n\\nKey words: \\nYukos, Rosneft, Yugansk, Mena...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...\\n\\nKey words: British Airways, Fuel Prices, P...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...\\n\\n•Pernod Ricard •Allied Domecq •Wall Street...
..................
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...\\n\\n- BT Modem Protection Program \\n- rogue di...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...\\n\\nKeywords: spam, e-mails, security warnings...
2222tech399.txtBe careful how you codeA new European directive could put software w...\\n\\nKey words: software, legal action, Europea...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...\\n\\n1. US cyber security \\n2. Amit Yoran \\n3. ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...\\n\\nOnline gaming, addiction, MMORPGS, Pong, o...
\n", 620 | "

2225 rows × 5 columns

\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " category filename title \\\n", 625 | "0 business 001.txt Ad sales boost Time Warner profit \n", 626 | "1 business 002.txt Dollar gains on Greenspan speech \n", 627 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 628 | "3 business 004.txt High fuel prices hit BA's profits \n", 629 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 630 | "... ... ... ... \n", 631 | "2220 tech 397.txt BT program to beat dialler scams \n", 632 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 633 | "2222 tech 399.txt Be careful how you code \n", 634 | "2223 tech 400.txt US cyber security chief resigns \n", 635 | "2224 tech 401.txt Losing yourself in online gaming \n", 636 | "\n", 637 | " content \\\n", 638 | "0 Quarterly profits at US media giant TimeWarne... \n", 639 | "1 The dollar has hit its highest level against ... \n", 640 | "2 The owners of embattled Russian oil giant Yuk... \n", 641 | "3 British Airways has blamed high fuel prices f... \n", 642 | "4 Shares in UK drinks and food firm Allied Dome... \n", 643 | "... ... \n", 644 | "2220 BT is introducing two initiatives to help bea... \n", 645 | "2221 Computer users across the world continue to i... \n", 646 | "2222 A new European directive could put software w... \n", 647 | "2223 The man making sure US computer networks are ... \n", 648 | "2224 Online role playing games are time-consuming,... \n", 649 | "\n", 650 | " keywords \n", 651 | "0 \\n\\nKey words: Time Warner, Quarterly Profit, ... \n", 652 | "1 \\n\\nKey Words: \\nDollar, Euro, Federal Reserve... \n", 653 | "2 \\n\\nKey words: \\nYukos, Rosneft, Yugansk, Mena... \n", 654 | "3 \\n\\nKey words: British Airways, Fuel Prices, P... \n", 655 | "4 \\n\\n•Pernod Ricard •Allied Domecq •Wall Street... \n", 656 | "... ... \n", 657 | "2220 \\n\\n- BT Modem Protection Program \\n- rogue di... \n", 658 | "2221 \\n\\nKeywords: spam, e-mails, security warnings... 
\n", 659 | "2222 \\n\\nKey words: software, legal action, Europea... \n", 660 | "2223 \\n\\n1. US cyber security \\n2. Amit Yoran \\n3. ... \n", 661 | "2224 \\n\\nOnline gaming, addiction, MMORPGS, Pong, o... \n", 662 | "\n", 663 | "[2225 rows x 5 columns]" 664 | ] 665 | }, 666 | "execution_count": 9, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "df_results = pd.concat([df, results], axis=1)\n", 673 | "df_results.shape\n", 674 | "df_results" 675 | ] 676 | }, 677 | { 678 | "attachments": {}, 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "## Save results" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 10, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "SAVE = False\n", 692 | "\n", 693 | "if SAVE:\n", 694 | " fname = '../output/keywords.csv'\n", 695 | " df_results.to_csv(fname, sep='\\t')" 696 | ] 697 | } 698 | ], 699 | "metadata": { 700 | "kernelspec": { 701 | "display_name": "azureml_py38", 702 | "language": "python", 703 | "name": "python3" 704 | }, 705 | "language_info": { 706 | "codemirror_mode": { 707 | "name": "ipython", 708 | "version": 3 709 | }, 710 | "file_extension": ".py", 711 | "mimetype": "text/x-python", 712 | "name": "python", 713 | "nbconvert_exporter": "python", 714 | "pygments_lexer": "ipython3", 715 | "version": "3.8.5" 716 | }, 717 | "orig_nbformat": 4, 718 | "vscode": { 719 | "interpreter": { 720 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 721 | } 722 | } 723 | }, 724 | "nbformat": 4, 725 | "nbformat_minor": 2 726 | } 727 | -------------------------------------------------------------------------------- /notebooks/07-semantic-search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Semantic Search on Specific Data 
Corpus\n", 9 | "Query files within a specific corpus. " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%load_ext autoreload\n", 19 | "%autoreload 2\n", 20 | "\n", 21 | "from IPython.core.interactiveshell import InteractiveShell\n", 22 | "InteractiveShell.ast_node_interactivity = \"all\"" 23 | ] 24 | }, 25 | { 26 | "attachments": {}, 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Set up Azure OpenAI" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "True" 42 | ] 43 | }, 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "import os\n", 51 | "import openai\n", 52 | "from dotenv import load_dotenv\n", 53 | "\n", 54 | "# Set up Azure OpenAI\n", 55 | "load_dotenv()\n", 56 | "openai.api_type = \"azure\"\n", 57 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 58 | "openai.api_version = \"2022-12-01\"\n", 59 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 60 | ] 61 | }, 62 | { 63 | "attachments": {}, 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Deploy a Language Model" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Found a succeeded deployment that supports embeddings with id: deployment-89153abdfa934e1580296dbee586239b.\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# list models deployed with embeddings capability\n", 85 | "deployment_id = None\n", 86 | "result = openai.Deployment.list()\n", 87 | "\n", 88 | "for deployment in result.data:\n", 89 | " if deployment[\"status\"] != \"succeeded\":\n", 90 | " continue\n", 91 | " \n", 92 | " model = 
openai.Model.retrieve(deployment[\"model\"])\n", 93 | " if model[\"capabilities\"][\"embeddings\"] != True:\n", 94 | " continue\n", 95 | " \n", 96 | " deployment_id = deployment[\"id\"]\n", 97 | " break\n", 98 | "\n", 99 | "# if not model deployed, deploy one\n", 100 | "if not deployment_id:\n", 101 | " print('No deployment with status: succeeded found.')\n", 102 | " model = \"text-similarity-davinci-001\"\n", 103 | "\n", 104 | " # Now let's create the deployment\n", 105 | " print(f'Creating a new deployment with model: {model}')\n", 106 | " result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 107 | " deployment_id = result[\"id\"]\n", 108 | " print(f'Successfully created {model} with deployment_id {deployment_id}')\n", 109 | "else:\n", 110 | " print(f'Found a succeeded deployment that supports embeddings with id: {deployment_id}.')" 111 | ] 112 | }, 113 | { 114 | "attachments": {}, 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Create Embeddings\n", 119 | "\n", 120 | "see [01-get-embeddings.ipynb](./01-get-embeddings.ipynb) on how to get embeddings.\n", 121 | "\n", 122 | "In this example, we will load embeddings from a file. " 123 | ] 124 | }, 125 | { 126 | "attachments": {}, 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Load Data" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 4, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "import pandas as pd\n", 140 | "fname = '../data/bbc-news-data-embedding.csv'\n", 141 | "df_orig = pd.read_csv(fname, delimiter='\\t', index_col=False)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/html": [ 152 | "
\n", 153 | "\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | "
categoryfilenametitlecontentembedding
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...[-0.0012276918860152364, 0.00733763724565506, ...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...[0.0009311728645116091, 0.014099937863647938, ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...[-0.010487922467291355, 0.009665092453360558, ...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...[0.0111119095236063, 0.004624682944267988, -0....
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...[-0.0021637482568621635, 0.005410161800682545,...
..................
2219tech396.txtNew consoles promise big problemsMaking games for future consoles will require...[0.014879594556987286, 0.004789963364601135, -...
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...[0.007671569474041462, 0.00624304823577404, -0...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...[0.0026338498573750257, 0.015989987179636955, ...
2222tech399.txtBe careful how you codeA new European directive could put software w...[0.007126151118427515, 0.008495588786900043, -...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...[0.002447678940370679, 0.006076449993997812, -...
\n", 268 | "

2217 rows × 5 columns

\n", 269 | "
" 270 | ], 271 | "text/plain": [ 272 | " category filename title \\\n", 273 | "0 business 001.txt Ad sales boost Time Warner profit \n", 274 | "1 business 002.txt Dollar gains on Greenspan speech \n", 275 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 276 | "3 business 004.txt High fuel prices hit BA's profits \n", 277 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 278 | "... ... ... ... \n", 279 | "2219 tech 396.txt New consoles promise big problems \n", 280 | "2220 tech 397.txt BT program to beat dialler scams \n", 281 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 282 | "2222 tech 399.txt Be careful how you code \n", 283 | "2223 tech 400.txt US cyber security chief resigns \n", 284 | "\n", 285 | " content \\\n", 286 | "0 Quarterly profits at US media giant TimeWarne... \n", 287 | "1 The dollar has hit its highest level against ... \n", 288 | "2 The owners of embattled Russian oil giant Yuk... \n", 289 | "3 British Airways has blamed high fuel prices f... \n", 290 | "4 Shares in UK drinks and food firm Allied Dome... \n", 291 | "... ... \n", 292 | "2219 Making games for future consoles will require... \n", 293 | "2220 BT is introducing two initiatives to help bea... \n", 294 | "2221 Computer users across the world continue to i... \n", 295 | "2222 A new European directive could put software w... \n", 296 | "2223 The man making sure US computer networks are ... \n", 297 | "\n", 298 | " embedding \n", 299 | "0 [-0.0012276918860152364, 0.00733763724565506, ... \n", 300 | "1 [0.0009311728645116091, 0.014099937863647938, ... \n", 301 | "2 [-0.010487922467291355, 0.009665092453360558, ... \n", 302 | "3 [0.0111119095236063, 0.004624682944267988, -0.... \n", 303 | "4 [-0.0021637482568621635, 0.005410161800682545,... \n", 304 | "... ... \n", 305 | "2219 [0.014879594556987286, 0.004789963364601135, -... \n", 306 | "2220 [0.007671569474041462, 0.00624304823577404, -0... \n", 307 | "2221 [0.0026338498573750257, 0.015989987179636955, ... 
\n", 308 | "2222 [0.007126151118427515, 0.008495588786900043, -... \n", 309 | "2223 [0.002447678940370679, 0.006076449993997812, -... \n", 310 | "\n", 311 | "[2217 rows x 5 columns]" 312 | ] 313 | }, 314 | "execution_count": 5, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "import numpy as np\n", 321 | "\n", 322 | "DEVELOPMENT = False\n", 323 | "\n", 324 | "if DEVELOPMENT:\n", 325 | " # Sub-sample for development\n", 326 | " df = df_orig.sample(n=20, replace=False, random_state=9).copy()\n", 327 | "else:\n", 328 | " df = df_orig.copy()\n", 329 | "\n", 330 | "# drop rows with NaN\n", 331 | "df.dropna(inplace=True)\n", 332 | "\n", 333 | "# convert string to array\n", 334 | "df[\"embedding\"] = df['embedding'].apply(eval).apply(np.array)\n", 335 | "df" 336 | ] 337 | }, 338 | { 339 | "attachments": {}, 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## Find documents with similar embeddings to the embeddings of the question" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 7, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "import numpy as np\n", 353 | "\n", 354 | "def get_embedding(text, deployment_id=deployment_id):\n", 355 | " \"\"\" \n", 356 | " Get embeddings for an input text. \n", 357 | " \"\"\"\n", 358 | " result = openai.Embedding.create(\n", 359 | " deployment_id=deployment_id,\n", 360 | " input=text\n", 361 | " )\n", 362 | " result = np.array(result[\"data\"][0][\"embedding\"])\n", 363 | " return result\n", 364 | "\n", 365 | "def vector_similarity(x, y):\n", 366 | " \"\"\"\n", 367 | " Returns the similarity between two vectors. 
\n", 368 | " Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.\n", 369 | " \"\"\"\n", 370 | " similarity = np.dot(x, y)\n", 371 | " return similarity \n", 372 | "\n", 373 | "def order_document_sections_by_query_similarity(query, contexts):\n", 374 | " \"\"\"\n", 375 | " Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings\n", 376 | " to find the most relevant sections. \n", 377 | " Return the list of document sections, sorted by relevance in descending order.\n", 378 | " \"\"\"\n", 379 | " query_embedding = get_embedding(query)\n", 380 | "\n", 381 | " document_similarities = sorted([\n", 382 | " (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()\n", 383 | " ], reverse=True)\n", 384 | " \n", 385 | " return document_similarities" 386 | ] 387 | }, 388 | { 389 | "attachments": {}, 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "## Retrieve relevant news " 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 14, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "def retrieve_relevant_documents(query, contexts = df['embedding']):\n", 403 | " # find text most similar to the query\n", 404 | " answers = order_document_sections_by_query_similarity(query=query, contexts=contexts)[0:3]\n", 405 | "\n", 406 | " # print top 3\n", 407 | " for answer in answers:\n", 408 | " print(f'similarity score: {answer[0]}')\n", 409 | " print(df['content'].loc[answer[1]], '\\n')\n", 410 | "\n", 411 | " return" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 15, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "similarity score: 0.5842770878602115\n", 424 | " The owner of the technology-dominated Nasdaq stock index plans to sell shares to 
the public and list itself on the market it operates. According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions. The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. The Nasdaq is made up shares in technology firms and other companies with high growth potential. It was the most potent symbol of the 1990s internet and telecoms boom, nose-diving after the bubble burst. A recovery in the fortunes of tech giants such as Intel, and dot.com survivors such as Amazon has helped revive its fortunes. \n", 425 | "\n", 426 | "similarity score: 0.5842770878602115\n", 427 | " The owner of the technology-dominated Nasdaq stock index plans to sell shares to the public and list itself on the market it operates. According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions. The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. 
The Nasdaq is made up shares in technology firms and other companies with high growth potential. It was the most potent symbol of the 1990s internet and telecoms boom, nose-diving after the bubble burst. A recovery in the fortunes of tech giants such as Intel, and dot.com survivors such as Amazon has helped revive its fortunes. \n", 428 | "\n", 429 | "similarity score: 0.584184967453397\n", 430 | " Shares in Google have fallen 6.7% after employees and early investors in the web search took advantage of the first chance to sell their holdings. Restrictions were imposed ahead of its flotation in August, to prevent shares being dumped quickly onto the market. In one of the most closely-watched initial public offerings in stock market history, the US-based company sold 19.6 million shares at $85 each. Google shares have risen since but fell $12.33 on Tuesday to close at $172.55. The restriction - known as a lockup - is being eased piecemeal: in all, some 227 million additional shares will become free to trade by February 2005. Selling the shares could turn many of Google's workers into millionaires. There were fears that the potential increase of shares in circulation from Tuesday would ease demand for stock. However, analysts say they expected most shareholders would be holding back from selling all their shares immediately, as Google's good performance and future growth potential means demand will hold. In its first earnings report since floating on the stock market, Google said it made a net profit of $52m in the three months ending 30 September. Sales surged to $805.9m in the third quarter, up from $393.9m a year earlier. Google's main service - its internet search - is free to users, so the firm makes much of its money from selling advertising space linked to the words for which its users search. It also sells the use of its technology to companies who need to make either their websites, or their internal information systems, searchable. 
\n", 431 | "\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "query = 'News about stock market.'\n", 437 | "retrieve_relevant_documents(query=query)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 16, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "similarity score: 0.5842770878602115\n", 450 | " The owner of the technology-dominated Nasdaq stock index plans to sell shares to the public and list itself on the market it operates. According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions. The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. The Nasdaq is made up shares in technology firms and other companies with high growth potential. It was the most potent symbol of the 1990s internet and telecoms boom, nose-diving after the bubble burst. A recovery in the fortunes of tech giants such as Intel, and dot.com survivors such as Amazon has helped revive its fortunes. \n", 451 | "\n", 452 | "similarity score: 0.5842770878602115\n", 453 | " The owner of the technology-dominated Nasdaq stock index plans to sell shares to the public and list itself on the market it operates. According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. 
However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions. The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. The Nasdaq is made up shares in technology firms and other companies with high growth potential. It was the most potent symbol of the 1990s internet and telecoms boom, nose-diving after the bubble burst. A recovery in the fortunes of tech giants such as Intel, and dot.com survivors such as Amazon has helped revive its fortunes. \n", 454 | "\n", 455 | "similarity score: 0.584184967453397\n", 456 | " Shares in Google have fallen 6.7% after employees and early investors in the web search took advantage of the first chance to sell their holdings. Restrictions were imposed ahead of its flotation in August, to prevent shares being dumped quickly onto the market. In one of the most closely-watched initial public offerings in stock market history, the US-based company sold 19.6 million shares at $85 each. Google shares have risen since but fell $12.33 on Tuesday to close at $172.55. The restriction - known as a lockup - is being eased piecemeal: in all, some 227 million additional shares will become free to trade by February 2005. Selling the shares could turn many of Google's workers into millionaires. There were fears that the potential increase of shares in circulation from Tuesday would ease demand for stock. However, analysts say they expected most shareholders would be holding back from selling all their shares immediately, as Google's good performance and future growth potential means demand will hold. 
In its first earnings report since floating on the stock market, Google said it made a net profit of $52m in the three months ending 30 September. Sales surged to $805.9m in the third quarter, up from $393.9m a year earlier. Google's main service - its internet search - is free to users, so the firm makes much of its money from selling advertising space linked to the words for which its users search. It also sells the use of its technology to companies who need to make either their websites, or their internal information systems, searchable. \n", 457 | "\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "query = 'News about stock market.'\n", 463 | "retrieve_relevant_documents(query=query)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 17, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stdout", 473 | "output_type": "stream", 474 | "text": [ 475 | "similarity score: 0.5600900443509507\n", 476 | " England will have to negotiate their way through a tough draw if they are to win the Rugby World Cup Sevens in Hong Kong next month. The second seeds have been drawn against Samoa, France, Italy, Georgia and Chinese Taipei. The top two sides in each pool qualify but England could face 2001 winners New Zealand in the quarter-finals if they stumble against Samoa. Scotland and Ireland are in Pool A together with the All Blacks. England won the first event of the International Rugby Board World Sevens series in Dubai but have slipped to fourth in the table after failing to build on that victory. However, they beat Samoa in the recent Los Angeles Sevens before losing to Argentina in the semi-finals. \"England have the ability and determination to win this World Cup and create sporting history by being the only nation to hold both the 15s and Sevens World Cups at the same time,\" said England sevens coach Mike Friday. 
\"England have a fantastic record in Hong Kong and have won there the last three years, but the World Cup is on a different level. \"Every pool contains teams who have caused upsets before and we will have to work hard to ensure we progress from our group. \"We have not performed consistently to our true potential so far in the IRB Sevens which has been disappointing - but we can only look forward.\" England won the first Rugby World Cup Sevens in 1993 with a side that included the likes of Lawrence Dallaglio and Matt Dawson. In 1997 and 2001, England lost in the quarter-finals. (seeds in brackets) New Zealand (1), Scotland (8), Tonga, Ireland, Korea, USA. England (2), Samoa (7), France, Italy, Georgia, Chinese Taipei. Fiji (3), Australia (6), Canada, Portugal, Japan, Hong Kong. Argentina (4), South Africa (5), Kenya, Tunisia, Russia, Uruguay. \n", 477 | "\n", 478 | "similarity score: 0.5592240071488772\n", 479 | " The Welsh Rugby Union wants to restructure the Northern Hemisphere season into four separate blocks. The season would start with the Celtic League in October, followed by the Heineken Cup in February and March, and the Six Nations moved to April and May. After a nine week break, the WRU then proposes a two-month period of away and home international matches. WRU chairman David Pickering said the structure would end problems of player availability for club and country. He added: \"We feel sure that spectator interest would respond to the impetus of high intensity rugby being played continuously rather than the fragmented timetable currently in operation. 
\"Equally, we suspect that the sponsors would prefer the sustained interest in a continuous tournament and hopefully, the broadcasters would also enjoy increased exposure.\" Moving the Six Nations from its traditional February beginning should also ensure better weather conditions and \"stimulate greater interest in the games and generally provide increased skills and competition and attract greater spectator viewing\", Pickering argued. The plan will be put before the International Rugby Board next month, where four other plans drawn up by independent consultants for a global integrated season will also be discussed. Pickering added: \"It's very early days and there are a number of caveats associated with it - not least the revenue from the broadcasters, which is extremely important. \"We've got a good plan and one which should be judged on its merits.\" \n", 480 | "\n", 481 | "similarity score: 0.5505910793421455\n", 482 | " England coach Andy Robinson says English rugby has to act now to prevent injury destroying players' careers. He will be without a host of big names for the Six Nations as the intensity of professional rugby union hits players. \"Injuries are part of the sport but we have to have a look at the amount of injuries that occur in the English season,\" Robinson told BBC Sport. \"I think players are probably going to have three or four years taken off their careers.\" Robinson will be missing an entire midfield for the Six Nations with the likes of Jonny Wilkinson, Mike Tindall and Will Greenwood injured. Rugby union has become far more physically demanding since the game went professional nearly 10 years ago. As a result three of the major stakeholders in English rugby have launched an \"injury audit\" to find out how players are coping. The audit is jointly funded by the Professional Rugby Players' Association, the Rugby Football Union and Premier Rugby. As far as Robinson is concerned its findings must not be ignored. 
\"I think there's an injury audit coming out in March that's got some great information in there that I think everybody in the English game has got to look at,\" he said. \"If we don't the situation is going to get worse and not better, so I think rugby as a whole has got to look at this.\" \n", 483 | "\n" 484 | ] 485 | } 486 | ], 487 | "source": [ 488 | "query = 'What is happening in the rugby world?'\n", 489 | "retrieve_relevant_documents(query=query)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 18, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "similarity score: 0.5232753153602943\n", 502 | " A major reform of Brazil's bankruptcy laws has been approved by the country's Congress, in a move which it is hoped will cut the cost of borrowing. The bill, proposed in 1993, has finally been approved by the leadership of President Luiz Inacio Lula da Silva. The old law, dating from 1945, gave priority first to workers, second to tax revenue and finally to creditors. The new legislation changes this, giving priority to creditors and limiting payments to workers. The new regulations will limit payments to workers to 150 times the minimum monthly salary, which is currently $94. The law also makes it more difficult for a company to declare bankruptcy. However, when a firm is declared bankrupt it will gain protection from creditors for 180 days while a recovery plan is worked out. The proposals were opposed in the past by leftist parties, including Mr Lula's Worker Party. They considered that they undermined workers' rights. But President Lula became a defender of the reforms, arguing that the country's bank lending margins were among the highest in the world and were damaging the economy. 
According to Andreas Adriano of Latin Trade Magazine, the new bankruptcy law will help in reducing the spread - difference between the interest rates of the banks and federal bonds. Nevertheless, Mr Adriano said to reduce the basic interest rate the Central Bank needs to change its policy, focusing not only on inflation but also on economic growth. \n", 503 | "\n", 504 | "similarity score: 0.5192763546343225\n", 505 | " Brazil's unemployment rate fell to its lowest level in three years in December, according to the government. The Brazilian Institute for Geography and Statistics (IBGE) said it fell to 9.6% in December from 10.6% in November and 10.9% in December 2003. IBGE also said that average monthly salaries grew 1.9% in December 2004 from December 2003. However, average monthly wages fell 1.8% in December to 895.4 reais ($332; £179.3) from November. Tuesday's figures represent the first time that the unemployment rate has fallen to a single digit since new measurement rules were introduced in 2001. The unemployment rate has been falling gradually since April 2004 when it reached a peak of 13.1%. The jobless rate average for the whole of 2004 was 11.5%, down from 12.3% in 2003, the IBGE said. This improvement can be attributed to the country's strong economic growth, with the economy registering growth of 5.2% in 2004, the government said. The economy is expected to grow by about 4% this year. President Luiz Inacio Lula da Silva promised to reduce unemployment when he was elected two years ago. Nevertheless, some analysts say that unemployment could increase in the next months. \"The data is favourable, but a lot of jobs are temporary for the (Christmas) holiday season, so we may see slightly higher joblessness in January and February,\" Julio Hegedus, chief economist with Lopes Filho & Associates consultancy in Rio de Janeir, told Reuters news agency. 
Despite his leftist background, President Lula has pursued a surprisingly conservative economic policy, arguing that in order to meet its social promises, the government needs to first reach a sustained economic growth. The unemployment rate is measured in the six main metropolitan areas of Brazil (Sao Paolo, Rio de Janeiro, Belo Horizonte, Recife, Salvador and Porto Alegre), where most of the population is concentrated. \n", 506 | "\n", 507 | "similarity score: 0.5156874118999003\n", 508 | " Belgian brewing giant Inbev has seen its profits soar thanks to its acquisition of Brazil's biggest beer firm Ambev last year. Inbev, which makes Stella Artois, said pre-tax profits for 2004 rose 56% to 1.16bn euros ($1.5bn; £800m), and said it expected solid growth in 2005. The performance comes on sales up 21% at 8.6bn euros. The firm, formerly Interbrew, became the world's biggest brewer by volume when it bought Ambev in August 2004. The acquisition meant its sales by volume grew 57% in 2004, with four months of Ambev sales accounting for almost all of the increase. US beermaker Anheuser-Busch sells less beer by volume than Inbev but is bigger in terms of the value of its sales. Continuing demand for Inbev's products in the South American markets where its Brazilian arm is most popular means it expects to keep boosting its turnover. \"It's the Brazil business that's doing it,\" said ING analyst Gerard Rijk of Inbev's strong performance. Ambev boosted its share of Brazil's beer market from 62% at the end of 2003 to more than 68% by December 2004, Inbev reported. In contrast, Inbev's European business saw volume sales fall 2.5%, although Central and Eastern European sales rose 12%. Overall, net profits were up 42% to 719m euros. 
\n", 509 | "\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "query = 'What happened in Brazil?'\n", 515 | "retrieve_relevant_documents(query=query)" 516 | ] 517 | } 518 | ], 519 | "metadata": { 520 | "kernelspec": { 521 | "display_name": "azureml_py38", 522 | "language": "python", 523 | "name": "python3" 524 | }, 525 | "language_info": { 526 | "codemirror_mode": { 527 | "name": "ipython", 528 | "version": 3 529 | }, 530 | "file_extension": ".py", 531 | "mimetype": "text/x-python", 532 | "name": "python", 533 | "nbconvert_exporter": "python", 534 | "pygments_lexer": "ipython3", 535 | "version": "3.8.5" 536 | }, 537 | "orig_nbformat": 4, 538 | "vscode": { 539 | "interpreter": { 540 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 541 | } 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 2 546 | } 547 | -------------------------------------------------------------------------------- /notebooks/09-unstructure-data-to-structured-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Convert Unstructured Data to Structured Data" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "attachments": {}, 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Set up Azure OpenAI" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "True" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ],
48 | "source": [ 49 | "import os\n", 50 | "import openai\n", 51 | "from dotenv import load_dotenv\n", 52 | "\n", 53 | "# Set up Azure OpenAI\n", 54 | "load_dotenv()\n", 55 | "openai.api_type = \"azure\"\n", 56 | "openai.api_base = \"https://tutorial-openai-01-2023.openai.azure.com/\"\n", 57 | "openai.api_version = \"2022-12-01\"\n", 58 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 59 | ] 60 | }, 61 | { 62 | "attachments": {}, 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Deploy a Language Model" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 25, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Text-davinci-003\n", 79 | "Found a succeeded deployment that supports embeddings with id: Text-davinci-003.\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# list existing deployments and look for one serving the desired model\n", 85 | "deployment_id = None\n", 86 | "result = openai.Deployment.list()\n", 87 | "desired_model = 'text-davinci-003'\n", 88 | "\n", 89 | "# check if desired model is already deployed\n", 90 | "for deployment in result.data:\n", 91 | " if deployment[\"status\"] != \"succeeded\":\n", 92 | " continue\n", 93 | " \n", 94 | " if deployment['model'] != desired_model:\n", 95 | " continue\n", 96 | " \n", 97 | " deployment_id = deployment[\"id\"]; print(deployment_id)\n", 98 | " break\n", 99 | "\n", 100 | "# if no model is deployed, deploy one\n", 101 | "if not deployment_id:\n", 102 | " print('No deployment with status: succeeded found.')\n", 103 | " model = desired_model\n", 104 | "\n", 105 | " # Now let's create the deployment\n", 106 | " print(f'Creating a new deployment with model: {model}')\n", 107 | " result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 108 | " deployment_id = result[\"id\"]\n", 109 | " print(f'Successfully created {model} with deployment_id {deployment_id}')\n", 110 | "else:\n", 111 |
" print(f'Found a succeeded deployment that supports embeddings with id: {deployment_id}.')" 112 | ] 113 | }, 114 | { 115 | "attachments": {}, 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Load Data" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 26, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "import pandas as pd\n", 129 | "fname = '../data/bbc-news-data.csv'\n", 130 | "df_orig = pd.read_csv(fname, delimiter='\\t', index_col=False)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 29, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/html": [ 141 | "
\n", 142 | "\n", 155 | "\n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | "
categoryfilenametitlecontent
0business001.txtAd sales boost Time Warner profitQuarterly profits at US media giant TimeWarne...
1business002.txtDollar gains on Greenspan speechThe dollar has hit its highest level against ...
2business003.txtYukos unit buyer faces loan claimThe owners of embattled Russian oil giant Yuk...
3business004.txtHigh fuel prices hit BA's profitsBritish Airways has blamed high fuel prices f...
4business005.txtPernod takeover talk lifts DomecqShares in UK drinks and food firm Allied Dome...
...............
2220tech397.txtBT program to beat dialler scamsBT is introducing two initiatives to help bea...
2221tech398.txtSpam e-mails tempt net shoppersComputer users across the world continue to i...
2222tech399.txtBe careful how you codeA new European directive could put software w...
2223tech400.txtUS cyber security chief resignsThe man making sure US computer networks are ...
2224tech401.txtLosing yourself in online gamingOnline role playing games are time-consuming,...
\n", 245 | "

2225 rows × 4 columns

\n", 246 | "
" 247 | ], 248 | "text/plain": [ 249 | " category filename title \\\n", 250 | "0 business 001.txt Ad sales boost Time Warner profit \n", 251 | "1 business 002.txt Dollar gains on Greenspan speech \n", 252 | "2 business 003.txt Yukos unit buyer faces loan claim \n", 253 | "3 business 004.txt High fuel prices hit BA's profits \n", 254 | "4 business 005.txt Pernod takeover talk lifts Domecq \n", 255 | "... ... ... ... \n", 256 | "2220 tech 397.txt BT program to beat dialler scams \n", 257 | "2221 tech 398.txt Spam e-mails tempt net shoppers \n", 258 | "2222 tech 399.txt Be careful how you code \n", 259 | "2223 tech 400.txt US cyber security chief resigns \n", 260 | "2224 tech 401.txt Losing yourself in online gaming \n", 261 | "\n", 262 | " content \n", 263 | "0 Quarterly profits at US media giant TimeWarne... \n", 264 | "1 The dollar has hit its highest level against ... \n", 265 | "2 The owners of embattled Russian oil giant Yuk... \n", 266 | "3 British Airways has blamed high fuel prices f... \n", 267 | "4 Shares in UK drinks and food firm Allied Dome... \n", 268 | "... ... \n", 269 | "2220 BT is introducing two initiatives to help bea... \n", 270 | "2221 Computer users across the world continue to i... \n", 271 | "2222 A new European directive could put software w... \n", 272 | "2223 The man making sure US computer networks are ... \n", 273 | "2224 Online role playing games are time-consuming,... 
\n", 274 | "\n", 275 | "[2225 rows x 4 columns]" 276 | ] 277 | }, 278 | "execution_count": 29, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "import numpy as np\n", 285 | "\n", 286 | "DEVELOPMENT = False # Set to True for development using a subset of data\n", 287 | "\n", 288 | "if DEVELOPMENT:\n", 289 | " # Sub-sample for development\n", 290 | " df = df_orig.sample(n=20, replace=False, random_state=9).copy()\n", 291 | "else:\n", 292 | " df = df_orig.copy()\n", 293 | "\n", 294 | "df" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 31, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/html": [ 305 | "
\n", 306 | "\n", 319 | "\n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | "
categoryfilenametitlecontent
510entertainment001.txtGallery unveils interactive treeA Christmas tree that can receive text messag...
511entertainment002.txtJarre joins fairytale celebrationFrench musician Jean-Michel Jarre is to perfo...
512entertainment003.txtMusical treatment for Capra filmThe classic film It's A Wonderful Life is to ...
513entertainment004.txtRichard and Judy choose top booksThe 10 authors shortlisted for a Richard and ...
514entertainment005.txtPoppins musical gets flying startThe stage adaptation of children's film Mary ...
...............
891entertainment382.txtLast Star Wars 'not for children'The sixth and final Star Wars movie may not b...
892entertainment383.txtFrench honour for director ParkerBritish film director Sir Alan Parker has bee...
893entertainment384.txtRobots march to US cinema summitAnimated movie Robots has opened at the top o...
894entertainment385.txtHobbit picture 'four years away'Lord of the Rings director Peter Jackson has ...
895entertainment386.txtBuffy creator joins Wonder WomanThe creator of Buffy the Vampire Slayer is to...
\n", 409 | "

386 rows × 4 columns

\n", 410 | "
" 411 | ], 412 | "text/plain": [ 413 | " category filename title \\\n", 414 | "510 entertainment 001.txt Gallery unveils interactive tree \n", 415 | "511 entertainment 002.txt Jarre joins fairytale celebration \n", 416 | "512 entertainment 003.txt Musical treatment for Capra film \n", 417 | "513 entertainment 004.txt Richard and Judy choose top books \n", 418 | "514 entertainment 005.txt Poppins musical gets flying start \n", 419 | ".. ... ... ... \n", 420 | "891 entertainment 382.txt Last Star Wars 'not for children' \n", 421 | "892 entertainment 383.txt French honour for director Parker \n", 422 | "893 entertainment 384.txt Robots march to US cinema summit \n", 423 | "894 entertainment 385.txt Hobbit picture 'four years away' \n", 424 | "895 entertainment 386.txt Buffy creator joins Wonder Woman \n", 425 | "\n", 426 | " content \n", 427 | "510 A Christmas tree that can receive text messag... \n", 428 | "511 French musician Jean-Michel Jarre is to perfo... \n", 429 | "512 The classic film It's A Wonderful Life is to ... \n", 430 | "513 The 10 authors shortlisted for a Richard and ... \n", 431 | "514 The stage adaptation of children's film Mary ... \n", 432 | ".. ... \n", 433 | "891 The sixth and final Star Wars movie may not b... \n", 434 | "892 British film director Sir Alan Parker has bee... \n", 435 | "893 Animated movie Robots has opened at the top o... \n", 436 | "894 Lord of the Rings director Peter Jackson has ... \n", 437 | "895 The creator of Buffy the Vampire Slayer is to... 
\n", 438 | "\n", 439 | "[386 rows x 4 columns]" 440 | ] 441 | }, 442 | "execution_count": 31, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "df[df['category']=='entertainment']" 449 | ] 450 | }, 451 | { 452 | "attachments": {}, 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## Unstructured data to structured data" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 28, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "def retrieve_structured_data(prompt):\n", 466 | " try:\n", 467 | " # Request API\n", 468 | " response = openai.Completion.create(\n", 469 | " deployment_id= deployment_id, \n", 470 | " prompt=prompt,\n", 471 | " temperature=1,\n", 472 | " max_tokens=300,\n", 473 | " top_p=1.0,\n", 474 | " frequency_penalty=0.0,\n", 475 | " presence_penalty=1\n", 476 | " )\n", 477 | "\n", 478 | " # response\n", 479 | " result = response['choices'][0]['text']; print(result)\n", 480 | " except Exception as err:\n", 481 | " print(f\"Unexpected {err=}, {type(err)=}\")\n", 482 | "\n", 483 | " return " 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 39, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | " The 10 authors shortlisted for a Richard and Judy book award in 2005 are hoping for a boost in sales following the success of this year's winner. The TV couple's interest in the book world coined the term \"the Richard & Judy effect\" and created the top two best-selling paperbacks of 2004 so far. The finalists for 2005 include Andrew Taylor's The American Boy and Robbie Williams' autobiography Feel. This year's winner, Alice Sebold's The Lovely Bones, sold over one million. Joseph O'Connor's Star of the Sea came second and saw sales increase by 350%. 
The best read award, on Richard Madeley and Judy Finnigan's Channel 4 show, is part of the British Book Awards. David Mitchell's Booker-shortlisted novel, Cloud Atlas, makes it into this year's top 10 along with several lesser known works. \"There's no doubt that this year's selection of book club entries is the best yet. If anything, the choice is even wider than last time,\" said Madeley. \"It was very hard to follow last year's extremely successful list, but we think this year's books will do even better,\" said Richard and Judy executive producer Amanda Ross. \"We were spoiled for choice and it was tough getting down to only 10 from the 301 submitted.\" \n", 496 | " \n", 497 | "\n", 498 | " Extract author and books from the text above in a table. \n", 499 | "\n", 500 | " \n", 501 | " \n", 502 | "Author | Book \n", 503 | "----------------|-----------------\n", 504 | "Andrew Taylor | The American Boy \n", 505 | "Robbie Williams | Feel \n", 506 | "Alice Sebold | The Lovely Bones \n", 507 | "Joseph O'Connor | Star of the Sea\n", 508 | "David Mitchell | Cloud Atlas\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "idx = 513 #index of the selected text\n", 514 | "\n", 515 | "# prompt postfix\n", 516 | "prompt_postfix = \"\"\" \n", 517 | " \\n\\n Extract author and books from the text above in a table. \n", 518 | "\"\"\"\n", 519 | "# build prompt\n", 520 | "prompt = df['content'].loc[idx] + prompt_postfix; print(prompt)\n", 521 | "\n", 522 | "# query\n", 523 | "retrieve_structured_data(prompt=prompt)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 41, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "name": "stdout", 533 | "output_type": "stream", 534 | "text": [ 535 | " The sixth and final Star Wars movie may not be suitable for young children, film-maker George Lucas has said. He told US TV show 60 Minutes that Revenge of the Sith would be the darkest and most violent of the series. 
\"I don't think I would take a five or six-year-old to this,\" he told the CBS programme, to be aired on Sunday. Lucas predicted the film would get a US rating advising parents some scenes may be unsuitable for under-13s. It opens in the UK and US on 19 May. He said he expected the film would be classified PG-13 - roughly equivalent to a British 12A rating. The five previous Star Wars films have all carried less restrictive PG - parental guidance - ratings in the US. In the UK, they have all been passed U - suitable for all - with the exception of Attack of The Clones, which got a PG rating in 2002. Revenge of the Sith - the third prequel to the original 1977 Star Wars film - chronicles the transformation of the heroic Anakin Skywalker into the evil Darth Vader as he travels to a Hell-like planet composed of erupting volcanoes and molten lava. \"We're going to watch him make a pact with the devil,\" Lucas said. \"The film is much more dark, more emotional. It's much more of a tragedy.\" \n", 536 | " \n", 537 | "\n", 538 | " Extract Star Wars movie series and associated ratings from the text above into a table. \n", 539 | "\n", 540 | "\n", 541 | "Star Wars Movie | US Rating | UK Rating \n", 542 | "-------------------------------------------\n", 543 | "Revenge of the Sith | PG-13 | 12A \n", 544 | "Attack of the Clones | PG | PG \n", 545 | "Original 1977 Star Wars Film | PG | U\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "idx = 891 #index of the selected text\n", 551 | "\n", 552 | "# prompt postifx\n", 553 | "prompt_postfix = \"\"\" \n", 554 | " \\n\\n Extract Star Wars movie series and associated ratings from the text above into a table. 
\n", 555 | "\"\"\"\n", 556 | "# build prompt\n", 557 | "prompt = df['content'].loc[idx] + prompt_postfix; print(prompt)\n", 558 | "\n", 559 | "# query\n", 560 | "retrieve_structured_data(prompt=prompt)" 561 | ] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "azureml_py38", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.8.5" 581 | }, 582 | "orig_nbformat": 4, 583 | "vscode": { 584 | "interpreter": { 585 | "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" 586 | } 587 | } 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 2 591 | } 592 | --------------------------------------------------------------------------------