├── .github
    ├── CODE_OF_CONDUCT.md
    ├── ISSUE_TEMPLATE.md
    └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── _assets
    ├── azure-sql-cosine-similarity-vector-type.gif
    └── cosine-similarity-search-result.png
├── distance-calculations-in-tsql.md
├── python
    ├── .env.sample
    ├── 00-setup-database.sql
    ├── README.md
    ├── hybrid_search.py
    ├── requirements.txt
    └── utilities.py
└── vector-embeddings
    ├── 00-setup-blob-access.sql
    ├── 01-import-wikipedia.sql
    ├── 02-use-native-vectors.sql
    ├── 03-store-openai-credentials.sql
    ├── 04-create-get-embeddings-procedure.sql
    ├── 05-find-similar-articles.sql
    ├── 06-hybrid-search-setup.sql
    └── 07-hybrid-search.sql


/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 | IF SUFFICIENT INFORMATION IS NOT PROVIDED VIA THE FOLLOWING TEMPLATE THE ISSUE MIGHT BE CLOSED WITHOUT FURTHER CONSIDERATION OR INVESTIGATION
 3 | -->
 4 | > Please provide us with the following information:
 5 | > ---------------------------------------------------------------
 6 | 
 7 | ### This issue is for a: (mark with an `x`)
 8 | ```
 9 | - [ ] bug report -> please search issues before submitting
10 | - [ ] feature request
11 | - [ ] documentation issue or request
12 | - [ ] regression (a behavior that used to work and stopped in a new release)
13 | ```
14 | 
15 | ### Minimal steps to reproduce
16 | >
17 | 
18 | ### Any log messages given by the failure
19 | >
20 | 
21 | ### Expected/desired behavior
22 | >
23 | 
24 | ### OS and Version?
25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?)
26 | 
27 | ### Versions
28 | >
29 | 
30 | ### Mention any other details that might be useful
31 | 
32 | > ---------------------------------------------------------------
33 | > Thanks! We'll be in touch soon.
34 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Purpose
 2 | <!-- Describe the intention of the changes being proposed. What problem does it solve or functionality does it add? -->
 3 | * ...
 4 | 
 5 | ## Does this introduce a breaking change?
 6 | <!-- Mark one with an "x". -->
 7 | ```
 8 | [ ] Yes
 9 | [ ] No
10 | ```
11 | 
12 | ## Pull Request Type
13 | What kind of change does this Pull Request introduce?
14 | 
15 | <!-- Please check the one that applies to this PR using "x". -->
16 | ```
17 | [ ] Bugfix
18 | [ ] Feature
19 | [ ] Code style update (formatting, local variables)
20 | [ ] Refactoring (no functional changes, no api changes)
21 | [ ] Documentation content changes
22 | [ ] Other... Please describe:
23 | ```
24 | 
25 | ## How to Test
26 | *  Get the code
27 | 
28 | ```
29 | git clone [repo-address]
30 | cd [repo-name]
31 | git checkout [branch-name]
32 | npm install
33 | ```
34 | 
35 | * Test the code
36 | <!-- Add steps to run the tests suite and/or manually test -->
37 | ```
38 | ```
39 | 
40 | ## What to Check
41 | Verify that the following are valid
42 | * ...
43 | 
44 | ## Other Information
45 | <!-- Add any other helpful information that may be needed here. -->


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | ##
  4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
  5 | 
  6 | # User-specific files
  7 | *.rsuser
  8 | *.suo
  9 | *.user
 10 | *.userosscache
 11 | *.sln.docstates
 12 | 
 13 | # User-specific files (MonoDevelop/Xamarin Studio)
 14 | *.userprefs
 15 | 
 16 | # Mono auto generated files
 17 | mono_crash.*
 18 | 
 19 | # Build results
 20 | [Dd]ebug/
 21 | [Dd]ebugPublic/
 22 | [Rr]elease/
 23 | [Rr]eleases/
 24 | x64/
 25 | x86/
 26 | [Ww][Ii][Nn]32/
 27 | [Aa][Rr][Mm]/
 28 | [Aa][Rr][Mm]64/
 29 | bld/
 30 | [Bb]in/
 31 | [Oo]bj/
 32 | [Ll]og/
 33 | [Ll]ogs/
 34 | 
 35 | # Visual Studio 2015/2017 cache/options directory
 36 | .vs/
 37 | # Uncomment if you have tasks that create the project's static files in wwwroot
 38 | #wwwroot/
 39 | 
 40 | # Visual Studio 2017 auto generated files
 41 | Generated\ Files/
 42 | 
 43 | # MSTest test Results
 44 | [Tt]est[Rr]esult*/
 45 | [Bb]uild[Ll]og.*
 46 | 
 47 | # NUnit
 48 | *.VisualState.xml
 49 | TestResult.xml
 50 | nunit-*.xml
 51 | 
 52 | # Build Results of an ATL Project
 53 | [Dd]ebugPS/
 54 | [Rr]eleasePS/
 55 | dlldata.c
 56 | 
 57 | # Benchmark Results
 58 | BenchmarkDotNet.Artifacts/
 59 | 
 60 | # .NET Core
 61 | project.lock.json
 62 | project.fragment.lock.json
 63 | artifacts/
 64 | 
 65 | # ASP.NET Scaffolding
 66 | ScaffoldingReadMe.txt
 67 | 
 68 | # StyleCop
 69 | StyleCopReport.xml
 70 | 
 71 | # Files built by Visual Studio
 72 | *_i.c
 73 | *_p.c
 74 | *_h.h
 75 | *.ilk
 76 | *.meta
 77 | *.obj
 78 | *.iobj
 79 | *.pch
 80 | *.pdb
 81 | *.ipdb
 82 | *.pgc
 83 | *.pgd
 84 | *.rsp
 85 | *.sbr
 86 | *.tlb
 87 | *.tli
 88 | *.tlh
 89 | *.tmp
 90 | *.tmp_proj
 91 | *_wpftmp.csproj
 92 | *.log
 93 | *.tlog
 94 | *.vspscc
 95 | *.vssscc
 96 | .builds
 97 | *.pidb
 98 | *.svclog
 99 | *.scc
100 | 
101 | # Chutzpah Test files
102 | _Chutzpah*
103 | 
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 | 
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 | 
121 | # Visual Studio Trace Files
122 | *.e2e
123 | 
124 | # TFS 2012 Local Workspace
125 | $tf/
126 | 
127 | # Guidance Automation Toolkit
128 | *.gpState
129 | 
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 | 
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 | 
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 | 
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 | 
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 | 
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 | 
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 | 
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 | 
163 | # Web workbench (sass)
164 | .sass-cache/
165 | 
166 | # Installshield output folder
167 | [Ee]xpress/
168 | 
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 | 
179 | # Click-Once directory
180 | publish/
181 | 
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 | 
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 | 
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 | 
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 | 
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 | 
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 | 
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 | 
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 | 
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 | 
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 | 
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 | 
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 | 
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 | 
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 | 
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 | 
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 | 
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 | 
288 | # Visual Studio 6 build log
289 | *.plg
290 | 
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 | 
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 | 
297 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298 | *.vbp
299 | 
300 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301 | *.dsw
302 | *.dsp
303 | 
304 | # Visual Studio 6 technical files
305 | *.ncb
306 | *.aps
307 | 
308 | # Visual Studio LightSwitch build output
309 | **/*.HTMLClient/GeneratedArtifacts
310 | **/*.DesktopClient/GeneratedArtifacts
311 | **/*.DesktopClient/ModelManifest.xml
312 | **/*.Server/GeneratedArtifacts
313 | **/*.Server/ModelManifest.xml
314 | _Pvt_Extensions
315 | 
316 | # Paket dependency manager
317 | .paket/paket.exe
318 | paket-files/
319 | 
320 | # FAKE - F# Make
321 | .fake/
322 | 
323 | # CodeRush personal settings
324 | .cr/personal
325 | 
326 | # Python Tools for Visual Studio (PTVS)
327 | __pycache__/
328 | *.pyc
329 | 
330 | # Cake - Uncomment if you are using it
331 | # tools/**
332 | # !tools/packages.config
333 | 
334 | # Tabs Studio
335 | *.tss
336 | 
337 | # Telerik's JustMock configuration file
338 | *.jmconfig
339 | 
340 | # BizTalk build output
341 | *.btp.cs
342 | *.btm.cs
343 | *.odx.cs
344 | *.xsd.cs
345 | 
346 | # OpenCover UI analysis results
347 | OpenCover/
348 | 
349 | # Azure Stream Analytics local run output
350 | ASALocalRun/
351 | 
352 | # MSBuild Binary and Structured Log
353 | *.binlog
354 | 
355 | # NVidia Nsight GPU debugger configuration file
356 | *.nvuser
357 | 
358 | # MFractors (Xamarin productivity tool) working folder
359 | .mfractor/
360 | 
361 | # Local History for Visual Studio
362 | .localhistory/
363 | 
364 | # Visual Studio History (VSHistory) files
365 | .vshistory/
366 | 
367 | # BeatPulse healthcheck temp database
368 | healthchecksdb
369 | 
370 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
371 | MigrationBackup/
372 | 
373 | # Ionide (cross platform F# VS Code tools) working folder
374 | .ionide/
375 | 
376 | # Fody - auto-generated XML schema
377 | FodyWeavers.xsd
378 | 
379 | # VS Code files for those working on multiple tools
380 | .vscode/*
381 | !.vscode/settings.json
382 | !.vscode/tasks.json
383 | !.vscode/launch.json
384 | !.vscode/extensions.json
385 | *.code-workspace
386 | 
387 | # Local History for Visual Studio Code
388 | .history/
389 | 
390 | # Windows Installer files from build outputs
391 | *.cab
392 | *.msi
393 | *.msix
394 | *.msm
395 | *.msp
396 | 
397 | # JetBrains Rider
398 | *.sln.iml
399 | 
400 | # Custom
401 | *.local.txt
402 | *.local.sql
403 | .venv/
404 | .env
405 | __pycache__/


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ## [project-title] Changelog
 2 | 
 3 | <a name="x.y.z"></a>
 4 | # x.y.z (yyyy-mm-dd)
 5 | 
 6 | *Features*
 7 | * ...
 8 | 
 9 | *Bug Fixes*
10 | * ...
11 | 
12 | *Breaking Changes*
13 | * ...
14 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to [project-title]
 2 | 
 3 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
 6 | 
 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
 9 | provided by the bot. You will only need to do this once across all repos using our CLA.
10 | 
11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
14 | 
15 |  - [Code of Conduct](#coc)
16 |  - [Issues and Bugs](#issue)
17 |  - [Feature Requests](#feature)
18 |  - [Submission Guidelines](#submit)
19 | 
20 | ## <a name="coc"></a> Code of Conduct
21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
22 | 
23 | ## <a name="issue"></a> Found an Issue?
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by
25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can
26 | [submit a Pull Request](#submit-pr) with a fix.
27 | 
28 | ## <a name="feature"></a> Want a Feature?
29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 | 
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr).
34 | 
35 | ## <a name="submit"></a> Submission Guidelines
36 | 
37 | ### <a name="submit-issue"></a> Submitting an Issue
38 | Before you submit an issue, search the archive, maybe your question was already answered.
39 | 
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues.  Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 | 
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what are you trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or a unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 |   causing the problem (line of code or commit)
53 | 
54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new].
55 | 
56 | ### <a name="submit-pr"></a> Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR) consider the following guidelines:
58 | 
59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR
60 |   that relates to your submission. You don't want to duplicate effort.
61 | 
62 | * Make your changes in a new git fork:
63 | 
64 | * Commit your changes using a descriptive commit message
65 | * Push your fork to GitHub:
66 | * In GitHub, create a pull request
67 | * If we suggest changes then:
68 |   * Make the required updates.
69 |   * Rebase your fork and force push to your GitHub repository (this will update your Pull Request):
70 | 
71 |     ```shell
72 |     git rebase master -i
73 |     git push -f
74 |     ```
75 | 
76 | That's it! Thank you for your contribution!
77 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | page_type: sample
  3 | languages:
  4 | - sql
  5 | products:
  6 | - azure-openai
  7 | - azure-sql-database
  8 | urlFragment: azure-sql-db-openai
  9 | name: Vector similarity search with Azure SQL & Azure OpenAI
 10 | description: |
 11 |   Use Azure OpenAI from Azure SQL database to get the vector embeddings of any chosen text, and then calculate the cosine similarity to find related topics
 12 | ---
 13 | 
 14 | # Vector similarity search with Azure SQL & Azure OpenAI
 15 | 
 16 | This example shows how to use Azure OpenAI from Azure SQL database to get the vector embeddings of any chosen text, and then calculate the [cosine similarity](https://learn.microsoft.com/en-us/azure/storage/common/storage-sas-overview) against the Wikipedia articles (for which vector embeddings have been already calculated,) to find the articles that covers topics that are close - or similar - to the provided text.
 17 | 
 18 | For an introduction on text and code embeddings, check out this OpenAI article: [Introducing text and code embeddings](https://openai.com/blog/introducing-text-and-code-embeddings).
 19 | 
 20 | ## Native or Classic?
 21 | 
 22 | Azure SQL database can be used to easily and quickly perform vector similarity search. There are two options for this: a native option and a classic option.
 23 | 
 24 | The **native option** is to use the new Vector Functions, recently introduced in Azure SQL database. Vector Functions are a set of functions that can be used to perform vector operations directly in the database. 
 25 | 
 26 | > [!NOTE]  
 27 | > Vector Functions are in Public Preview. Learn the details about vectors in Azure SQL here: https://aka.ms/azure-sql-vector-public-preview
 28 | 
 29 | ![](_assets/azure-sql-cosine-similarity-vector-type.gif)
 30 | 
 31 | The **classic option** is to use the classic T-SQL to perform vector operations, with the support of columnstore indexes for getting good performance.
 32 | 
 33 | > [!IMPORTANT]  
 34 | > This branch (the `main` branch) uses the native vector support in Azure SQL. If you want to use the classic T-SQL, switch to the `classic` branch.
 35 | 
 36 | ## Download and import the Wikipedia Article with Vector Embeddings
 37 | 
 38 | Download the [wikipedia embeddings from here](https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip), unzip it and upload it (using [Azure Storage Explorer](https://learn.microsoft.com/azure/vs-azure-tools-storage-manage-with-storage-explorer?tabs=windows) for example) to an Azure Blob Storage container.
 39 | 
 40 | In the example the unzipped CSV file `vector_database_wikipedia_articles_embedded.csv` is assumed to be uploaded to a blob container named `playground` and in a folder named `wikipedia`.
 41 | 
 42 | Once the file is uploaded, get the [SAS token](https://learn.microsoft.com/azure/storage/common/storage-sas-overview) to allow Azure SQL database to access it. (From Azure storage Explorer, right click on the `playground` container and then select `Get Shared Access Signature`. Set the expiration date to some time in future and then click on "Create". Copy the generated query string somewhere, for example into Notepad, as it will be needed later)
 43 | 
 44 | Use a client tool like [Azure Data Studio](https://azure.microsoft.com/products/data-studio/) to connect to an Azure SQL database and then use the `./vector-embeddings/01-import-wikipedia.sql` to create the `wikipedia_articles_embeddings` table where the uploaded CSV file will be imported.
 45 | 
 46 | Make sure to replace the `<account>` and `<sas-token>` placeholders with the value correct for your environment:
 47 | 
 48 | - `<account>` is the name of the storage account where the CSV file has been uploaded
 49 | - `<sas-token>` is the Share Access Signature obtained before
 50 | 
 51 | Run each section (each section starts with a comment) separately. At the end of the process (will take up to a couple of minutes) you will have all the CSV data imported in the `wikipedia_articles_embeddings` table.
 52 | 
 53 | ## Add embeddings columns to table
 54 | 
 55 | In the imported data, vectors are stored as JSON arrays. To take advtange of vector processing, the arrays must be saved into a more compact and optimized binary format index. Thanks to the new `VECTOR` type, turning a vector into a set of values that can be saved into a column is very easy:
 56 | 
 57 | ```sql
 58 | alter table wikipedia_articles_embeddings
 59 | add title_vector_ada2 vector(1536);
 60 | 
 61 | update 
 62 |     wikipedia_articles_embeddings
 63 | set 
 64 |     title_vector_ada2 = cast(title_vector as vector(1536)),
 65 | ```
 66 | 
 67 | The script `./vector-embeddings/02-use-native-vectors.sql` does exactly that. It takes the existing columns with vectors stored in JSON arrays and turns them into vectors saved in binary format.
 68 | 
 69 | ## Find similar articles by calculating cosine distance
 70 | 
 71 | Make sure to have an Azure OpenAI [embeddings model](https://learn.microsoft.com/azure/cognitive-services/openai/concepts/models#embeddings-models) deployed and make sure it is using the `text-embedding-ada-002` model.
 72 | 
 73 | Once the Azure OpenAI model is deployed, it can be called from Azure SQL database using [sp_invoke_external_rest_endpoint](https://learn.microsoft.com/sql/relational-databases/system-stored-procedures/sp-invoke-external-rest-endpoint-transact-sql), to get the embedding vector for the "the foundation series by isaac asimov", text, for example, using the following code (make sure to replace the `<your-api-name>` and `<api-key>` with yout Azure OpenAI deployment):
 74 | 
 75 | ```sql
 76 | declare @inputText nvarchar(max) = 'the foundation series by isaac asimov';
 77 | declare @retval int, @response nvarchar(max);
 78 | declare @payload nvarchar(max) = json_object('input': @inputText);
 79 | exec @retval = sp_invoke_external_rest_endpoint
 80 |     @url = 'https://<your-api-name>.openai.azure.com/openai/deployments/<deployment-id>/embeddings?api-version=2023-03-15-preview',
 81 |     @method = 'POST',
 82 |     @headers = '{"api-key":"<api-key>"}',
 83 |     @payload = @payload,
 84 |     @response = @response output;
 85 | select @response;
 86 | ```
 87 | 
 88 | The vector returned in the response can extracted using `json_query`:
 89 | 
 90 | ```sql
 91 | set @re = json_query(@response, '$.result.data[0].embedding')
 92 | ```
 93 | 
 94 | Now it is just a matter of taking the vector of the sample text and the vectors of all wikipedia articles and calculating the cosine similarity. The math can be easily expressed in T-SQL:
 95 | 
 96 | ```sql
 97 | vector_distance('cosine', @embedding, title_vector) 
 98 | ```
 99 | 
100 | ## Encapsulating logic to retrieve embeddings
101 | 
102 | The described process can be wrapped into stored procedures to make it easy to re-use. The scripts in the `./vector-embeddings/` directory show how to create a stored procedure to retrieve the embeddings from OpenAI:
103 | 
104 | - `03-store-openai-credentials.sql`: stores the Azure OpenAI credentials in the Azure SQL database
105 | - `04-create-get-embeddings-procedure.sql`: creates a stored procedure to encapsulate the call to OpenAI using the script. 
106 | 
107 | ## Finding similar articles
108 | 
109 | The script `05-find-similar-articles.sql` uses the created stored procedure and the process explained above to find similar articles to the provided text. 
110 | 
111 | ## Alternative sample with Python and a local embedding model
112 | 
113 | If you don't want to, or can't use OpenAI to generate embeddings, you can use a local model like `https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1` to generate embeddings. The Python script `./python/hybrid_search.py` shows how to 
114 | 
115 | - use Python to generate the embeddings 
116 | - do similarity search in Azure SQL database
117 | - use [Fulltext search in Azure SQL database with BM25 ranking](https://learn.microsoft.com/en-us/sql/relational-databases/search/limit-search-results-with-rank?view=sql-server-ver16#ranking-of-freetexttable)
118 | - do re-ranking applying Reciprocal Rank Fusion (RRF) to combine the BM25 ranking with the cosine similarity ranking
119 | 
120 | Make sure to setup the database for this sample using the `./python/00-setup-database.sql` script. The database can be either an Azure SQL DB or a SQL Server database.
121 | 
122 | ## Conclusions
123 | 
124 | Azure SQL database, now has support to perform vector operations directly in the database, making it easy to perform vector similarity search. Using vector search along with fulltext search and BM25 ranking, it is possible to build powerful search engines that can be used in a variety of scenarios. 
125 | 
126 | > [!NOTE]  
127 | > Vector Functions are in Early Adopter Preview. Get access to the preview via https://aka.ms/azuresql-vector-eap-announcement
128 | 
129 | ## More resources
130 | 
131 | - [Azure SQL & AI](https://aka.ms/sql-ai)
132 | - [Azure SQL Vector Samples](https://github.com/Azure-Samples/azure-sql-db-vector-search)
133 | 


--------------------------------------------------------------------------------
/_assets/azure-sql-cosine-similarity-vector-type.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-sql-db-openai/a529ca04dd12d3ae2117be5fbc7c4877f6a4d393/_assets/azure-sql-cosine-similarity-vector-type.gif


--------------------------------------------------------------------------------
/_assets/cosine-similarity-search-result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-sql-db-openai/a529ca04dd12d3ae2117be5fbc7c4877f6a4d393/_assets/cosine-similarity-search-result.png


--------------------------------------------------------------------------------
/distance-calculations-in-tsql.md:
--------------------------------------------------------------------------------
 1 | # How to calculate common vectors distances in T-SQL
 2 | 
 3 | The sample data used to show how to calculate the common vector distances in T-SQL the following
 4 | 
 5 | ```sql
 6 | declare @v1 nvarchar(max) = '[1,3,-5]';
 7 | declare @v2 nvarchar(max) = '[4,-2,-1]';
 8 | 
 9 | drop table if exists #v1;
10 | select 
11 |     cast([key] as int) as [vector_value_id], 
12 |     cast([value] as float) as [vector_value]
13 | into 
14 |     #v1 
15 | from
16 |     openjson(@v1);
17 | 
18 | drop table if exists #v2;
19 | select 
20 |     cast([key] as int) as [vector_value_id], 
21 |     cast([value] as float) as [vector_value]
22 | into 
23 |     #v2
24 | from
25 |     openjson(@v2);
26 | ```
27 | 
28 | ## Cosine Distance
29 | 
30 | The cosine distance can be calculated as follows:
31 | 
32 | ```sql
33 | select
34 |     1-SUM(v1.[vector_value] * v2.[vector_value]) / 
35 |     (
36 |         SQRT(SUM(v1.[vector_value] * v1.[vector_value])) 
37 |         * 
38 |         SQRT(SUM(v2.[vector_value] * v2.[vector_value]))
39 |     ) as cosine_distance
40 | from
41 |     #v1 as v1
42 | inner join 
43 |     #v2 as v2 on v1.[vector_value_id] = v2.[vector_value_id]
44 | ```
45 | 
46 | ## Dot Product
47 | 
48 | The dot produce can be calculated as follows:
49 | 
50 | ```sql
51 | select
52 |     SUM(v1.[vector_value] * v2.[vector_value]) as dot_product
53 | from
54 |     #v1 as v1
55 | inner join 
56 |     #v2 as v2 on v1.[vector_value_id] = v2.[vector_value_id]
57 | ```
58 | 
59 | ## Euclidean Distance
60 | 
61 | The euclidean distance can be calculated as follows:
62 | 
63 | ```sql
64 | select
65 |     SQRT(SUM(POWER(v1.[vector_value] - v2.[vector_value], 2))) as euclidean_distance    
66 | from
67 |     #v1 as v1  
68 | inner join
69 |     #v2 as v2 on v1.[vector_value_id] = v2.[vector_value_id]
70 | ```
71 | 
72 | ## Manhattan Distance
73 | 
74 | The manhattan distance can be calculated as follows:
75 | 
76 | ```sql
77 | select
78 |     SUM(ABS(v1.[vector_value] - v2.[vector_value])) as manhattan_distance
79 | from
80 |     #v1 as v1  
81 | inner join
82 |     #v2 as v2 on v1.[vector_value_id] = v2.[vector_value_id]
83 | ```


--------------------------------------------------------------------------------
/python/.env.sample:
--------------------------------------------------------------------------------
1 | MSSQL='Driver={ODBC Driver 18 for SQL Server};Server=tcp:<server>.database.windows.net,1433;Database=<database>;LongAsMax=yes;Connection Timeout=30'
2 | 


--------------------------------------------------------------------------------
/python/00-setup-database.sql:
--------------------------------------------------------------------------------
 1 | drop table if exists dbo.hybrid_search_sample
 2 | go
 3 | 
 4 | create table dbo.hybrid_search_sample
 5 | (
 6 |     id int constraint pk__hybrid_search_sample primary key,
 7 |     content nvarchar(max),
 8 |     embedding vector(384)
 9 | )
10 | 
11 | if not exists(select * from sys.fulltext_catalogs where [name] = 'main_ft_catalog')
12 | begin
13 |     create fulltext catalog [main_ft_catalog] as default;
14 | end
15 | go
16 | 
17 | create fulltext index on dbo.hybrid_search_sample (content) key index pk__hybrid_search_sample;
18 | go
19 | 
20 | alter fulltext index on dbo.hybrid_search_sample enable; 
21 | go
22 | 


--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
 1 | # Hybrid Search 
 2 | 
 3 | This sample shows how to combine Fulltext search in Azure SQL database with BM25 ranking and cosine similarity ranking to do hybrid search.
 4 | 
 5 | In this sample the local model [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) to generate embeddings. The Python script `./hybrid_search.py` shows how to 
 6 | 
 7 | - use Python to generate the embeddings 
 8 | - do similarity search in Azure SQL database
 9 | - use [Fulltext search in Azure SQL database with BM25 ranking](https://learn.microsoft.com/en-us/sql/relational-databases/search/limit-search-results-with-rank?view=sql-server-ver16#ranking-of-freetexttable)
10 | - do re-ranking applying Reciprocal Rank Fusion (RRF) to combine the BM25 ranking with the cosine similarity ranking
11 | 
12 | Make sure to setup the database for this sample using the `./python/00-setup-database.sql` script. Database can be either an Azure SQL DB or a SQL Server database. Once the database has been created, you can run the `./python/hybrid_search.py` script to do the hybrid search:
13 | 
14 | First, set up the virtual environment and install the required packages:
15 | 
16 | ```bash
17 | python -m venv .venv
18 | ```
19 | 
20 | Activate the virtual environment and then install the required packages:
21 | 
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | Create an environment file `.env` with the connection string to Azure SQL database. You can use the `.env.sample` as a starting point. The sample `.env` file shows how to use Entra ID to connect to the database, which looks like:
27 | 
28 | ```text
29 | MSSQL='Driver={ODBC Driver 18 for SQL Server};Server=tcp:<server>,1433;Database=<database>;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30'
30 | ```
31 | 
32 | If you want to use SQL Authentication the connection string would instead look like the following:
33 | 
34 | ```
35 | MSSQL='Driver={ODBC Driver 18 for SQL Server};Server=tcp:<server>,1433;Database=<database>;UID=<user>;PWD=<password>;Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=30'
36 | ```
37 | 
38 | Then run the script:    
39 | 
40 | ```bash
41 | python hybrid_search.py
42 | ```


--------------------------------------------------------------------------------
/python/hybrid_search.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import pyodbc
  4 | import logging
  5 | import json
  6 | from sentence_transformers import SentenceTransformer
  7 | from dotenv import load_dotenv
  8 | from utilities import get_mssql_connection
  9 | 
 10 | load_dotenv()
 11 | 
 12 | if __name__ == '__main__':
 13 |     print('Initializing sample...')
 14 |     model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', tokenizer_kwargs={'clean_up_tokenization_spaces': True})
 15 | 
 16 |     print('Getting embeddings...')    
 17 |     sentences = [
 18 |         'The dog is barking',
 19 |         'The cat is purring',
 20 |         'The bear is growling',
 21 |         'A bear growling to a cat',
 22 |         'A cat purring to a dog',
 23 |         'A dog barking to a bear',
 24 |         'A bear growling to a dog',
 25 |         'A cat purring to a bear',
 26 |         'A wolf howling to a bear',
 27 |         'A bear growling to a wolf'
 28 |     ]
 29 |     embeddings = model.encode(sentences)
 30 | 
 31 |     conn = get_mssql_connection()
 32 | 
 33 |     print('Cleaning up the database...')
 34 |     try:
 35 |         cursor = conn.cursor()    
 36 |         cursor.execute("DELETE FROM dbo.hybrid_search_sample;")
 37 |         cursor.commit();        
 38 |     finally:
 39 |         cursor.close()
 40 | 
 41 |     print('Saving documents and embeddings in the database...')    
 42 |     try:
 43 |         cursor = conn.cursor()  
 44 |         
 45 |         for id, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
 46 |             cursor.execute(f"""
 47 |                 DECLARE @id INT = ?;
 48 |                 DECLARE @content NVARCHAR(MAX) = ?;
 49 |                 DECLARE @embedding VECTOR(384) = CAST(? AS VECTOR(384));
 50 |                 INSERT INTO dbo.hybrid_search_sample (id, content, embedding) VALUES (@id, @content, @embedding);
 51 |             """,
 52 |             id,
 53 |             sentence, 
 54 |             json.dumps(embedding.tolist())
 55 |             )
 56 | 
 57 |         cursor.commit()
 58 |     finally:
 59 |         cursor.close()
 60 | 
 61 |     print('Waiting a few seconds to let fulltext index sync...')    
 62 |     time.sleep(3)
 63 | 
 64 |     print('Searching for similar documents...')
 65 |     print('Getting embeddings...')    
 66 |     query = 'a growling bear'
 67 |     embedding = model.encode(query)    
 68 |     
 69 |     k = 5  
 70 |     print(f'Querying database for {k} similar sentenct to "{query}"...')  
 71 |     try:
 72 |         cursor = conn.cursor()  
 73 |         
 74 |         results  = cursor.execute(f"""
 75 |             DECLARE @k INT = ?;
 76 |             DECLARE @q NVARCHAR(4000) = ?;
 77 |             DECLARE @e VECTOR(384) = CAST(? AS VECTOR(384));
 78 |             WITH keyword_search AS (
 79 |                 SELECT TOP(@k)
 80 |                     id,
 81 |                     RANK() OVER (ORDER BY rank) AS rank,
 82 |                     content
 83 |                 FROM
 84 |                     (
 85 |                         SELECT TOP(@k)
 86 |                             sd.id,
 87 |                             ftt.[RANK] AS rank,
 88 |                             sd.content
 89 |                         FROM 
 90 |                             dbo.hybrid_search_sample AS sd
 91 |                         INNER JOIN 
 92 |                             FREETEXTTABLE(dbo.hybrid_search_sample, *, @q) AS ftt ON sd.id = ftt.[KEY]
 93 |                         ORDER BY
 94 |                             rank DESC
 95 |                     ) AS t
 96 |                 ORDER BY
 97 |                     rank
 98 |             ),
 99 |             semantic_search AS
100 |             (
101 |                 SELECT TOP(@k)
102 |                     id,
103 |                     RANK() OVER (ORDER BY distance) AS rank,
104 |                     content
105 |                 FROM
106 |                     (
107 |                         SELECT TOP(@k)
108 |                             id, 
109 |                             VECTOR_DISTANCE('cosine', embedding, @e) AS distance,
110 |                             content
111 |                         FROM 
112 |                             dbo.hybrid_search_sample
113 |                         ORDER BY
114 |                             distance
115 |                     ) AS t
116 |                 ORDER BY
117 |                     rank
118 |             )
119 |             SELECT TOP(@k)
120 |                 COALESCE(ss.id, ks.id) AS id,
121 |                 COALESCE(1.0 / (@k + ss.rank), 0.0) +
122 |                 COALESCE(1.0 / (@k + ks.rank), 0.0) AS score, -- Reciprocal Rank Fusion (RRF)
123 |                 COALESCE(ss.content, ks.content) AS content,
124 |                 ss.rank AS semantic_rank,
125 |                 ks.rank AS keyword_rank
126 |             FROM
127 |                 semantic_search ss
128 |             FULL OUTER JOIN
129 |                 keyword_search ks ON ss.id = ks.id
130 |             ORDER BY 
131 |                 score DESC
132 |             """,
133 |             k,
134 |             query, 
135 |             json.dumps(embedding.tolist()),        
136 |         )
137 | 
138 |         for (pos, row) in enumerate(results):
139 |             print(f'[{pos}] RRF score: {row[1]:0.4} (Semantic Rank: {row[3]}, Keyword Rank: {row[4]})\tDocument: "{row[2]}", Id: {row[0]}')
140 | 
141 |     finally:
142 |         cursor.close()
143 |     
144 |     print("Done.")


--------------------------------------------------------------------------------
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv
2 | pyodbc
3 | azure-identity
4 | sentence-transformers
5 | 


--------------------------------------------------------------------------------
/python/utilities.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pyodbc
 3 | import struct
 4 | import logging
 5 | from azure import identity
 6 | 
 7 | def get_mssql_connection():
 8 |     print('Getting MSSQL connection')
 9 |     mssql_connection_string = os.environ["MSSQL"]    
10 |     if any(s in mssql_connection_string.lower() for s in ["uid"]):
11 |         print(' - Using SQL Server authentication')
12 |         attrs_before = None
13 |     else:
14 |         print(' - Getting EntraID credentials...')    
15 |         mssql_connection_string = os.environ["MSSQL"]    
16 |         credential = identity.DefaultAzureCredential(exclude_interactive_browser_credential=False)    
17 |         token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("UTF-16-LE")    
18 |         token_struct = struct.pack(f'<I{len(token_bytes)}s', len(token_bytes), token_bytes)
19 |         SQL_COPT_SS_ACCESS_TOKEN = 1256  # This connection option is defined by microsoft in msodbcsql.h        
20 |         attrs_before = {SQL_COPT_SS_ACCESS_TOKEN: token_struct}
21 | 
22 |     print(' - Connecting to MSSQL...')    
23 |     conn = pyodbc.connect(mssql_connection_string, attrs_before=attrs_before)
24 | 
25 |     return conn


--------------------------------------------------------------------------------
/vector-embeddings/00-setup-blob-access.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | 	Cleanup if needed
 3 | */
 4 | if not exists(select * from sys.symmetric_keys where [name] = '##MS_DatabaseMasterKey##')
 5 | begin
 6 | 	create master key encryption by password = 'Pa$$w0rd!'
 7 | end
 8 | go
 9 | if exists(select * from sys.[external_data_sources] where name = 'openai_playground')
10 | begin
11 | 	drop external data source [openai_playground];
12 | end
13 | go
14 | if exists(select * from sys.[database_scoped_credentials] where name = 'openai_playground')
15 | begin
16 | 	drop database scoped credential [openai_playground];
17 | end
18 | go
19 | 
20 | /*
21 | 	Create database scoped credential and external data source.
22 | 	File is assumed to be in a path like: 
23 | 	https://<myaccount>.blob.core.windows.net/playground/wikipedia/vector_database_wikipedia_articles_embedded.csv
24 | 
25 | 	Please note that it is recommened to avoid using SAS tokens: the best practice is to use Managed Identity as described here:
26 | 	https://learn.microsoft.com/en-us/sql/relational-databases/import-export/import-bulk-data-by-using-bulk-insert-or-openrowset-bulk-sql-server?view=sql-server-ver16#bulk-importing-from-azure-blob-storage
27 | 
28 | */
29 | create database scoped credential [openai_playground]
30 | with identity = 'SHARED ACCESS SIGNATURE',
31 | secret = '<sas-token>'; -- make sure not to include the ? at the beginning
32 | go
33 | create external data source [openai_playground]
34 | with 
35 | ( 
36 | 	type = blob_storage,
37 |  	location = 'https://<account>.blob.core.windows.net/playground',
38 |  	credential = [openai_playground]
39 | );
40 | go
41 | 


--------------------------------------------------------------------------------
/vector-embeddings/01-import-wikipedia.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | 	Create table
 3 | */
 4 | drop table if exists [dbo].[wikipedia_articles_embeddings];
 5 | create table [dbo].[wikipedia_articles_embeddings]
 6 | (
 7 | 	[id] [int] not null,
 8 | 	[url] [varchar](1000) not null,
 9 | 	[title] [varchar](1000) not null,
10 | 	[text] [varchar](max) not null,
11 | 	[title_vector] [varchar](max) not null,
12 | 	[content_vector] [varchar](max) not null,
13 | 	[vector_id] [int] not null
14 | )
15 | go
16 | 
17 | /*
18 | 	Import data
19 | */
20 | bulk insert dbo.[wikipedia_articles_embeddings]
21 | from 'wikipedia/vector_database_wikipedia_articles_embedded.csv'
22 | with (
23 | 	data_source = 'openai_playground',
24 |     format = 'csv',
25 |     firstrow = 2,
26 |     codepage = '65001',
27 | 	fieldterminator = ',',
28 | 	rowterminator = '0x0a',
29 |     fieldquote = '"',
30 |     batchsize = 1000,
31 |     tablock
32 | )
33 | go
34 | 
35 | /*
36 | 	Add primary key
37 | */
38 | alter table [dbo].[wikipedia_articles_embeddings]
39 | add constraint pk__wikipedia_articles_embeddings primary key clustered (id)
40 | go
41 | 
42 | /*
43 | 	Add index on title
44 | */
45 | create index [ix_title] on [dbo].[wikipedia_articles_embeddings](title)
46 | go
47 | 
48 | /*
49 | 	Verify data
50 | */
51 | select top (10) * from [dbo].[wikipedia_articles_embeddings]
52 | go
53 | 
54 | select * from [dbo].[wikipedia_articles_embeddings] where title = 'Alan Turing'
55 | go
56 | 
57 | 
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/vector-embeddings/02-use-native-vectors.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Add columns to store the native vectors
 3 | */
 4 | alter table wikipedia_articles_embeddings
 5 | add title_vector_ada2 vector(1536);
 6 | 
 7 | alter table wikipedia_articles_embeddings
 8 | add content_vector_ada2 vector(1536);
 9 | go
10 | 
11 | /*
12 |     Update the native vectors
13 | */
14 | update 
15 |     wikipedia_articles_embeddings
16 | set 
17 |     title_vector_ada2 = cast(title_vector as vector(1536)),
18 |     content_vector_ada2 = cast(content_vector as vector(1536))
19 | go
20 | 
21 | /*
22 |     Remove old columns
23 | */
24 | alter table wikipedia_articles_embeddings
25 | drop column title_vector;
26 | go
27 | 
28 | alter table wikipedia_articles_embeddings
29 | drop column content_vector;
30 | go
31 | 
32 | /*
33 | 	Verify data
34 | */
35 | select top (10) * from [dbo].[wikipedia_articles_embeddings]
36 | go
37 | 
38 | select * from [dbo].[wikipedia_articles_embeddings] where title = 'Alan Turing'
39 | go
40 | 


--------------------------------------------------------------------------------
/vector-embeddings/03-store-openai-credentials.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Create database credentials to store API key
 3 | */
 4 | if exists(select * from sys.[database_scoped_credentials] where name = 'https://<your-app-name>.openai.azure.com')
 5 | begin
 6 | 	drop database scoped credential [https://<your-app-name>.openai.azure.com];
 7 | end
 8 | create database scoped credential [https://<your-app-name>.openai.azure.com]
 9 | with identity = 'HTTPEndpointHeaders', secret = '{"api-key": "<api-key>"}';
10 | go


--------------------------------------------------------------------------------
/vector-embeddings/04-create-get-embeddings-procedure.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Get the embeddings for the input text by calling the OpenAI API
 3 | */
 4 | create or alter procedure dbo.get_embedding
 5 | @deployedModelName nvarchar(1000),
 6 | @inputText nvarchar(max),
 7 | @embedding vector(1536) output
 8 | as
 9 | declare @retval int, @response nvarchar(max);
10 | declare @payload nvarchar(max) = json_object('input': @inputText);
11 | declare @url nvarchar(1000) = 'https://<your-api-name>.openai.azure.com/openai/deployments/' + @deployedModelName + '/embeddings?api-version=2024-08-01-preview'
12 | exec @retval = sp_invoke_external_rest_endpoint
13 |     @url = @url,
14 |     @method = 'POST',
15 |     @credential = [https://<your-app-name>],
16 |     @payload = @payload,
17 |     @response = @response output;
18 | 
19 | declare @re nvarchar(max) = null;
20 | if (@retval = 0) begin
21 |     set @re = json_query(@response, '$.result.data[0].embedding')
22 | end else begin
23 |     select @response as 'Error message from OpenAI API';
24 | end
25 | 
26 | set @embedding = cast(@re as vector(1536));
27 | 
28 | return @retval
29 | go
30 | 
31 | 


--------------------------------------------------------------------------------
/vector-embeddings/05-find-similar-articles.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Get the embeddings for the input text by calling the OpenAI API
 3 |     and then search the most similar articles (by title)
 4 |     Note: <deployment-id> needs to be replaced with the deployment name of your embedding model in Azure OpenAI
 5 | */
 6 | 
 7 | declare @inputText nvarchar(max) = 'the foundation series by isaac asimov';
 8 | declare @retval int, @embedding vector(1536);
 9 | 
10 | exec @retval = dbo.get_embedding '<deployment-id>', @inputText, @embedding output;
11 | 
12 | select top(10)
13 |     a.id,
14 |     a.title,
15 |     a.url,
16 |     vector_distance('cosine', @embedding, title_vector_ada2) cosine_distance
17 | from
18 |     dbo.wikipedia_articles_embeddings a
19 | order by
20 |     cosine_distance;
21 | go
22 | 
23 | 


--------------------------------------------------------------------------------
/vector-embeddings/06-hybrid-search-setup.sql:
--------------------------------------------------------------------------------
 1 | if not exists(select * from sys.fulltext_catalogs where [name] = 'FullTextCatalog')
 2 | begin
 3 |     create fulltext catalog [FullTextCatalog] as default;
 4 | end
 5 | go
 6 | 
 7 | create fulltext index on dbo.wikipedia_articles_embeddings ([text]) key index pk__wikipedia_articles_embeddings;
 8 | go
 9 | 
10 | alter fulltext index on dbo.wikipedia_articles_embeddings enable; 
11 | go
12 | 
13 | select * from sys.fulltext_catalogs
14 | go
15 | 


--------------------------------------------------------------------------------
/vector-embeddings/07-hybrid-search.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Get the embeddings for the input text by calling the OpenAI API
 3 |     and then search the most similar articles (by title)
 4 |     Note: <deployment-id> needs to be replaced with the deployment name of your embedding model in Azure OpenAI
 5 | */
 6 | 
 7 | DECLARE @q NVARCHAR(1000) = 'the foundation series by isaac asimov';
 8 | DECLARE @k INT = 10
 9 | 
10 | DECLARE @r INT, @e VECTOR(1536);
11 | 
12 | EXEC @r = dbo.get_embedding '<deployment-id>', @q, @e OUTPUT;
13 | IF (@r != 0) SELECT @r;
14 | 
15 | WITH keyword_search AS (
16 |     SELECT TOP(@k)
17 |         id, 
18 |         RANK() OVER (ORDER BY ft_rank DESC) AS rank,
19 |         title,
20 |         [text]
21 |     FROM
22 |         (
23 |             SELECT TOP(@k)
24 |                 id,
25 |                 ftt.[RANK] AS ft_rank,
26 |                 title,
27 |                 [text]
28 |             FROM 
29 |                 dbo.wikipedia_articles_embeddings w
30 |             INNER JOIN 
31 |                 FREETEXTTABLE(dbo.wikipedia_articles_embeddings, *, @q) AS ftt ON w.id = ftt.[KEY]
32 |             ORDER BY
33 |                 ft_rank DESC
34 |         ) AS freetext_documents
35 |     ORDER BY
36 |         rank ASC
37 | ),
38 | semantic_search AS
39 | (
40 |     SELECT TOP(@k)
41 |         id, 
42 |         RANK() OVER (ORDER BY cosine_distance) AS rank
43 |     FROM
44 |         (
45 |             SELECT TOP(@k)
46 |                 id, 
47 |                 VECTOR_DISTANCE('cosine', @e, content_vector_ada2) AS cosine_distance
48 |             FROM 
49 |                 dbo.wikipedia_articles_embeddings w
50 |             ORDER BY
51 |                 cosine_distance
52 |         ) AS similar_documents
53 | ),
54 | result AS (
55 |     SELECT TOP(@k)
56 |         COALESCE(ss.id, ks.id) AS id,
57 |         ss.rank AS semantic_rank,
58 |         ks.rank AS keyword_rank,
59 |         COALESCE(1.0 / (@k + ss.rank), 0.0) +
60 |         COALESCE(1.0 / (@k + ks.rank), 0.0) AS score -- Reciprocal Rank Fusion (RRF) 
61 |     FROM
62 |         semantic_search ss
63 |     FULL OUTER JOIN
64 |         keyword_search ks ON ss.id = ks.id
65 |     ORDER BY 
66 |         score DESC
67 | )   
68 | SELECT
69 |     w.id,
70 |     cast(score * 1000 as int) as rrf_score,
71 |     rank() OVER(ORDER BY cast(score * 1000 AS INT) DESC) AS rrf_rank,
72 |     semantic_rank,
73 |     keyword_rank,
74 |     w.title,
75 |     w.[text]
76 | FROM
77 |     result AS r
78 | INNER JOIN
79 |     dbo.wikipedia_articles_embeddings AS w ON r.id = w.id
80 | ORDER BY
81 |     rrf_rank


--------------------------------------------------------------------------------