├── .ddev ├── config.yaml └── post-start.sh ├── .gitignore ├── LICENSE ├── README.md ├── composer.json ├── composer.lock ├── sql └── test-queries.sql ├── src ├── Nlp │ ├── AddedToken.php │ ├── BertNormalizer.php │ ├── BertPreTokenizer.php │ ├── BertTokenizer.php │ ├── Decoder.php │ ├── Embedder.php │ ├── PostProcessor.php │ ├── PreTokenizer.php │ ├── TemplateProcessing.php │ ├── TextNormalizer.php │ ├── TokenizerModel.php │ ├── WordPieceDecoder.php │ ├── WordpieceTokenizer.php │ ├── model_quantized.onnx │ ├── tokenizer.json │ └── tokenizer_config.json └── VectorTable.php └── tests ├── Nlp ├── BertNormalizerTest.php ├── BertPreTokenizerTest.php ├── BertTokenizerTest.php ├── EmbedderTest.php ├── TokenizerModelTest.php ├── WordPieceDecoderTest.php └── WordpieceTokenizerTest.php ├── PerformanceBenchmarkTest.php └── VectorTableTest.php /.ddev/config.yaml: -------------------------------------------------------------------------------- 1 | name: mysql-vector 2 | type: php 3 | docroot: "" 4 | php_version: "8.4" 5 | webserver_type: nginx-fpm 6 | xdebug_enabled: false 7 | additional_hostnames: [] 8 | additional_fqdns: [] 9 | database: 10 | type: mysql 11 | version: "8.0" 12 | use_dns_when_possible: true 13 | composer_version: "2" 14 | web_environment: [] 15 | corepack_enable: false 16 | hooks: 17 | post-start: 18 | - exec: "/bin/bash /var/www/html/.ddev/post-start.sh" 19 | 20 | # Key features of DDEV's config.yaml: 21 | 22 | # name: # Name of the project, automatically provides 23 | # http://projectname.ddev.site and https://projectname.ddev.site 24 | 25 | # type: # backdrop, craftcms, django4, drupal, drupal6, drupal7, laravel, magento, magento2, php, python, shopware6, silverstripe, typo3, wordpress 26 | # See https://ddev.readthedocs.io/en/stable/users/quickstart/ for more 27 | # information on the different project types 28 | # "drupal" covers recent Drupal 8+ 29 | 30 | # docroot: # Relative path to the directory containing index.php. 
31 | 32 | # php_version: "8.2" # PHP version to use, "5.6", "7.0", "7.1", "7.2", "7.3", "7.4", "8.0", "8.1", "8.2", "8.3" 33 | 34 | # You can explicitly specify the webimage but this 35 | # is not recommended, as the images are often closely tied to DDEV's' behavior, 36 | # so this can break upgrades. 37 | 38 | # webimage: # nginx/php docker image. 39 | 40 | # database: 41 | # type: # mysql, mariadb, postgres 42 | # version: # database version, like "10.11" or "8.0" 43 | # MariaDB versions can be 5.5-10.8, 10.11, and 11.4. 44 | # MySQL versions can be 5.5-8.0. 45 | # PostgreSQL versions can be 9-16. 46 | 47 | # router_http_port: # Port to be used for http (defaults to global configuration, usually 80) 48 | # router_https_port: # Port for https (defaults to global configuration, usually 443) 49 | 50 | # xdebug_enabled: false # Set to true to enable Xdebug and "ddev start" or "ddev restart" 51 | # Note that for most people the commands 52 | # "ddev xdebug" to enable Xdebug and "ddev xdebug off" to disable it work better, 53 | # as leaving Xdebug enabled all the time is a big performance hit. 54 | 55 | # xhprof_enabled: false # Set to true to enable Xhprof and "ddev start" or "ddev restart" 56 | # Note that for most people the commands 57 | # "ddev xhprof" to enable Xhprof and "ddev xhprof off" to disable it work better, 58 | # as leaving Xhprof enabled all the time is a big performance hit. 59 | 60 | # webserver_type: nginx-fpm, apache-fpm, or nginx-gunicorn 61 | 62 | # timezone: Europe/Berlin 63 | # This is the timezone used in the containers and by PHP; 64 | # it can be set to any valid timezone, 65 | # see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 66 | # For example Europe/Dublin or MST7MDT 67 | 68 | # composer_root: 69 | # Relative path to the Composer root directory from the project root. This is 70 | # the directory which contains the composer.json and where all Composer related 71 | # commands are executed. 
72 | 73 | # composer_version: "2" 74 | # You can set it to "" or "2" (default) for Composer v2 or "1" for Composer v1 75 | # to use the latest major version available at the time your container is built. 76 | # It is also possible to use each other Composer version channel. This includes: 77 | # - 2.2 (latest Composer LTS version) 78 | # - stable 79 | # - preview 80 | # - snapshot 81 | # Alternatively, an explicit Composer version may be specified, for example "2.2.18". 82 | # To reinstall Composer after the image was built, run "ddev debug refresh". 83 | 84 | # nodejs_version: "20" 85 | # change from the default system Node.js version to any other version. 86 | # Numeric version numbers can be complete (i.e. 18.15.0) or 87 | # incomplete (18, 17.2, 16). 'lts' and 'latest' can be used as well along with 88 | # other named releases. 89 | # see https://www.npmjs.com/package/n#specifying-nodejs-versions 90 | # Note that you can continue using 'ddev nvm' or nvm inside the web container 91 | # to change the project's installed node version if you need to. 92 | 93 | # corepack_enable: false 94 | # Change to 'true' to 'corepack enable' and gain access to latest versions of yarn/pnpm 95 | 96 | # additional_hostnames: 97 | # - somename 98 | # - someothername 99 | # would provide http and https URLs for "somename.ddev.site" 100 | # and "someothername.ddev.site". 101 | 102 | # additional_fqdns: 103 | # - example.com 104 | # - sub1.example.com 105 | # would provide http and https URLs for "example.com" and "sub1.example.com" 106 | # Please take care with this because it can cause great confusion. 107 | 108 | # upload_dirs: "custom/upload/dir" 109 | # 110 | # upload_dirs: 111 | # - custom/upload/dir 112 | # - ../private 113 | # 114 | # would set the destination paths for ddev import-files to /custom/upload/dir 115 | # When Mutagen is enabled this path is bind-mounted so that all the files 116 | # in the upload_dirs don't have to be synced into Mutagen. 
117 | 118 | # disable_upload_dirs_warning: false 119 | # If true, turns off the normal warning that says 120 | # "You have Mutagen enabled and your 'php' project type doesn't have upload_dirs set" 121 | 122 | # ddev_version_constraint: "" 123 | # Example: 124 | # ddev_version_constraint: ">= 1.22.4" 125 | # This will enforce that the running ddev version is within this constraint. 126 | # See https://github.com/Masterminds/semver#checking-version-constraints for 127 | # supported constraint formats 128 | 129 | # working_dir: 130 | # web: /var/www/html 131 | # db: /home 132 | # would set the default working directory for the web and db services. 133 | # These values specify the destination directory for ddev ssh and the 134 | # directory in which commands passed into ddev exec are run. 135 | 136 | # omit_containers: [db, ddev-ssh-agent] 137 | # Currently only these containers are supported. Some containers can also be 138 | # omitted globally in the ~/.ddev/global_config.yaml. Note that if you omit 139 | # the "db" container, several standard features of DDEV that access the 140 | # database container will be unusable. In the global configuration it is also 141 | # possible to omit ddev-router, but not here. 142 | 143 | # performance_mode: "global" 144 | # DDEV offers performance optimization strategies to improve the filesystem 145 | # performance depending on your host system. Should be configured globally. 146 | # 147 | # If set, will override the global config. Possible values are: 148 | # - "global": uses the value from the global config. 149 | # - "none": disables performance optimization for this project. 150 | # - "mutagen": enables Mutagen for this project. 151 | # - "nfs": enables NFS for this project. 
152 | # 153 | # See https://ddev.readthedocs.io/en/stable/users/install/performance/#nfs 154 | # See https://ddev.readthedocs.io/en/stable/users/install/performance/#mutagen 155 | 156 | # fail_on_hook_fail: False 157 | # Decide whether 'ddev start' should be interrupted by a failing hook 158 | 159 | # host_https_port: "59002" 160 | # The host port binding for https can be explicitly specified. It is 161 | # dynamic unless otherwise specified. 162 | # This is not used by most people, most people use the *router* instead 163 | # of the localhost port. 164 | 165 | # host_webserver_port: "59001" 166 | # The host port binding for the ddev-webserver can be explicitly specified. It is 167 | # dynamic unless otherwise specified. 168 | # This is not used by most people, most people use the *router* instead 169 | # of the localhost port. 170 | 171 | # host_db_port: "59002" 172 | # The host port binding for the ddev-dbserver can be explicitly specified. It is dynamic 173 | # unless explicitly specified. 174 | 175 | # mailpit_http_port: "8025" 176 | # mailpit_https_port: "8026" 177 | # The Mailpit ports can be changed from the default 8025 and 8026 178 | 179 | # host_mailpit_port: "8025" 180 | # The mailpit port is not normally bound on the host at all, instead being routed 181 | # through ddev-router, but it can be bound directly to localhost if specified here. 
182 | 183 | # webimage_extra_packages: [php7.4-tidy, php-bcmath] 184 | # Extra Debian packages that are needed in the webimage can be added here 185 | 186 | # dbimage_extra_packages: [telnet,netcat] 187 | # Extra Debian packages that are needed in the dbimage can be added here 188 | 189 | # use_dns_when_possible: true 190 | # If the host has internet access and the domain configured can 191 | # successfully be looked up, DNS will be used for hostname resolution 192 | # instead of editing /etc/hosts 193 | # Defaults to true 194 | 195 | # project_tld: ddev.site 196 | # The top-level domain used for project URLs 197 | # The default "ddev.site" allows DNS lookup via a wildcard 198 | # If you prefer you can change this to "ddev.local" to preserve 199 | # pre-v1.9 behavior. 200 | 201 | # ngrok_args: --basic-auth username:pass1234 202 | # Provide extra flags to the "ngrok http" command, see 203 | # https://ngrok.com/docs/ngrok-agent/config or run "ngrok http -h" 204 | 205 | # disable_settings_management: false 206 | # If true, DDEV will not create CMS-specific settings files like 207 | # Drupal's settings.php/settings.ddev.php or TYPO3's additional.php 208 | # In this case the user must provide all such settings. 209 | 210 | # You can inject environment variables into the web container with: 211 | # web_environment: 212 | # - SOMEENV=somevalue 213 | # - SOMEOTHERENV=someothervalue 214 | 215 | # no_project_mount: false 216 | # (Experimental) If true, DDEV will not mount the project into the web container; 217 | # the user is responsible for mounting it manually or via a script. 218 | # This is to enable experimentation with alternate file mounting strategies. 219 | # For advanced users only! 220 | 221 | # bind_all_interfaces: false 222 | # If true, host ports will be bound on all network interfaces, 223 | # not the localhost interface only. This means that ports 224 | # will be available on the local network if the host firewall 225 | # allows it. 
226 | 227 | # default_container_timeout: 120 228 | # The default time that DDEV waits for all containers to become ready can be increased from 229 | # the default 120. This helps in importing huge databases, for example. 230 | 231 | #web_extra_exposed_ports: 232 | #- name: nodejs 233 | # container_port: 3000 234 | # http_port: 2999 235 | # https_port: 3000 236 | #- name: something 237 | # container_port: 4000 238 | # https_port: 4000 239 | # http_port: 3999 240 | # Allows a set of extra ports to be exposed via ddev-router 241 | # Fill in all three fields even if you don’t intend to use the https_port! 242 | # If you don’t add https_port, then it defaults to 0 and ddev-router will fail to start. 243 | # 244 | # The port behavior on the ddev-webserver must be arranged separately, for example 245 | # using web_extra_daemons. 246 | # For example, with a web app on port 3000 inside the container, this config would 247 | # expose that web app on https://.ddev.site:9999 and http://.ddev.site:9998 248 | # web_extra_exposed_ports: 249 | # - name: myapp 250 | # container_port: 3000 251 | # http_port: 9998 252 | # https_port: 9999 253 | 254 | #web_extra_daemons: 255 | #- name: "http-1" 256 | # command: "/var/www/html/node_modules/.bin/http-server -p 3000" 257 | # directory: /var/www/html 258 | #- name: "http-2" 259 | # command: "/var/www/html/node_modules/.bin/http-server /var/www/html/sub -p 3000" 260 | # directory: /var/www/html 261 | 262 | # override_config: false 263 | # By default, config.*.yaml files are *merged* into the configuration 264 | # But this means that some things can't be overridden 265 | # For example, if you have 'use_dns_when_possible: true'' you can't override it with a merge 266 | # and you can't erase existing hooks or all environment variables. 
267 | # However, with "override_config: true" in a particular config.*.yaml file, 268 | # 'use_dns_when_possible: false' can override the existing values, and 269 | # hooks: 270 | # post-start: [] 271 | # or 272 | # web_environment: [] 273 | # or 274 | # additional_hostnames: [] 275 | # can have their intended affect. 'override_config' affects only behavior of the 276 | # config.*.yaml file it exists in. 277 | 278 | # Many DDEV commands can be extended to run tasks before or after the 279 | # DDEV command is executed, for example "post-start", "post-import-db", 280 | # "pre-composer", "post-composer" 281 | # See https://ddev.readthedocs.io/en/stable/users/extend/custom-commands/ for more 282 | # information on the commands that can be extended and the tasks you can define 283 | # for them. Example: 284 | #hooks: 285 | -------------------------------------------------------------------------------- /.ddev/post-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Create the test database 3 | mysql -uroot -proot -e " 4 | CREATE DATABASE IF NOT EXISTS db; 5 | GRANT ALL PRIVILEGES ON db.* TO 'db'@'%'; 6 | FLUSH PRIVILEGES; 7 | " -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/composer,macos,phpstorm 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=composer,macos,phpstorm 3 | 4 | .idea 5 | .phpunit.result.cache 6 | 7 | ### Composer ### 8 | composer.phar 9 | /vendor/ 10 | 11 | # Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control 12 | # You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file 13 | # composer.lock 14 | 15 | ### macOS ### 16 | # General 17 | .DS_Store 18 | .AppleDouble 19 
| .LSOverride 20 | 21 | # Icon must end with two \r 22 | Icon 23 | 24 | 25 | # Thumbnails 26 | ._* 27 | 28 | # Files that might appear in the root of a volume 29 | .DocumentRevisions-V100 30 | .fseventsd 31 | .Spotlight-V100 32 | .TemporaryItems 33 | .Trashes 34 | .VolumeIcon.icns 35 | .com.apple.timemachine.donotpresent 36 | 37 | # Directories potentially created on remote AFP share 38 | .AppleDB 39 | .AppleDesktop 40 | Network Trash Folder 41 | Temporary Items 42 | .apdisk 43 | 44 | ### macOS Patch ### 45 | # iCloud generated files 46 | *.icloud 47 | 48 | ### PhpStorm ### 49 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 50 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 51 | 52 | # User-specific stuff 53 | .idea/**/workspace.xml 54 | .idea/**/tasks.xml 55 | .idea/**/usage.statistics.xml 56 | .idea/**/dictionaries 57 | .idea/**/shelf 58 | 59 | # AWS User-specific 60 | .idea/**/aws.xml 61 | 62 | # Generated files 63 | .idea/**/contentModel.xml 64 | 65 | # Sensitive or high-churn files 66 | .idea/**/dataSources/ 67 | .idea/**/dataSources.ids 68 | .idea/**/dataSources.local.xml 69 | .idea/**/sqlDataSources.xml 70 | .idea/**/dynamic.xml 71 | .idea/**/uiDesigner.xml 72 | .idea/**/dbnavigator.xml 73 | 74 | # Gradle 75 | .idea/**/gradle.xml 76 | .idea/**/libraries 77 | 78 | # Gradle and Maven with auto-import 79 | # When using Gradle or Maven with auto-import, you should exclude module files, 80 | # since they will be recreated, and may cause churn. Uncomment if using 81 | # auto-import. 
82 | # .idea/artifacts 83 | # .idea/compiler.xml 84 | # .idea/jarRepositories.xml 85 | # .idea/modules.xml 86 | # .idea/*.iml 87 | # .idea/modules 88 | # *.iml 89 | # *.ipr 90 | 91 | # CMake 92 | cmake-build-*/ 93 | 94 | # Mongo Explorer plugin 95 | .idea/**/mongoSettings.xml 96 | 97 | # File-based project format 98 | *.iws 99 | 100 | # IntelliJ 101 | out/ 102 | 103 | # mpeltonen/sbt-idea plugin 104 | .idea_modules/ 105 | 106 | # JIRA plugin 107 | atlassian-ide-plugin.xml 108 | 109 | # Cursive Clojure plugin 110 | .idea/replstate.xml 111 | 112 | # SonarLint plugin 113 | .idea/sonarlint/ 114 | 115 | # Crashlytics plugin (for Android Studio and IntelliJ) 116 | com_crashlytics_export_strings.xml 117 | crashlytics.properties 118 | crashlytics-build.properties 119 | fabric.properties 120 | 121 | # Editor-based Rest Client 122 | .idea/httpRequests 123 | 124 | # Android studio 3.1+ serialized cache file 125 | .idea/caches/build_file_checksums.ser 126 | 127 | ### PhpStorm Patch ### 128 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 129 | 130 | # *.iml 131 | # modules.xml 132 | # .idea/misc.xml 133 | # *.ipr 134 | 135 | # Sonarlint plugin 136 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 137 | .idea/**/sonarlint/ 138 | 139 | # SonarQube Plugin 140 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 141 | .idea/**/sonarIssues.xml 142 | 143 | # Markdown Navigator plugin 144 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 145 | .idea/**/markdown-navigator.xml 146 | .idea/**/markdown-navigator-enh.xml 147 | .idea/**/markdown-navigator/ 148 | 149 | # Cache file creation bug 150 | # See https://youtrack.jetbrains.com/issue/JBR-2257 151 | .idea/$CACHE_FILE$ 152 | 153 | # CodeStream plugin 154 | # https://plugins.jetbrains.com/plugin/12206-codestream 155 | .idea/codestream.xml 156 | 157 | # Azure Toolkit for IntelliJ plugin 158 | # 
https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij 159 | .idea/**/azureSettings.xml 160 | 161 | # End of https://www.toptal.com/developers/gitignore/api/composer,macos,phpstorm -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Allan Pichardo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Library for MySQL Vector Operations and Text Embeddings 2 | 3 | ## Overview 4 | The `VectorTable` class is a PHP implementation designed to facilitate the storage, retrieval, and comparison of high-dimensional vectors in a MySQL database. 
This class utilizes MySQL JSON data types and a custom cosine similarity function (`COSIM`) to perform vector comparisons efficiently. 5 | 6 | ### Search Performance 7 | Vectors are binary quantized upon insertion into the database to optimize search speed and reranked to improve accuracy. 8 | However, this library is only suitable for small datasets (fewer than 1,000,000 vectors). For large datasets, it is recommended that you use a dedicated vector database such as [Qdrant](https://qdrant.tech/). 9 | 10 | Search Benchmarks (384-dimensional vectors): 11 | Vectors | Time (seconds) 12 | --------|--------------- 13 | 100 | 0.02 14 | 1000 | 0.02 15 | 10000 | 0.03 16 | 100000 | 0.06 17 | 1000000 | 0.48 18 | 19 | ## Features 20 | - Store vectors in a MySQL database using JSON data types. 21 | - Calculate cosine similarity between vectors using a custom MySQL function. 22 | - Normalize vectors and handle vector operations such as insertion, deletion, and searching. 23 | - Support for vector quantization for optimized search operations. 24 | - Native PHP support for generating text embeddings using the [BGE embedding model](https://huggingface.co/BAAI/bge-base-en-v1.5). 25 | 26 | ## Requirements 27 | - PHP 8.0 or higher. 28 | - MySQL 5.7 or higher with support for JSON data types and stored functions. 29 | - The MySQLi extension for PHP. 30 | 31 | ## Installation 32 | 1. Ensure that PHP and MySQL are installed and properly configured on your system. 33 | 2. Install the library using [Composer](https://getcomposer.org/). 34 | 35 | ```bash 36 | composer require allanpichardo/mysql-vector 37 | ``` 38 | 39 | ## Usage 40 | 41 | ### Initializing the Vector Table 42 | Import the `VectorTable` class and create a new instance using the MySQLi connection, table name, and vector dimension. 
43 | ```php 44 | use MHz\MysqlVector\VectorTable; 45 | 46 | 47 | $mysqli = new mysqli("hostname", "username", "password", "database"); 48 | $tableName = "my_vector_table"; 49 | $dimension = 384; 50 | $engine = 'InnoDB'; 51 | 52 | $vectorTable = new VectorTable($mysqli, $tableName, $dimension, $engine); 53 | ``` 54 | 55 | ### Setting Up the Vector Table in MySQL 56 | The `initialize` method will create the vector table in MySQL if it does not already exist. This method will also create the `COSIM` function in MySQL if it does not already exist. 57 | ```php 58 | $vectorTable->initialize(); 59 | ``` 60 | 61 | ### Inserting and Managing Vectors 62 | ```php 63 | // Insert a new vector 64 | $vector = [0.1, 0.2, 0.3, ..., 0.384]; 65 | $vectorId = $vectorTable->upsert($vector); 66 | 67 | // Update an existing vector 68 | $vectorTable->upsert($vector, $vectorId); 69 | 70 | // Delete a vector 71 | $vectorTable->delete($vectorId); 72 | ``` 73 | 74 | ### Calculating Cosine Similarity 75 | ```php 76 | // Calculate cosine similarity between two vectors 77 | $similarity = $vectorTable->cosim($vector1, $vector2); 78 | ``` 79 | 80 | ### Searching for Similar Vectors 81 | Perform a search for vectors similar to a given vector using the cosine similarity criterion. The `topN` parameter specifies the maximum number of similar vectors to return. 82 | ```php 83 | // Find vectors similar to a given vector 84 | $similarVectors = $vectorTable->search($vector, $topN); 85 | ``` 86 | 87 | ## Text Embeddings 88 | The `Embedder` class calculates 384-dimensional text embeddings using the [BGE embedding model](https://huggingface.co/BAAI/bge-base-en-v1.5). The first time you instantiate the `Embedder` class, the ONNX runtime will be installed automatically. 89 | The maximum length of the input text is 512 characters. The `Embedder` class will automatically truncate any longer input text to 512 characters. 
90 | 91 | ```php 92 | use MHz\MysqlVector\Nlp\Embedder; 93 | 94 | $embedder = new Embedder(); 95 | 96 | // Calculate the embeddings for a batch of text 97 | $texts = ["Hello world!", "This is a test."]; 98 | $embeddings = $embedder->embed($texts); 99 | 100 | print_r($embeddings[0][0]); // [0.1, 0.2, 0.3, ..., 0.384] 101 | print_r($embeddings[1][0]); // [0.1, 0.2, 0.3, ..., 0.384] 102 | ``` 103 | 104 | ## Contributions 105 | Contributions to this project are welcome. Please ensure that your code adheres to the existing coding standards and includes appropriate tests. 106 | 107 | ## Development 108 | This project uses DDEV, a Docker-based development environment. To get started, install DDEV and run the following commands: 109 | 110 | ```bash 111 | ddev start 112 | ddev composer install 113 | ``` 114 | 115 | To run the tests, use the following command: 116 | 117 | ```bash 118 | ddev composer test 119 | ``` 120 | 121 | ## License 122 | MIT License -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "allanpichardo/mysql-vector", 3 | "description": "Perform vector operations natively on MySQL", 4 | "type": "library", 5 | "license": "MIT", 6 | "version": "2.0.4", 7 | "authors": [ 8 | { 9 | "name": "Allan Pichardo", 10 | "email": "allan.pichardo@gmail.com", 11 | "homepage": "https://allanpichardo.com" 12 | } 13 | ], 14 | "require": { 15 | "php": ">=8.0", 16 | "ext-mysqli": "*", 17 | "bdelespierre/php-kmeans": "^2.2", 18 | "ext-mbstring": "*", 19 | "ext-intl": "*", 20 | "symfony/polyfill-intl-normalizer": "^1.28", 21 | "symfony/polyfill-mbstring": "^1.28", 22 | "ext-ctype": "*", 23 | "symfony/polyfill-ctype": "^1.28", 24 | "ext-iconv": "*", 25 | "symfony/polyfill-iconv": "^1.28", 26 | "ankane/onnxruntime": "^0.2.1" 27 | }, 28 | "replace": { 29 | "symfony/polyfill-intl-normalizer": "^1.28", 30 | "symfony/polyfill-mbstring": "^1.28", 
31 | "symfony/polyfill-ctype": "^1.28", 32 | "symfony/polyfill-iconv": "^1.28" 33 | }, 34 | "autoload": { 35 | "psr-4": { 36 | "MHz\\MysqlVector\\": "src/" 37 | } 38 | }, 39 | "autoload-dev": { 40 | "psr-4": { 41 | "MHz\\MysqlVector\\Tests\\": "tests/" 42 | } 43 | }, 44 | "require-dev": { 45 | "phpunit/phpunit": "^9.5" 46 | }, 47 | "scripts": { 48 | "test": "phpunit tests" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", 5 | "This file is @generated automatically" 6 | ], 7 | "content-hash": "95b27675f256a68e948818c8f568ee77", 8 | "packages": [ 9 | { 10 | "name": "ankane/onnxruntime", 11 | "version": "v0.2.4", 12 | "source": { 13 | "type": "git", 14 | "url": "https://github.com/ankane/onnxruntime-php.git", 15 | "reference": "6ba9b2fd980db69f5299ee466f1170bcdb526d04" 16 | }, 17 | "dist": { 18 | "type": "zip", 19 | "url": "https://api.github.com/repos/ankane/onnxruntime-php/zipball/6ba9b2fd980db69f5299ee466f1170bcdb526d04", 20 | "reference": "6ba9b2fd980db69f5299ee466f1170bcdb526d04", 21 | "shasum": "" 22 | }, 23 | "require": { 24 | "ext-ffi": ">= 8.1", 25 | "php": ">= 8.1" 26 | }, 27 | "require-dev": { 28 | "phpunit/phpunit": "^10" 29 | }, 30 | "type": "library", 31 | "autoload": { 32 | "psr-4": { 33 | "OnnxRuntime\\": "src/" 34 | } 35 | }, 36 | "notification-url": "https://packagist.org/downloads/", 37 | "license": [ 38 | "MIT" 39 | ], 40 | "authors": [ 41 | { 42 | "name": "Andrew Kane", 43 | "email": "andrew@ankane.org" 44 | } 45 | ], 46 | "description": "Run ONNX models in PHP", 47 | "support": { 48 | "issues": "https://github.com/ankane/onnxruntime-php/issues", 49 | "source": "https://github.com/ankane/onnxruntime-php" 50 | }, 
51 | "time": "2024-11-01T18:42:30+00:00" 52 | }, 53 | { 54 | "name": "bdelespierre/php-kmeans", 55 | "version": "v2.2.0", 56 | "source": { 57 | "type": "git", 58 | "url": "https://github.com/bdelespierre/php-kmeans.git", 59 | "reference": "625f95f0c8f6b20c58fcc7566ef8a59b0b7552f3" 60 | }, 61 | "dist": { 62 | "type": "zip", 63 | "url": "https://api.github.com/repos/bdelespierre/php-kmeans/zipball/625f95f0c8f6b20c58fcc7566ef8a59b0b7552f3", 64 | "reference": "625f95f0c8f6b20c58fcc7566ef8a59b0b7552f3", 65 | "shasum": "" 66 | }, 67 | "require": { 68 | "php": "^7.3|^8.0" 69 | }, 70 | "require-dev": { 71 | "phpunit/phpunit": "^9.3" 72 | }, 73 | "type": "library", 74 | "autoload": { 75 | "psr-0": { 76 | "KMeans": "src/" 77 | } 78 | }, 79 | "notification-url": "https://packagist.org/downloads/", 80 | "license": [ 81 | "MIT" 82 | ], 83 | "authors": [ 84 | { 85 | "name": "Benjamin Delespierre", 86 | "email": "benjamin.delespierre@gmail.com", 87 | "role": "Developer" 88 | } 89 | ], 90 | "description": "K-Means algorithm for PHP", 91 | "keywords": [ 92 | "clustering", 93 | "kmeans", 94 | "php" 95 | ], 96 | "support": { 97 | "issues": "https://github.com/bdelespierre/php-kmeans/issues", 98 | "source": "https://github.com/bdelespierre/php-kmeans/tree/v2.2.0" 99 | }, 100 | "time": "2021-08-31T11:54:28+00:00" 101 | } 102 | ], 103 | "packages-dev": [ 104 | { 105 | "name": "doctrine/instantiator", 106 | "version": "2.0.0", 107 | "source": { 108 | "type": "git", 109 | "url": "https://github.com/doctrine/instantiator.git", 110 | "reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0" 111 | }, 112 | "dist": { 113 | "type": "zip", 114 | "url": "https://api.github.com/repos/doctrine/instantiator/zipball/c6222283fa3f4ac679f8b9ced9a4e23f163e80d0", 115 | "reference": "c6222283fa3f4ac679f8b9ced9a4e23f163e80d0", 116 | "shasum": "" 117 | }, 118 | "require": { 119 | "php": "^8.1" 120 | }, 121 | "require-dev": { 122 | "doctrine/coding-standard": "^11", 123 | "ext-pdo": "*", 124 | "ext-phar": "*", 
125 | "phpbench/phpbench": "^1.2", 126 | "phpstan/phpstan": "^1.9.4", 127 | "phpstan/phpstan-phpunit": "^1.3", 128 | "phpunit/phpunit": "^9.5.27", 129 | "vimeo/psalm": "^5.4" 130 | }, 131 | "type": "library", 132 | "autoload": { 133 | "psr-4": { 134 | "Doctrine\\Instantiator\\": "src/Doctrine/Instantiator/" 135 | } 136 | }, 137 | "notification-url": "https://packagist.org/downloads/", 138 | "license": [ 139 | "MIT" 140 | ], 141 | "authors": [ 142 | { 143 | "name": "Marco Pivetta", 144 | "email": "ocramius@gmail.com", 145 | "homepage": "https://ocramius.github.io/" 146 | } 147 | ], 148 | "description": "A small, lightweight utility to instantiate objects in PHP without invoking their constructors", 149 | "homepage": "https://www.doctrine-project.org/projects/instantiator.html", 150 | "keywords": [ 151 | "constructor", 152 | "instantiate" 153 | ], 154 | "support": { 155 | "issues": "https://github.com/doctrine/instantiator/issues", 156 | "source": "https://github.com/doctrine/instantiator/tree/2.0.0" 157 | }, 158 | "funding": [ 159 | { 160 | "url": "https://www.doctrine-project.org/sponsorship.html", 161 | "type": "custom" 162 | }, 163 | { 164 | "url": "https://www.patreon.com/phpdoctrine", 165 | "type": "patreon" 166 | }, 167 | { 168 | "url": "https://tidelift.com/funding/github/packagist/doctrine%2Finstantiator", 169 | "type": "tidelift" 170 | } 171 | ], 172 | "time": "2022-12-30T00:23:10+00:00" 173 | }, 174 | { 175 | "name": "myclabs/deep-copy", 176 | "version": "1.12.1", 177 | "source": { 178 | "type": "git", 179 | "url": "https://github.com/myclabs/DeepCopy.git", 180 | "reference": "123267b2c49fbf30d78a7b2d333f6be754b94845" 181 | }, 182 | "dist": { 183 | "type": "zip", 184 | "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/123267b2c49fbf30d78a7b2d333f6be754b94845", 185 | "reference": "123267b2c49fbf30d78a7b2d333f6be754b94845", 186 | "shasum": "" 187 | }, 188 | "require": { 189 | "php": "^7.1 || ^8.0" 190 | }, 191 | "conflict": { 192 | 
"doctrine/collections": "<1.6.8", 193 | "doctrine/common": "<2.13.3 || >=3 <3.2.2" 194 | }, 195 | "require-dev": { 196 | "doctrine/collections": "^1.6.8", 197 | "doctrine/common": "^2.13.3 || ^3.2.2", 198 | "phpspec/prophecy": "^1.10", 199 | "phpunit/phpunit": "^7.5.20 || ^8.5.23 || ^9.5.13" 200 | }, 201 | "type": "library", 202 | "autoload": { 203 | "files": [ 204 | "src/DeepCopy/deep_copy.php" 205 | ], 206 | "psr-4": { 207 | "DeepCopy\\": "src/DeepCopy/" 208 | } 209 | }, 210 | "notification-url": "https://packagist.org/downloads/", 211 | "license": [ 212 | "MIT" 213 | ], 214 | "description": "Create deep copies (clones) of your objects", 215 | "keywords": [ 216 | "clone", 217 | "copy", 218 | "duplicate", 219 | "object", 220 | "object graph" 221 | ], 222 | "support": { 223 | "issues": "https://github.com/myclabs/DeepCopy/issues", 224 | "source": "https://github.com/myclabs/DeepCopy/tree/1.12.1" 225 | }, 226 | "funding": [ 227 | { 228 | "url": "https://tidelift.com/funding/github/packagist/myclabs/deep-copy", 229 | "type": "tidelift" 230 | } 231 | ], 232 | "time": "2024-11-08T17:47:46+00:00" 233 | }, 234 | { 235 | "name": "nikic/php-parser", 236 | "version": "v5.4.0", 237 | "source": { 238 | "type": "git", 239 | "url": "https://github.com/nikic/PHP-Parser.git", 240 | "reference": "447a020a1f875a434d62f2a401f53b82a396e494" 241 | }, 242 | "dist": { 243 | "type": "zip", 244 | "url": "https://api.github.com/repos/nikic/PHP-Parser/zipball/447a020a1f875a434d62f2a401f53b82a396e494", 245 | "reference": "447a020a1f875a434d62f2a401f53b82a396e494", 246 | "shasum": "" 247 | }, 248 | "require": { 249 | "ext-ctype": "*", 250 | "ext-json": "*", 251 | "ext-tokenizer": "*", 252 | "php": ">=7.4" 253 | }, 254 | "require-dev": { 255 | "ircmaxell/php-yacc": "^0.0.7", 256 | "phpunit/phpunit": "^9.0" 257 | }, 258 | "bin": [ 259 | "bin/php-parse" 260 | ], 261 | "type": "library", 262 | "extra": { 263 | "branch-alias": { 264 | "dev-master": "5.0-dev" 265 | } 266 | }, 267 | "autoload": { 
268 | "psr-4": { 269 | "PhpParser\\": "lib/PhpParser" 270 | } 271 | }, 272 | "notification-url": "https://packagist.org/downloads/", 273 | "license": [ 274 | "BSD-3-Clause" 275 | ], 276 | "authors": [ 277 | { 278 | "name": "Nikita Popov" 279 | } 280 | ], 281 | "description": "A PHP parser written in PHP", 282 | "keywords": [ 283 | "parser", 284 | "php" 285 | ], 286 | "support": { 287 | "issues": "https://github.com/nikic/PHP-Parser/issues", 288 | "source": "https://github.com/nikic/PHP-Parser/tree/v5.4.0" 289 | }, 290 | "time": "2024-12-30T11:07:19+00:00" 291 | }, 292 | { 293 | "name": "phar-io/manifest", 294 | "version": "2.0.4", 295 | "source": { 296 | "type": "git", 297 | "url": "https://github.com/phar-io/manifest.git", 298 | "reference": "54750ef60c58e43759730615a392c31c80e23176" 299 | }, 300 | "dist": { 301 | "type": "zip", 302 | "url": "https://api.github.com/repos/phar-io/manifest/zipball/54750ef60c58e43759730615a392c31c80e23176", 303 | "reference": "54750ef60c58e43759730615a392c31c80e23176", 304 | "shasum": "" 305 | }, 306 | "require": { 307 | "ext-dom": "*", 308 | "ext-libxml": "*", 309 | "ext-phar": "*", 310 | "ext-xmlwriter": "*", 311 | "phar-io/version": "^3.0.1", 312 | "php": "^7.2 || ^8.0" 313 | }, 314 | "type": "library", 315 | "extra": { 316 | "branch-alias": { 317 | "dev-master": "2.0.x-dev" 318 | } 319 | }, 320 | "autoload": { 321 | "classmap": [ 322 | "src/" 323 | ] 324 | }, 325 | "notification-url": "https://packagist.org/downloads/", 326 | "license": [ 327 | "BSD-3-Clause" 328 | ], 329 | "authors": [ 330 | { 331 | "name": "Arne Blankerts", 332 | "email": "arne@blankerts.de", 333 | "role": "Developer" 334 | }, 335 | { 336 | "name": "Sebastian Heuer", 337 | "email": "sebastian@phpeople.de", 338 | "role": "Developer" 339 | }, 340 | { 341 | "name": "Sebastian Bergmann", 342 | "email": "sebastian@phpunit.de", 343 | "role": "Developer" 344 | } 345 | ], 346 | "description": "Component for reading phar.io manifest information from a PHP Archive 
(PHAR)", 347 | "support": { 348 | "issues": "https://github.com/phar-io/manifest/issues", 349 | "source": "https://github.com/phar-io/manifest/tree/2.0.4" 350 | }, 351 | "funding": [ 352 | { 353 | "url": "https://github.com/theseer", 354 | "type": "github" 355 | } 356 | ], 357 | "time": "2024-03-03T12:33:53+00:00" 358 | }, 359 | { 360 | "name": "phar-io/version", 361 | "version": "3.2.1", 362 | "source": { 363 | "type": "git", 364 | "url": "https://github.com/phar-io/version.git", 365 | "reference": "4f7fd7836c6f332bb2933569e566a0d6c4cbed74" 366 | }, 367 | "dist": { 368 | "type": "zip", 369 | "url": "https://api.github.com/repos/phar-io/version/zipball/4f7fd7836c6f332bb2933569e566a0d6c4cbed74", 370 | "reference": "4f7fd7836c6f332bb2933569e566a0d6c4cbed74", 371 | "shasum": "" 372 | }, 373 | "require": { 374 | "php": "^7.2 || ^8.0" 375 | }, 376 | "type": "library", 377 | "autoload": { 378 | "classmap": [ 379 | "src/" 380 | ] 381 | }, 382 | "notification-url": "https://packagist.org/downloads/", 383 | "license": [ 384 | "BSD-3-Clause" 385 | ], 386 | "authors": [ 387 | { 388 | "name": "Arne Blankerts", 389 | "email": "arne@blankerts.de", 390 | "role": "Developer" 391 | }, 392 | { 393 | "name": "Sebastian Heuer", 394 | "email": "sebastian@phpeople.de", 395 | "role": "Developer" 396 | }, 397 | { 398 | "name": "Sebastian Bergmann", 399 | "email": "sebastian@phpunit.de", 400 | "role": "Developer" 401 | } 402 | ], 403 | "description": "Library for handling version information and constraints", 404 | "support": { 405 | "issues": "https://github.com/phar-io/version/issues", 406 | "source": "https://github.com/phar-io/version/tree/3.2.1" 407 | }, 408 | "time": "2022-02-21T01:04:05+00:00" 409 | }, 410 | { 411 | "name": "phpunit/php-code-coverage", 412 | "version": "9.2.32", 413 | "source": { 414 | "type": "git", 415 | "url": "https://github.com/sebastianbergmann/php-code-coverage.git", 416 | "reference": "85402a822d1ecf1db1096959413d35e1c37cf1a5" 417 | }, 418 | "dist": { 419 | 
"type": "zip", 420 | "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/85402a822d1ecf1db1096959413d35e1c37cf1a5", 421 | "reference": "85402a822d1ecf1db1096959413d35e1c37cf1a5", 422 | "shasum": "" 423 | }, 424 | "require": { 425 | "ext-dom": "*", 426 | "ext-libxml": "*", 427 | "ext-xmlwriter": "*", 428 | "nikic/php-parser": "^4.19.1 || ^5.1.0", 429 | "php": ">=7.3", 430 | "phpunit/php-file-iterator": "^3.0.6", 431 | "phpunit/php-text-template": "^2.0.4", 432 | "sebastian/code-unit-reverse-lookup": "^2.0.3", 433 | "sebastian/complexity": "^2.0.3", 434 | "sebastian/environment": "^5.1.5", 435 | "sebastian/lines-of-code": "^1.0.4", 436 | "sebastian/version": "^3.0.2", 437 | "theseer/tokenizer": "^1.2.3" 438 | }, 439 | "require-dev": { 440 | "phpunit/phpunit": "^9.6" 441 | }, 442 | "suggest": { 443 | "ext-pcov": "PHP extension that provides line coverage", 444 | "ext-xdebug": "PHP extension that provides line coverage as well as branch and path coverage" 445 | }, 446 | "type": "library", 447 | "extra": { 448 | "branch-alias": { 449 | "dev-main": "9.2.x-dev" 450 | } 451 | }, 452 | "autoload": { 453 | "classmap": [ 454 | "src/" 455 | ] 456 | }, 457 | "notification-url": "https://packagist.org/downloads/", 458 | "license": [ 459 | "BSD-3-Clause" 460 | ], 461 | "authors": [ 462 | { 463 | "name": "Sebastian Bergmann", 464 | "email": "sebastian@phpunit.de", 465 | "role": "lead" 466 | } 467 | ], 468 | "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", 469 | "homepage": "https://github.com/sebastianbergmann/php-code-coverage", 470 | "keywords": [ 471 | "coverage", 472 | "testing", 473 | "xunit" 474 | ], 475 | "support": { 476 | "issues": "https://github.com/sebastianbergmann/php-code-coverage/issues", 477 | "security": "https://github.com/sebastianbergmann/php-code-coverage/security/policy", 478 | "source": 
"https://github.com/sebastianbergmann/php-code-coverage/tree/9.2.32" 479 | }, 480 | "funding": [ 481 | { 482 | "url": "https://github.com/sebastianbergmann", 483 | "type": "github" 484 | } 485 | ], 486 | "time": "2024-08-22T04:23:01+00:00" 487 | }, 488 | { 489 | "name": "phpunit/php-file-iterator", 490 | "version": "3.0.6", 491 | "source": { 492 | "type": "git", 493 | "url": "https://github.com/sebastianbergmann/php-file-iterator.git", 494 | "reference": "cf1c2e7c203ac650e352f4cc675a7021e7d1b3cf" 495 | }, 496 | "dist": { 497 | "type": "zip", 498 | "url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/cf1c2e7c203ac650e352f4cc675a7021e7d1b3cf", 499 | "reference": "cf1c2e7c203ac650e352f4cc675a7021e7d1b3cf", 500 | "shasum": "" 501 | }, 502 | "require": { 503 | "php": ">=7.3" 504 | }, 505 | "require-dev": { 506 | "phpunit/phpunit": "^9.3" 507 | }, 508 | "type": "library", 509 | "extra": { 510 | "branch-alias": { 511 | "dev-master": "3.0-dev" 512 | } 513 | }, 514 | "autoload": { 515 | "classmap": [ 516 | "src/" 517 | ] 518 | }, 519 | "notification-url": "https://packagist.org/downloads/", 520 | "license": [ 521 | "BSD-3-Clause" 522 | ], 523 | "authors": [ 524 | { 525 | "name": "Sebastian Bergmann", 526 | "email": "sebastian@phpunit.de", 527 | "role": "lead" 528 | } 529 | ], 530 | "description": "FilterIterator implementation that filters files based on a list of suffixes.", 531 | "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", 532 | "keywords": [ 533 | "filesystem", 534 | "iterator" 535 | ], 536 | "support": { 537 | "issues": "https://github.com/sebastianbergmann/php-file-iterator/issues", 538 | "source": "https://github.com/sebastianbergmann/php-file-iterator/tree/3.0.6" 539 | }, 540 | "funding": [ 541 | { 542 | "url": "https://github.com/sebastianbergmann", 543 | "type": "github" 544 | } 545 | ], 546 | "time": "2021-12-02T12:48:52+00:00" 547 | }, 548 | { 549 | "name": "phpunit/php-invoker", 550 | "version": "3.1.1", 551 
| "source": { 552 | "type": "git", 553 | "url": "https://github.com/sebastianbergmann/php-invoker.git", 554 | "reference": "5a10147d0aaf65b58940a0b72f71c9ac0423cc67" 555 | }, 556 | "dist": { 557 | "type": "zip", 558 | "url": "https://api.github.com/repos/sebastianbergmann/php-invoker/zipball/5a10147d0aaf65b58940a0b72f71c9ac0423cc67", 559 | "reference": "5a10147d0aaf65b58940a0b72f71c9ac0423cc67", 560 | "shasum": "" 561 | }, 562 | "require": { 563 | "php": ">=7.3" 564 | }, 565 | "require-dev": { 566 | "ext-pcntl": "*", 567 | "phpunit/phpunit": "^9.3" 568 | }, 569 | "suggest": { 570 | "ext-pcntl": "*" 571 | }, 572 | "type": "library", 573 | "extra": { 574 | "branch-alias": { 575 | "dev-master": "3.1-dev" 576 | } 577 | }, 578 | "autoload": { 579 | "classmap": [ 580 | "src/" 581 | ] 582 | }, 583 | "notification-url": "https://packagist.org/downloads/", 584 | "license": [ 585 | "BSD-3-Clause" 586 | ], 587 | "authors": [ 588 | { 589 | "name": "Sebastian Bergmann", 590 | "email": "sebastian@phpunit.de", 591 | "role": "lead" 592 | } 593 | ], 594 | "description": "Invoke callables with a timeout", 595 | "homepage": "https://github.com/sebastianbergmann/php-invoker/", 596 | "keywords": [ 597 | "process" 598 | ], 599 | "support": { 600 | "issues": "https://github.com/sebastianbergmann/php-invoker/issues", 601 | "source": "https://github.com/sebastianbergmann/php-invoker/tree/3.1.1" 602 | }, 603 | "funding": [ 604 | { 605 | "url": "https://github.com/sebastianbergmann", 606 | "type": "github" 607 | } 608 | ], 609 | "time": "2020-09-28T05:58:55+00:00" 610 | }, 611 | { 612 | "name": "phpunit/php-text-template", 613 | "version": "2.0.4", 614 | "source": { 615 | "type": "git", 616 | "url": "https://github.com/sebastianbergmann/php-text-template.git", 617 | "reference": "5da5f67fc95621df9ff4c4e5a84d6a8a2acf7c28" 618 | }, 619 | "dist": { 620 | "type": "zip", 621 | "url": 
"https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/5da5f67fc95621df9ff4c4e5a84d6a8a2acf7c28", 622 | "reference": "5da5f67fc95621df9ff4c4e5a84d6a8a2acf7c28", 623 | "shasum": "" 624 | }, 625 | "require": { 626 | "php": ">=7.3" 627 | }, 628 | "require-dev": { 629 | "phpunit/phpunit": "^9.3" 630 | }, 631 | "type": "library", 632 | "extra": { 633 | "branch-alias": { 634 | "dev-master": "2.0-dev" 635 | } 636 | }, 637 | "autoload": { 638 | "classmap": [ 639 | "src/" 640 | ] 641 | }, 642 | "notification-url": "https://packagist.org/downloads/", 643 | "license": [ 644 | "BSD-3-Clause" 645 | ], 646 | "authors": [ 647 | { 648 | "name": "Sebastian Bergmann", 649 | "email": "sebastian@phpunit.de", 650 | "role": "lead" 651 | } 652 | ], 653 | "description": "Simple template engine.", 654 | "homepage": "https://github.com/sebastianbergmann/php-text-template/", 655 | "keywords": [ 656 | "template" 657 | ], 658 | "support": { 659 | "issues": "https://github.com/sebastianbergmann/php-text-template/issues", 660 | "source": "https://github.com/sebastianbergmann/php-text-template/tree/2.0.4" 661 | }, 662 | "funding": [ 663 | { 664 | "url": "https://github.com/sebastianbergmann", 665 | "type": "github" 666 | } 667 | ], 668 | "time": "2020-10-26T05:33:50+00:00" 669 | }, 670 | { 671 | "name": "phpunit/php-timer", 672 | "version": "5.0.3", 673 | "source": { 674 | "type": "git", 675 | "url": "https://github.com/sebastianbergmann/php-timer.git", 676 | "reference": "5a63ce20ed1b5bf577850e2c4e87f4aa902afbd2" 677 | }, 678 | "dist": { 679 | "type": "zip", 680 | "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/5a63ce20ed1b5bf577850e2c4e87f4aa902afbd2", 681 | "reference": "5a63ce20ed1b5bf577850e2c4e87f4aa902afbd2", 682 | "shasum": "" 683 | }, 684 | "require": { 685 | "php": ">=7.3" 686 | }, 687 | "require-dev": { 688 | "phpunit/phpunit": "^9.3" 689 | }, 690 | "type": "library", 691 | "extra": { 692 | "branch-alias": { 693 | "dev-master": "5.0-dev" 694 
| } 695 | }, 696 | "autoload": { 697 | "classmap": [ 698 | "src/" 699 | ] 700 | }, 701 | "notification-url": "https://packagist.org/downloads/", 702 | "license": [ 703 | "BSD-3-Clause" 704 | ], 705 | "authors": [ 706 | { 707 | "name": "Sebastian Bergmann", 708 | "email": "sebastian@phpunit.de", 709 | "role": "lead" 710 | } 711 | ], 712 | "description": "Utility class for timing", 713 | "homepage": "https://github.com/sebastianbergmann/php-timer/", 714 | "keywords": [ 715 | "timer" 716 | ], 717 | "support": { 718 | "issues": "https://github.com/sebastianbergmann/php-timer/issues", 719 | "source": "https://github.com/sebastianbergmann/php-timer/tree/5.0.3" 720 | }, 721 | "funding": [ 722 | { 723 | "url": "https://github.com/sebastianbergmann", 724 | "type": "github" 725 | } 726 | ], 727 | "time": "2020-10-26T13:16:10+00:00" 728 | }, 729 | { 730 | "name": "phpunit/phpunit", 731 | "version": "9.6.22", 732 | "source": { 733 | "type": "git", 734 | "url": "https://github.com/sebastianbergmann/phpunit.git", 735 | "reference": "f80235cb4d3caa59ae09be3adf1ded27521d1a9c" 736 | }, 737 | "dist": { 738 | "type": "zip", 739 | "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/f80235cb4d3caa59ae09be3adf1ded27521d1a9c", 740 | "reference": "f80235cb4d3caa59ae09be3adf1ded27521d1a9c", 741 | "shasum": "" 742 | }, 743 | "require": { 744 | "doctrine/instantiator": "^1.5.0 || ^2", 745 | "ext-dom": "*", 746 | "ext-json": "*", 747 | "ext-libxml": "*", 748 | "ext-mbstring": "*", 749 | "ext-xml": "*", 750 | "ext-xmlwriter": "*", 751 | "myclabs/deep-copy": "^1.12.1", 752 | "phar-io/manifest": "^2.0.4", 753 | "phar-io/version": "^3.2.1", 754 | "php": ">=7.3", 755 | "phpunit/php-code-coverage": "^9.2.32", 756 | "phpunit/php-file-iterator": "^3.0.6", 757 | "phpunit/php-invoker": "^3.1.1", 758 | "phpunit/php-text-template": "^2.0.4", 759 | "phpunit/php-timer": "^5.0.3", 760 | "sebastian/cli-parser": "^1.0.2", 761 | "sebastian/code-unit": "^1.0.8", 762 | "sebastian/comparator": 
"^4.0.8", 763 | "sebastian/diff": "^4.0.6", 764 | "sebastian/environment": "^5.1.5", 765 | "sebastian/exporter": "^4.0.6", 766 | "sebastian/global-state": "^5.0.7", 767 | "sebastian/object-enumerator": "^4.0.4", 768 | "sebastian/resource-operations": "^3.0.4", 769 | "sebastian/type": "^3.2.1", 770 | "sebastian/version": "^3.0.2" 771 | }, 772 | "suggest": { 773 | "ext-soap": "To be able to generate mocks based on WSDL files", 774 | "ext-xdebug": "PHP extension that provides line coverage as well as branch and path coverage" 775 | }, 776 | "bin": [ 777 | "phpunit" 778 | ], 779 | "type": "library", 780 | "extra": { 781 | "branch-alias": { 782 | "dev-master": "9.6-dev" 783 | } 784 | }, 785 | "autoload": { 786 | "files": [ 787 | "src/Framework/Assert/Functions.php" 788 | ], 789 | "classmap": [ 790 | "src/" 791 | ] 792 | }, 793 | "notification-url": "https://packagist.org/downloads/", 794 | "license": [ 795 | "BSD-3-Clause" 796 | ], 797 | "authors": [ 798 | { 799 | "name": "Sebastian Bergmann", 800 | "email": "sebastian@phpunit.de", 801 | "role": "lead" 802 | } 803 | ], 804 | "description": "The PHP Unit Testing framework.", 805 | "homepage": "https://phpunit.de/", 806 | "keywords": [ 807 | "phpunit", 808 | "testing", 809 | "xunit" 810 | ], 811 | "support": { 812 | "issues": "https://github.com/sebastianbergmann/phpunit/issues", 813 | "security": "https://github.com/sebastianbergmann/phpunit/security/policy", 814 | "source": "https://github.com/sebastianbergmann/phpunit/tree/9.6.22" 815 | }, 816 | "funding": [ 817 | { 818 | "url": "https://phpunit.de/sponsors.html", 819 | "type": "custom" 820 | }, 821 | { 822 | "url": "https://github.com/sebastianbergmann", 823 | "type": "github" 824 | }, 825 | { 826 | "url": "https://tidelift.com/funding/github/packagist/phpunit/phpunit", 827 | "type": "tidelift" 828 | } 829 | ], 830 | "time": "2024-12-05T13:48:26+00:00" 831 | }, 832 | { 833 | "name": "sebastian/cli-parser", 834 | "version": "1.0.2", 835 | "source": { 836 | "type": 
"git", 837 | "url": "https://github.com/sebastianbergmann/cli-parser.git", 838 | "reference": "2b56bea83a09de3ac06bb18b92f068e60cc6f50b" 839 | }, 840 | "dist": { 841 | "type": "zip", 842 | "url": "https://api.github.com/repos/sebastianbergmann/cli-parser/zipball/2b56bea83a09de3ac06bb18b92f068e60cc6f50b", 843 | "reference": "2b56bea83a09de3ac06bb18b92f068e60cc6f50b", 844 | "shasum": "" 845 | }, 846 | "require": { 847 | "php": ">=7.3" 848 | }, 849 | "require-dev": { 850 | "phpunit/phpunit": "^9.3" 851 | }, 852 | "type": "library", 853 | "extra": { 854 | "branch-alias": { 855 | "dev-master": "1.0-dev" 856 | } 857 | }, 858 | "autoload": { 859 | "classmap": [ 860 | "src/" 861 | ] 862 | }, 863 | "notification-url": "https://packagist.org/downloads/", 864 | "license": [ 865 | "BSD-3-Clause" 866 | ], 867 | "authors": [ 868 | { 869 | "name": "Sebastian Bergmann", 870 | "email": "sebastian@phpunit.de", 871 | "role": "lead" 872 | } 873 | ], 874 | "description": "Library for parsing CLI options", 875 | "homepage": "https://github.com/sebastianbergmann/cli-parser", 876 | "support": { 877 | "issues": "https://github.com/sebastianbergmann/cli-parser/issues", 878 | "source": "https://github.com/sebastianbergmann/cli-parser/tree/1.0.2" 879 | }, 880 | "funding": [ 881 | { 882 | "url": "https://github.com/sebastianbergmann", 883 | "type": "github" 884 | } 885 | ], 886 | "time": "2024-03-02T06:27:43+00:00" 887 | }, 888 | { 889 | "name": "sebastian/code-unit", 890 | "version": "1.0.8", 891 | "source": { 892 | "type": "git", 893 | "url": "https://github.com/sebastianbergmann/code-unit.git", 894 | "reference": "1fc9f64c0927627ef78ba436c9b17d967e68e120" 895 | }, 896 | "dist": { 897 | "type": "zip", 898 | "url": "https://api.github.com/repos/sebastianbergmann/code-unit/zipball/1fc9f64c0927627ef78ba436c9b17d967e68e120", 899 | "reference": "1fc9f64c0927627ef78ba436c9b17d967e68e120", 900 | "shasum": "" 901 | }, 902 | "require": { 903 | "php": ">=7.3" 904 | }, 905 | "require-dev": { 906 | 
"phpunit/phpunit": "^9.3" 907 | }, 908 | "type": "library", 909 | "extra": { 910 | "branch-alias": { 911 | "dev-master": "1.0-dev" 912 | } 913 | }, 914 | "autoload": { 915 | "classmap": [ 916 | "src/" 917 | ] 918 | }, 919 | "notification-url": "https://packagist.org/downloads/", 920 | "license": [ 921 | "BSD-3-Clause" 922 | ], 923 | "authors": [ 924 | { 925 | "name": "Sebastian Bergmann", 926 | "email": "sebastian@phpunit.de", 927 | "role": "lead" 928 | } 929 | ], 930 | "description": "Collection of value objects that represent the PHP code units", 931 | "homepage": "https://github.com/sebastianbergmann/code-unit", 932 | "support": { 933 | "issues": "https://github.com/sebastianbergmann/code-unit/issues", 934 | "source": "https://github.com/sebastianbergmann/code-unit/tree/1.0.8" 935 | }, 936 | "funding": [ 937 | { 938 | "url": "https://github.com/sebastianbergmann", 939 | "type": "github" 940 | } 941 | ], 942 | "time": "2020-10-26T13:08:54+00:00" 943 | }, 944 | { 945 | "name": "sebastian/code-unit-reverse-lookup", 946 | "version": "2.0.3", 947 | "source": { 948 | "type": "git", 949 | "url": "https://github.com/sebastianbergmann/code-unit-reverse-lookup.git", 950 | "reference": "ac91f01ccec49fb77bdc6fd1e548bc70f7faa3e5" 951 | }, 952 | "dist": { 953 | "type": "zip", 954 | "url": "https://api.github.com/repos/sebastianbergmann/code-unit-reverse-lookup/zipball/ac91f01ccec49fb77bdc6fd1e548bc70f7faa3e5", 955 | "reference": "ac91f01ccec49fb77bdc6fd1e548bc70f7faa3e5", 956 | "shasum": "" 957 | }, 958 | "require": { 959 | "php": ">=7.3" 960 | }, 961 | "require-dev": { 962 | "phpunit/phpunit": "^9.3" 963 | }, 964 | "type": "library", 965 | "extra": { 966 | "branch-alias": { 967 | "dev-master": "2.0-dev" 968 | } 969 | }, 970 | "autoload": { 971 | "classmap": [ 972 | "src/" 973 | ] 974 | }, 975 | "notification-url": "https://packagist.org/downloads/", 976 | "license": [ 977 | "BSD-3-Clause" 978 | ], 979 | "authors": [ 980 | { 981 | "name": "Sebastian Bergmann", 982 | "email": 
"sebastian@phpunit.de" 983 | } 984 | ], 985 | "description": "Looks up which function or method a line of code belongs to", 986 | "homepage": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/", 987 | "support": { 988 | "issues": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/issues", 989 | "source": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/tree/2.0.3" 990 | }, 991 | "funding": [ 992 | { 993 | "url": "https://github.com/sebastianbergmann", 994 | "type": "github" 995 | } 996 | ], 997 | "time": "2020-09-28T05:30:19+00:00" 998 | }, 999 | { 1000 | "name": "sebastian/comparator", 1001 | "version": "4.0.8", 1002 | "source": { 1003 | "type": "git", 1004 | "url": "https://github.com/sebastianbergmann/comparator.git", 1005 | "reference": "fa0f136dd2334583309d32b62544682ee972b51a" 1006 | }, 1007 | "dist": { 1008 | "type": "zip", 1009 | "url": "https://api.github.com/repos/sebastianbergmann/comparator/zipball/fa0f136dd2334583309d32b62544682ee972b51a", 1010 | "reference": "fa0f136dd2334583309d32b62544682ee972b51a", 1011 | "shasum": "" 1012 | }, 1013 | "require": { 1014 | "php": ">=7.3", 1015 | "sebastian/diff": "^4.0", 1016 | "sebastian/exporter": "^4.0" 1017 | }, 1018 | "require-dev": { 1019 | "phpunit/phpunit": "^9.3" 1020 | }, 1021 | "type": "library", 1022 | "extra": { 1023 | "branch-alias": { 1024 | "dev-master": "4.0-dev" 1025 | } 1026 | }, 1027 | "autoload": { 1028 | "classmap": [ 1029 | "src/" 1030 | ] 1031 | }, 1032 | "notification-url": "https://packagist.org/downloads/", 1033 | "license": [ 1034 | "BSD-3-Clause" 1035 | ], 1036 | "authors": [ 1037 | { 1038 | "name": "Sebastian Bergmann", 1039 | "email": "sebastian@phpunit.de" 1040 | }, 1041 | { 1042 | "name": "Jeff Welch", 1043 | "email": "whatthejeff@gmail.com" 1044 | }, 1045 | { 1046 | "name": "Volker Dusch", 1047 | "email": "github@wallbash.com" 1048 | }, 1049 | { 1050 | "name": "Bernhard Schussek", 1051 | "email": "bschussek@2bepublished.at" 1052 | } 1053 | ], 
1054 | "description": "Provides the functionality to compare PHP values for equality", 1055 | "homepage": "https://github.com/sebastianbergmann/comparator", 1056 | "keywords": [ 1057 | "comparator", 1058 | "compare", 1059 | "equality" 1060 | ], 1061 | "support": { 1062 | "issues": "https://github.com/sebastianbergmann/comparator/issues", 1063 | "source": "https://github.com/sebastianbergmann/comparator/tree/4.0.8" 1064 | }, 1065 | "funding": [ 1066 | { 1067 | "url": "https://github.com/sebastianbergmann", 1068 | "type": "github" 1069 | } 1070 | ], 1071 | "time": "2022-09-14T12:41:17+00:00" 1072 | }, 1073 | { 1074 | "name": "sebastian/complexity", 1075 | "version": "2.0.3", 1076 | "source": { 1077 | "type": "git", 1078 | "url": "https://github.com/sebastianbergmann/complexity.git", 1079 | "reference": "25f207c40d62b8b7aa32f5ab026c53561964053a" 1080 | }, 1081 | "dist": { 1082 | "type": "zip", 1083 | "url": "https://api.github.com/repos/sebastianbergmann/complexity/zipball/25f207c40d62b8b7aa32f5ab026c53561964053a", 1084 | "reference": "25f207c40d62b8b7aa32f5ab026c53561964053a", 1085 | "shasum": "" 1086 | }, 1087 | "require": { 1088 | "nikic/php-parser": "^4.18 || ^5.0", 1089 | "php": ">=7.3" 1090 | }, 1091 | "require-dev": { 1092 | "phpunit/phpunit": "^9.3" 1093 | }, 1094 | "type": "library", 1095 | "extra": { 1096 | "branch-alias": { 1097 | "dev-master": "2.0-dev" 1098 | } 1099 | }, 1100 | "autoload": { 1101 | "classmap": [ 1102 | "src/" 1103 | ] 1104 | }, 1105 | "notification-url": "https://packagist.org/downloads/", 1106 | "license": [ 1107 | "BSD-3-Clause" 1108 | ], 1109 | "authors": [ 1110 | { 1111 | "name": "Sebastian Bergmann", 1112 | "email": "sebastian@phpunit.de", 1113 | "role": "lead" 1114 | } 1115 | ], 1116 | "description": "Library for calculating the complexity of PHP code units", 1117 | "homepage": "https://github.com/sebastianbergmann/complexity", 1118 | "support": { 1119 | "issues": "https://github.com/sebastianbergmann/complexity/issues", 1120 | 
"source": "https://github.com/sebastianbergmann/complexity/tree/2.0.3" 1121 | }, 1122 | "funding": [ 1123 | { 1124 | "url": "https://github.com/sebastianbergmann", 1125 | "type": "github" 1126 | } 1127 | ], 1128 | "time": "2023-12-22T06:19:30+00:00" 1129 | }, 1130 | { 1131 | "name": "sebastian/diff", 1132 | "version": "4.0.6", 1133 | "source": { 1134 | "type": "git", 1135 | "url": "https://github.com/sebastianbergmann/diff.git", 1136 | "reference": "ba01945089c3a293b01ba9badc29ad55b106b0bc" 1137 | }, 1138 | "dist": { 1139 | "type": "zip", 1140 | "url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/ba01945089c3a293b01ba9badc29ad55b106b0bc", 1141 | "reference": "ba01945089c3a293b01ba9badc29ad55b106b0bc", 1142 | "shasum": "" 1143 | }, 1144 | "require": { 1145 | "php": ">=7.3" 1146 | }, 1147 | "require-dev": { 1148 | "phpunit/phpunit": "^9.3", 1149 | "symfony/process": "^4.2 || ^5" 1150 | }, 1151 | "type": "library", 1152 | "extra": { 1153 | "branch-alias": { 1154 | "dev-master": "4.0-dev" 1155 | } 1156 | }, 1157 | "autoload": { 1158 | "classmap": [ 1159 | "src/" 1160 | ] 1161 | }, 1162 | "notification-url": "https://packagist.org/downloads/", 1163 | "license": [ 1164 | "BSD-3-Clause" 1165 | ], 1166 | "authors": [ 1167 | { 1168 | "name": "Sebastian Bergmann", 1169 | "email": "sebastian@phpunit.de" 1170 | }, 1171 | { 1172 | "name": "Kore Nordmann", 1173 | "email": "mail@kore-nordmann.de" 1174 | } 1175 | ], 1176 | "description": "Diff implementation", 1177 | "homepage": "https://github.com/sebastianbergmann/diff", 1178 | "keywords": [ 1179 | "diff", 1180 | "udiff", 1181 | "unidiff", 1182 | "unified diff" 1183 | ], 1184 | "support": { 1185 | "issues": "https://github.com/sebastianbergmann/diff/issues", 1186 | "source": "https://github.com/sebastianbergmann/diff/tree/4.0.6" 1187 | }, 1188 | "funding": [ 1189 | { 1190 | "url": "https://github.com/sebastianbergmann", 1191 | "type": "github" 1192 | } 1193 | ], 1194 | "time": "2024-03-02T06:30:58+00:00" 1195 | 
}, 1196 | { 1197 | "name": "sebastian/environment", 1198 | "version": "5.1.5", 1199 | "source": { 1200 | "type": "git", 1201 | "url": "https://github.com/sebastianbergmann/environment.git", 1202 | "reference": "830c43a844f1f8d5b7a1f6d6076b784454d8b7ed" 1203 | }, 1204 | "dist": { 1205 | "type": "zip", 1206 | "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/830c43a844f1f8d5b7a1f6d6076b784454d8b7ed", 1207 | "reference": "830c43a844f1f8d5b7a1f6d6076b784454d8b7ed", 1208 | "shasum": "" 1209 | }, 1210 | "require": { 1211 | "php": ">=7.3" 1212 | }, 1213 | "require-dev": { 1214 | "phpunit/phpunit": "^9.3" 1215 | }, 1216 | "suggest": { 1217 | "ext-posix": "*" 1218 | }, 1219 | "type": "library", 1220 | "extra": { 1221 | "branch-alias": { 1222 | "dev-master": "5.1-dev" 1223 | } 1224 | }, 1225 | "autoload": { 1226 | "classmap": [ 1227 | "src/" 1228 | ] 1229 | }, 1230 | "notification-url": "https://packagist.org/downloads/", 1231 | "license": [ 1232 | "BSD-3-Clause" 1233 | ], 1234 | "authors": [ 1235 | { 1236 | "name": "Sebastian Bergmann", 1237 | "email": "sebastian@phpunit.de" 1238 | } 1239 | ], 1240 | "description": "Provides functionality to handle HHVM/PHP environments", 1241 | "homepage": "http://www.github.com/sebastianbergmann/environment", 1242 | "keywords": [ 1243 | "Xdebug", 1244 | "environment", 1245 | "hhvm" 1246 | ], 1247 | "support": { 1248 | "issues": "https://github.com/sebastianbergmann/environment/issues", 1249 | "source": "https://github.com/sebastianbergmann/environment/tree/5.1.5" 1250 | }, 1251 | "funding": [ 1252 | { 1253 | "url": "https://github.com/sebastianbergmann", 1254 | "type": "github" 1255 | } 1256 | ], 1257 | "time": "2023-02-03T06:03:51+00:00" 1258 | }, 1259 | { 1260 | "name": "sebastian/exporter", 1261 | "version": "4.0.6", 1262 | "source": { 1263 | "type": "git", 1264 | "url": "https://github.com/sebastianbergmann/exporter.git", 1265 | "reference": "78c00df8f170e02473b682df15bfcdacc3d32d72" 1266 | }, 1267 | "dist": { 
1268 | "type": "zip", 1269 | "url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/78c00df8f170e02473b682df15bfcdacc3d32d72", 1270 | "reference": "78c00df8f170e02473b682df15bfcdacc3d32d72", 1271 | "shasum": "" 1272 | }, 1273 | "require": { 1274 | "php": ">=7.3", 1275 | "sebastian/recursion-context": "^4.0" 1276 | }, 1277 | "require-dev": { 1278 | "ext-mbstring": "*", 1279 | "phpunit/phpunit": "^9.3" 1280 | }, 1281 | "type": "library", 1282 | "extra": { 1283 | "branch-alias": { 1284 | "dev-master": "4.0-dev" 1285 | } 1286 | }, 1287 | "autoload": { 1288 | "classmap": [ 1289 | "src/" 1290 | ] 1291 | }, 1292 | "notification-url": "https://packagist.org/downloads/", 1293 | "license": [ 1294 | "BSD-3-Clause" 1295 | ], 1296 | "authors": [ 1297 | { 1298 | "name": "Sebastian Bergmann", 1299 | "email": "sebastian@phpunit.de" 1300 | }, 1301 | { 1302 | "name": "Jeff Welch", 1303 | "email": "whatthejeff@gmail.com" 1304 | }, 1305 | { 1306 | "name": "Volker Dusch", 1307 | "email": "github@wallbash.com" 1308 | }, 1309 | { 1310 | "name": "Adam Harvey", 1311 | "email": "aharvey@php.net" 1312 | }, 1313 | { 1314 | "name": "Bernhard Schussek", 1315 | "email": "bschussek@gmail.com" 1316 | } 1317 | ], 1318 | "description": "Provides the functionality to export PHP variables for visualization", 1319 | "homepage": "https://www.github.com/sebastianbergmann/exporter", 1320 | "keywords": [ 1321 | "export", 1322 | "exporter" 1323 | ], 1324 | "support": { 1325 | "issues": "https://github.com/sebastianbergmann/exporter/issues", 1326 | "source": "https://github.com/sebastianbergmann/exporter/tree/4.0.6" 1327 | }, 1328 | "funding": [ 1329 | { 1330 | "url": "https://github.com/sebastianbergmann", 1331 | "type": "github" 1332 | } 1333 | ], 1334 | "time": "2024-03-02T06:33:00+00:00" 1335 | }, 1336 | { 1337 | "name": "sebastian/global-state", 1338 | "version": "5.0.7", 1339 | "source": { 1340 | "type": "git", 1341 | "url": "https://github.com/sebastianbergmann/global-state.git", 1342 
| "reference": "bca7df1f32ee6fe93b4d4a9abbf69e13a4ada2c9" 1343 | }, 1344 | "dist": { 1345 | "type": "zip", 1346 | "url": "https://api.github.com/repos/sebastianbergmann/global-state/zipball/bca7df1f32ee6fe93b4d4a9abbf69e13a4ada2c9", 1347 | "reference": "bca7df1f32ee6fe93b4d4a9abbf69e13a4ada2c9", 1348 | "shasum": "" 1349 | }, 1350 | "require": { 1351 | "php": ">=7.3", 1352 | "sebastian/object-reflector": "^2.0", 1353 | "sebastian/recursion-context": "^4.0" 1354 | }, 1355 | "require-dev": { 1356 | "ext-dom": "*", 1357 | "phpunit/phpunit": "^9.3" 1358 | }, 1359 | "suggest": { 1360 | "ext-uopz": "*" 1361 | }, 1362 | "type": "library", 1363 | "extra": { 1364 | "branch-alias": { 1365 | "dev-master": "5.0-dev" 1366 | } 1367 | }, 1368 | "autoload": { 1369 | "classmap": [ 1370 | "src/" 1371 | ] 1372 | }, 1373 | "notification-url": "https://packagist.org/downloads/", 1374 | "license": [ 1375 | "BSD-3-Clause" 1376 | ], 1377 | "authors": [ 1378 | { 1379 | "name": "Sebastian Bergmann", 1380 | "email": "sebastian@phpunit.de" 1381 | } 1382 | ], 1383 | "description": "Snapshotting of global state", 1384 | "homepage": "http://www.github.com/sebastianbergmann/global-state", 1385 | "keywords": [ 1386 | "global state" 1387 | ], 1388 | "support": { 1389 | "issues": "https://github.com/sebastianbergmann/global-state/issues", 1390 | "source": "https://github.com/sebastianbergmann/global-state/tree/5.0.7" 1391 | }, 1392 | "funding": [ 1393 | { 1394 | "url": "https://github.com/sebastianbergmann", 1395 | "type": "github" 1396 | } 1397 | ], 1398 | "time": "2024-03-02T06:35:11+00:00" 1399 | }, 1400 | { 1401 | "name": "sebastian/lines-of-code", 1402 | "version": "1.0.4", 1403 | "source": { 1404 | "type": "git", 1405 | "url": "https://github.com/sebastianbergmann/lines-of-code.git", 1406 | "reference": "e1e4a170560925c26d424b6a03aed157e7dcc5c5" 1407 | }, 1408 | "dist": { 1409 | "type": "zip", 1410 | "url": 
"https://api.github.com/repos/sebastianbergmann/lines-of-code/zipball/e1e4a170560925c26d424b6a03aed157e7dcc5c5", 1411 | "reference": "e1e4a170560925c26d424b6a03aed157e7dcc5c5", 1412 | "shasum": "" 1413 | }, 1414 | "require": { 1415 | "nikic/php-parser": "^4.18 || ^5.0", 1416 | "php": ">=7.3" 1417 | }, 1418 | "require-dev": { 1419 | "phpunit/phpunit": "^9.3" 1420 | }, 1421 | "type": "library", 1422 | "extra": { 1423 | "branch-alias": { 1424 | "dev-master": "1.0-dev" 1425 | } 1426 | }, 1427 | "autoload": { 1428 | "classmap": [ 1429 | "src/" 1430 | ] 1431 | }, 1432 | "notification-url": "https://packagist.org/downloads/", 1433 | "license": [ 1434 | "BSD-3-Clause" 1435 | ], 1436 | "authors": [ 1437 | { 1438 | "name": "Sebastian Bergmann", 1439 | "email": "sebastian@phpunit.de", 1440 | "role": "lead" 1441 | } 1442 | ], 1443 | "description": "Library for counting the lines of code in PHP source code", 1444 | "homepage": "https://github.com/sebastianbergmann/lines-of-code", 1445 | "support": { 1446 | "issues": "https://github.com/sebastianbergmann/lines-of-code/issues", 1447 | "source": "https://github.com/sebastianbergmann/lines-of-code/tree/1.0.4" 1448 | }, 1449 | "funding": [ 1450 | { 1451 | "url": "https://github.com/sebastianbergmann", 1452 | "type": "github" 1453 | } 1454 | ], 1455 | "time": "2023-12-22T06:20:34+00:00" 1456 | }, 1457 | { 1458 | "name": "sebastian/object-enumerator", 1459 | "version": "4.0.4", 1460 | "source": { 1461 | "type": "git", 1462 | "url": "https://github.com/sebastianbergmann/object-enumerator.git", 1463 | "reference": "5c9eeac41b290a3712d88851518825ad78f45c71" 1464 | }, 1465 | "dist": { 1466 | "type": "zip", 1467 | "url": "https://api.github.com/repos/sebastianbergmann/object-enumerator/zipball/5c9eeac41b290a3712d88851518825ad78f45c71", 1468 | "reference": "5c9eeac41b290a3712d88851518825ad78f45c71", 1469 | "shasum": "" 1470 | }, 1471 | "require": { 1472 | "php": ">=7.3", 1473 | "sebastian/object-reflector": "^2.0", 1474 | 
"sebastian/recursion-context": "^4.0" 1475 | }, 1476 | "require-dev": { 1477 | "phpunit/phpunit": "^9.3" 1478 | }, 1479 | "type": "library", 1480 | "extra": { 1481 | "branch-alias": { 1482 | "dev-master": "4.0-dev" 1483 | } 1484 | }, 1485 | "autoload": { 1486 | "classmap": [ 1487 | "src/" 1488 | ] 1489 | }, 1490 | "notification-url": "https://packagist.org/downloads/", 1491 | "license": [ 1492 | "BSD-3-Clause" 1493 | ], 1494 | "authors": [ 1495 | { 1496 | "name": "Sebastian Bergmann", 1497 | "email": "sebastian@phpunit.de" 1498 | } 1499 | ], 1500 | "description": "Traverses array structures and object graphs to enumerate all referenced objects", 1501 | "homepage": "https://github.com/sebastianbergmann/object-enumerator/", 1502 | "support": { 1503 | "issues": "https://github.com/sebastianbergmann/object-enumerator/issues", 1504 | "source": "https://github.com/sebastianbergmann/object-enumerator/tree/4.0.4" 1505 | }, 1506 | "funding": [ 1507 | { 1508 | "url": "https://github.com/sebastianbergmann", 1509 | "type": "github" 1510 | } 1511 | ], 1512 | "time": "2020-10-26T13:12:34+00:00" 1513 | }, 1514 | { 1515 | "name": "sebastian/object-reflector", 1516 | "version": "2.0.4", 1517 | "source": { 1518 | "type": "git", 1519 | "url": "https://github.com/sebastianbergmann/object-reflector.git", 1520 | "reference": "b4f479ebdbf63ac605d183ece17d8d7fe49c15c7" 1521 | }, 1522 | "dist": { 1523 | "type": "zip", 1524 | "url": "https://api.github.com/repos/sebastianbergmann/object-reflector/zipball/b4f479ebdbf63ac605d183ece17d8d7fe49c15c7", 1525 | "reference": "b4f479ebdbf63ac605d183ece17d8d7fe49c15c7", 1526 | "shasum": "" 1527 | }, 1528 | "require": { 1529 | "php": ">=7.3" 1530 | }, 1531 | "require-dev": { 1532 | "phpunit/phpunit": "^9.3" 1533 | }, 1534 | "type": "library", 1535 | "extra": { 1536 | "branch-alias": { 1537 | "dev-master": "2.0-dev" 1538 | } 1539 | }, 1540 | "autoload": { 1541 | "classmap": [ 1542 | "src/" 1543 | ] 1544 | }, 1545 | "notification-url": 
"https://packagist.org/downloads/", 1546 | "license": [ 1547 | "BSD-3-Clause" 1548 | ], 1549 | "authors": [ 1550 | { 1551 | "name": "Sebastian Bergmann", 1552 | "email": "sebastian@phpunit.de" 1553 | } 1554 | ], 1555 | "description": "Allows reflection of object attributes, including inherited and non-public ones", 1556 | "homepage": "https://github.com/sebastianbergmann/object-reflector/", 1557 | "support": { 1558 | "issues": "https://github.com/sebastianbergmann/object-reflector/issues", 1559 | "source": "https://github.com/sebastianbergmann/object-reflector/tree/2.0.4" 1560 | }, 1561 | "funding": [ 1562 | { 1563 | "url": "https://github.com/sebastianbergmann", 1564 | "type": "github" 1565 | } 1566 | ], 1567 | "time": "2020-10-26T13:14:26+00:00" 1568 | }, 1569 | { 1570 | "name": "sebastian/recursion-context", 1571 | "version": "4.0.5", 1572 | "source": { 1573 | "type": "git", 1574 | "url": "https://github.com/sebastianbergmann/recursion-context.git", 1575 | "reference": "e75bd0f07204fec2a0af9b0f3cfe97d05f92efc1" 1576 | }, 1577 | "dist": { 1578 | "type": "zip", 1579 | "url": "https://api.github.com/repos/sebastianbergmann/recursion-context/zipball/e75bd0f07204fec2a0af9b0f3cfe97d05f92efc1", 1580 | "reference": "e75bd0f07204fec2a0af9b0f3cfe97d05f92efc1", 1581 | "shasum": "" 1582 | }, 1583 | "require": { 1584 | "php": ">=7.3" 1585 | }, 1586 | "require-dev": { 1587 | "phpunit/phpunit": "^9.3" 1588 | }, 1589 | "type": "library", 1590 | "extra": { 1591 | "branch-alias": { 1592 | "dev-master": "4.0-dev" 1593 | } 1594 | }, 1595 | "autoload": { 1596 | "classmap": [ 1597 | "src/" 1598 | ] 1599 | }, 1600 | "notification-url": "https://packagist.org/downloads/", 1601 | "license": [ 1602 | "BSD-3-Clause" 1603 | ], 1604 | "authors": [ 1605 | { 1606 | "name": "Sebastian Bergmann", 1607 | "email": "sebastian@phpunit.de" 1608 | }, 1609 | { 1610 | "name": "Jeff Welch", 1611 | "email": "whatthejeff@gmail.com" 1612 | }, 1613 | { 1614 | "name": "Adam Harvey", 1615 | "email": 
"aharvey@php.net" 1616 | } 1617 | ], 1618 | "description": "Provides functionality to recursively process PHP variables", 1619 | "homepage": "https://github.com/sebastianbergmann/recursion-context", 1620 | "support": { 1621 | "issues": "https://github.com/sebastianbergmann/recursion-context/issues", 1622 | "source": "https://github.com/sebastianbergmann/recursion-context/tree/4.0.5" 1623 | }, 1624 | "funding": [ 1625 | { 1626 | "url": "https://github.com/sebastianbergmann", 1627 | "type": "github" 1628 | } 1629 | ], 1630 | "time": "2023-02-03T06:07:39+00:00" 1631 | }, 1632 | { 1633 | "name": "sebastian/resource-operations", 1634 | "version": "3.0.4", 1635 | "source": { 1636 | "type": "git", 1637 | "url": "https://github.com/sebastianbergmann/resource-operations.git", 1638 | "reference": "05d5692a7993ecccd56a03e40cd7e5b09b1d404e" 1639 | }, 1640 | "dist": { 1641 | "type": "zip", 1642 | "url": "https://api.github.com/repos/sebastianbergmann/resource-operations/zipball/05d5692a7993ecccd56a03e40cd7e5b09b1d404e", 1643 | "reference": "05d5692a7993ecccd56a03e40cd7e5b09b1d404e", 1644 | "shasum": "" 1645 | }, 1646 | "require": { 1647 | "php": ">=7.3" 1648 | }, 1649 | "require-dev": { 1650 | "phpunit/phpunit": "^9.0" 1651 | }, 1652 | "type": "library", 1653 | "extra": { 1654 | "branch-alias": { 1655 | "dev-main": "3.0-dev" 1656 | } 1657 | }, 1658 | "autoload": { 1659 | "classmap": [ 1660 | "src/" 1661 | ] 1662 | }, 1663 | "notification-url": "https://packagist.org/downloads/", 1664 | "license": [ 1665 | "BSD-3-Clause" 1666 | ], 1667 | "authors": [ 1668 | { 1669 | "name": "Sebastian Bergmann", 1670 | "email": "sebastian@phpunit.de" 1671 | } 1672 | ], 1673 | "description": "Provides a list of PHP built-in functions that operate on resources", 1674 | "homepage": "https://www.github.com/sebastianbergmann/resource-operations", 1675 | "support": { 1676 | "source": "https://github.com/sebastianbergmann/resource-operations/tree/3.0.4" 1677 | }, 1678 | "funding": [ 1679 | { 1680 | 
"url": "https://github.com/sebastianbergmann", 1681 | "type": "github" 1682 | } 1683 | ], 1684 | "time": "2024-03-14T16:00:52+00:00" 1685 | }, 1686 | { 1687 | "name": "sebastian/type", 1688 | "version": "3.2.1", 1689 | "source": { 1690 | "type": "git", 1691 | "url": "https://github.com/sebastianbergmann/type.git", 1692 | "reference": "75e2c2a32f5e0b3aef905b9ed0b179b953b3d7c7" 1693 | }, 1694 | "dist": { 1695 | "type": "zip", 1696 | "url": "https://api.github.com/repos/sebastianbergmann/type/zipball/75e2c2a32f5e0b3aef905b9ed0b179b953b3d7c7", 1697 | "reference": "75e2c2a32f5e0b3aef905b9ed0b179b953b3d7c7", 1698 | "shasum": "" 1699 | }, 1700 | "require": { 1701 | "php": ">=7.3" 1702 | }, 1703 | "require-dev": { 1704 | "phpunit/phpunit": "^9.5" 1705 | }, 1706 | "type": "library", 1707 | "extra": { 1708 | "branch-alias": { 1709 | "dev-master": "3.2-dev" 1710 | } 1711 | }, 1712 | "autoload": { 1713 | "classmap": [ 1714 | "src/" 1715 | ] 1716 | }, 1717 | "notification-url": "https://packagist.org/downloads/", 1718 | "license": [ 1719 | "BSD-3-Clause" 1720 | ], 1721 | "authors": [ 1722 | { 1723 | "name": "Sebastian Bergmann", 1724 | "email": "sebastian@phpunit.de", 1725 | "role": "lead" 1726 | } 1727 | ], 1728 | "description": "Collection of value objects that represent the types of the PHP type system", 1729 | "homepage": "https://github.com/sebastianbergmann/type", 1730 | "support": { 1731 | "issues": "https://github.com/sebastianbergmann/type/issues", 1732 | "source": "https://github.com/sebastianbergmann/type/tree/3.2.1" 1733 | }, 1734 | "funding": [ 1735 | { 1736 | "url": "https://github.com/sebastianbergmann", 1737 | "type": "github" 1738 | } 1739 | ], 1740 | "time": "2023-02-03T06:13:03+00:00" 1741 | }, 1742 | { 1743 | "name": "sebastian/version", 1744 | "version": "3.0.2", 1745 | "source": { 1746 | "type": "git", 1747 | "url": "https://github.com/sebastianbergmann/version.git", 1748 | "reference": "c6c1022351a901512170118436c764e473f6de8c" 1749 | }, 1750 | "dist": { 
1751 | "type": "zip", 1752 | "url": "https://api.github.com/repos/sebastianbergmann/version/zipball/c6c1022351a901512170118436c764e473f6de8c", 1753 | "reference": "c6c1022351a901512170118436c764e473f6de8c", 1754 | "shasum": "" 1755 | }, 1756 | "require": { 1757 | "php": ">=7.3" 1758 | }, 1759 | "type": "library", 1760 | "extra": { 1761 | "branch-alias": { 1762 | "dev-master": "3.0-dev" 1763 | } 1764 | }, 1765 | "autoload": { 1766 | "classmap": [ 1767 | "src/" 1768 | ] 1769 | }, 1770 | "notification-url": "https://packagist.org/downloads/", 1771 | "license": [ 1772 | "BSD-3-Clause" 1773 | ], 1774 | "authors": [ 1775 | { 1776 | "name": "Sebastian Bergmann", 1777 | "email": "sebastian@phpunit.de", 1778 | "role": "lead" 1779 | } 1780 | ], 1781 | "description": "Library that helps with managing the version number of Git-hosted PHP projects", 1782 | "homepage": "https://github.com/sebastianbergmann/version", 1783 | "support": { 1784 | "issues": "https://github.com/sebastianbergmann/version/issues", 1785 | "source": "https://github.com/sebastianbergmann/version/tree/3.0.2" 1786 | }, 1787 | "funding": [ 1788 | { 1789 | "url": "https://github.com/sebastianbergmann", 1790 | "type": "github" 1791 | } 1792 | ], 1793 | "time": "2020-09-28T06:39:44+00:00" 1794 | }, 1795 | { 1796 | "name": "theseer/tokenizer", 1797 | "version": "1.2.3", 1798 | "source": { 1799 | "type": "git", 1800 | "url": "https://github.com/theseer/tokenizer.git", 1801 | "reference": "737eda637ed5e28c3413cb1ebe8bb52cbf1ca7a2" 1802 | }, 1803 | "dist": { 1804 | "type": "zip", 1805 | "url": "https://api.github.com/repos/theseer/tokenizer/zipball/737eda637ed5e28c3413cb1ebe8bb52cbf1ca7a2", 1806 | "reference": "737eda637ed5e28c3413cb1ebe8bb52cbf1ca7a2", 1807 | "shasum": "" 1808 | }, 1809 | "require": { 1810 | "ext-dom": "*", 1811 | "ext-tokenizer": "*", 1812 | "ext-xmlwriter": "*", 1813 | "php": "^7.2 || ^8.0" 1814 | }, 1815 | "type": "library", 1816 | "autoload": { 1817 | "classmap": [ 1818 | "src/" 1819 | ] 1820 | 
}, 1821 | "notification-url": "https://packagist.org/downloads/", 1822 | "license": [ 1823 | "BSD-3-Clause" 1824 | ], 1825 | "authors": [ 1826 | { 1827 | "name": "Arne Blankerts", 1828 | "email": "arne@blankerts.de", 1829 | "role": "Developer" 1830 | } 1831 | ], 1832 | "description": "A small library for converting tokenized PHP source code into XML and potentially other formats", 1833 | "support": { 1834 | "issues": "https://github.com/theseer/tokenizer/issues", 1835 | "source": "https://github.com/theseer/tokenizer/tree/1.2.3" 1836 | }, 1837 | "funding": [ 1838 | { 1839 | "url": "https://github.com/theseer", 1840 | "type": "github" 1841 | } 1842 | ], 1843 | "time": "2024-03-03T12:36:25+00:00" 1844 | } 1845 | ], 1846 | "aliases": [], 1847 | "minimum-stability": "stable", 1848 | "stability-flags": {}, 1849 | "prefer-stable": false, 1850 | "prefer-lowest": false, 1851 | "platform": { 1852 | "php": ">=8.0", 1853 | "ext-mysqli": "*", 1854 | "ext-mbstring": "*", 1855 | "ext-intl": "*", 1856 | "ext-ctype": "*", 1857 | "ext-iconv": "*" 1858 | }, 1859 | "platform-dev": {}, 1860 | "plugin-api-version": "2.6.0" 1861 | } 1862 | -------------------------------------------------------------------------------- /sql/test-queries.sql: -------------------------------------------------------------------------------- 1 | # Create test tables 2 | # ------------------ 3 | CREATE TABLE IF NOT EXISTS vector_meta_test_table ( 4 | vector_id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, 5 | created TIMESTAMP DEFAULT CURRENT_TIMESTAMP 6 | ) ENGINE=InnoDB; 7 | CREATE TABLE IF NOT EXISTS vector_values_test_table ( 8 | id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, 9 | vector_id INT UNSIGNED NOT NULL, 10 | element_position INT, 11 | vector_value DOUBLE, 12 | FOREIGN KEY (vector_id) REFERENCES vector_meta_test_table(vector_id) 13 | ) ENGINE=InnoDB; 14 | CREATE INDEX vector_id_index_test_table ON vector_values_test_table (vector_id); 15 | CREATE INDEX element_position_index_test_table ON 
vector_values_test_table (element_position); 16 | 17 | # Create test data 18 | # ---------------- 19 | -- Inserting data into vector_meta_test_table 20 | INSERT INTO vector_meta_test_table (vector_id) VALUES (1); 21 | INSERT INTO vector_meta_test_table (vector_id) VALUES (2); 22 | INSERT INTO vector_meta_test_table (vector_id) VALUES (3); 23 | 24 | -- Inserting data into vector_values_test_table 25 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (1, 1, 0.5); 26 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (1, 2, 0.6); 27 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (1, 3, 0.7); 28 | 29 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (2, 1, 0.8); 30 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (2, 2, 0.9); 31 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (2, 3, 1.0); 32 | 33 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (3, 1, 1.1); 34 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (3, 2, 1.2); 35 | INSERT INTO vector_values_test_table (vector_id, element_position, vector_value) VALUES (3, 3, 1.3); -------------------------------------------------------------------------------- /src/Nlp/AddedToken.php: -------------------------------------------------------------------------------- 1 | content = $config['content']; 17 | $this->id = $config['id']; 18 | $this->singleWord = $config['single_word'] ?? false; 19 | $this->lstrip = $config['lstrip'] ?? false; 20 | $this->rstrip = $config['rstrip'] ?? false; 21 | $this->special = $config['special'] ?? false; 22 | $this->normalized = $config['normalized'] ?? 
null; 23 | } 24 | } -------------------------------------------------------------------------------- /src/Nlp/BertNormalizer.php: -------------------------------------------------------------------------------- 1 | true, 9 | 'strip_accents' => false, 10 | 'clean_text' => true, 11 | 'handle_chinese_chars' => true 12 | ]) 13 | { 14 | parent::__construct($config); 15 | } 16 | 17 | private function tokenizeChineseChars($text) { 18 | $output = ''; 19 | for ($i = 0; $i < mb_strlen($text); ++$i) { 20 | $char = mb_substr($text, $i, 1); 21 | $cp = mb_ord($char); 22 | 23 | if ($this->isChineseChar($cp)) { // Assuming isChineseChar is a function you define to check Chinese characters 24 | $output .= " " . $char; 25 | } else { 26 | $output .= $char; 27 | } 28 | } 29 | $output = trim($output); 30 | return preg_replace('/\s+/', ' ', $output); 31 | } 32 | 33 | private function isChineseChar($cp): bool { 34 | return ( 35 | ($cp >= 0x4E00 && $cp <= 0x9FFF) || 36 | ($cp >= 0x3400 && $cp <= 0x4DBF) || 37 | ($cp >= 0x20000 && $cp <= 0x2A6DF) || 38 | ($cp >= 0x2A700 && $cp <= 0x2B73F) || 39 | ($cp >= 0x2B740 && $cp <= 0x2B81F) || 40 | ($cp >= 0x2B820 && $cp <= 0x2CEAF) || 41 | ($cp >= 0xF900 && $cp <= 0xFAFF) || 42 | ($cp >= 0x2F800 && $cp <= 0x2FA1F) 43 | ); 44 | } 45 | 46 | private function stripAccents($text): string { 47 | // Normalize the text to decompose the accents 48 | $normalized = \Normalizer::normalize($text, \Normalizer::FORM_D); 49 | 50 | // Remove the accents using a regex and return the result 51 | return preg_replace('/\p{Mn}/u', '', $normalized); 52 | } 53 | 54 | private function isControlCharacter($char) : bool { 55 | if (in_array($char, ["\t", "\n", "\r"])) { 56 | // These are control characters but counted as whitespace characters. 
57 | return false; 58 | } 59 | 60 | // Check if the character is a control/format/private use/surrogate character 61 | return preg_match('/\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}/u', $char) === 1; 62 | } 63 | 64 | private function cleanText($text): string { 65 | $output = ''; 66 | for ($i = 0; $i < mb_strlen($text); ++$i) { 67 | $char = mb_substr($text, $i, 1); 68 | $cp = mb_ord($char); 69 | 70 | if ($cp === 0 || $cp === 0xFFFD || $this->isControlCharacter($char)) { 71 | continue; 72 | } 73 | 74 | if (preg_match('/^\s$/u', $char)) { // is whitespace 75 | $output .= " "; 76 | } else { 77 | $output .= $char; 78 | } 79 | } 80 | return $output; 81 | } 82 | 83 | public function normalize($text): string { 84 | if (!empty($this->config['clean_text'])) { 85 | $text = $this->cleanText($text); 86 | } 87 | 88 | if (!empty($this->config['handle_chinese_chars'])) { 89 | $text = $this->tokenizeChineseChars($text); 90 | } 91 | 92 | if (!empty($this->config['lowercase'])) { 93 | $text = mb_strtolower($text, 'UTF-8'); // multibyte-aware: strtolower() would leave non-ASCII capitals such as "É" untouched 94 | // When lowercasing, accents are stripped unless strip_accents is explicitly false (null/missing counts as "strip") 95 | if (($this->config['strip_accents'] ?? null) !== false) { 96 | $text = $this->stripAccents($text); 97 | } 98 | } elseif (!empty($this->config['strip_accents'])) { 99 | $text = $this->stripAccents($text); 100 | } 101 | 102 | return $text; 103 | } 104 | } -------------------------------------------------------------------------------- /src/Nlp/BertPreTokenizer.php: -------------------------------------------------------------------------------- 1 | pattern = "/[^\\s" . PUNCTUATION_REGEX . "]+|[" . PUNCTUATION_REGEX . "]/u"; 17 | } 18 | 19 | /** 20 | * Tokenizes a single text using the BERT pre-tokenizer 21 | * @param string $text 22 | * @param array $options 23 | * @return array 24 | */ 25 | public function preTokenizeText(string $text, array $options = []): array 26 | { 27 | $text = trim($text); 28 | preg_match_all($this->pattern, $text, $matches); 29 | 30 | return $matches[0] ?? 
[]; 31 | } 32 | } -------------------------------------------------------------------------------- /src/Nlp/BertTokenizer.php: -------------------------------------------------------------------------------- 1 | ' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"; 9 | protected array $tokenizerConfig; 10 | private BertNormalizer $normalizer; 11 | private BertPreTokenizer $preTokenizer; 12 | private WordpieceTokenizer $model; 13 | private TemplateProcessing $postProcessor; 14 | private WordPieceDecoder $decoder; 15 | private array $specialTokens; 16 | private array $allSpecialIds; 17 | private array $addedTokens; 18 | private ?string $addedTokensRegex; 19 | /** 20 | * @var array|mixed 21 | */ 22 | private mixed $additionalSpecialTokens; 23 | private $maskToken; 24 | /** 25 | * @var int|mixed 26 | */ 27 | private mixed $maskTokenId; 28 | private $padToken; 29 | /** 30 | * @var int|mixed 31 | */ 32 | private mixed $padTokenId; 33 | private $sepToken; 34 | /** 35 | * @var int|mixed 36 | */ 37 | private mixed $sepTokenId; 38 | private $unkToken; 39 | /** 40 | * @var int|mixed 41 | */ 42 | private mixed $unkTokenId; 43 | /** 44 | * @var int|mixed 45 | */ 46 | public mixed $modelMaxLength; 47 | /** 48 | * @var false|mixed 49 | */ 50 | private mixed $removeSpace; 51 | /** 52 | * @var mixed|true 53 | */ 54 | private mixed $cleanUpTokenizationSpaces; 55 | /** 56 | * @var false|mixed 57 | */ 58 | private mixed $doLowercaseAndRemoveAccent; 59 | private string $paddingSide; 60 | /** 61 | * @var false 62 | */ 63 | private bool $legacy; 64 | /** 65 | * @var mixed|null 66 | */ 67 | private mixed $chatTemplate; 68 | private array $compiledTemplateCache; 69 | 70 | public function __construct(array $tokenizerJSON, array $tokenizerConfig) { 71 | $this->tokenizerConfig = $tokenizerConfig; 72 | 73 | $this->normalizer = new BertNormalizer($tokenizerJSON['normalizer']); 74 | 
$this->preTokenizer = new BertPreTokenizer($tokenizerJSON['pre_tokenizer']); 75 | 76 | $this->model = new WordpieceTokenizer($tokenizerJSON['model']); 77 | $this->postProcessor = new TemplateProcessing($tokenizerJSON['post_processor']); 78 | 79 | $this->decoder = new WordPieceDecoder($tokenizerJSON['decoder']); 80 | 81 | $this->specialTokens = []; 82 | $this->allSpecialIds = []; 83 | 84 | $this->addedTokens = []; 85 | foreach ($tokenizerJSON['added_tokens'] as $addedToken) { 86 | $token = new AddedToken($addedToken); 87 | $this->addedTokens[] = $token; 88 | 89 | $this->model->tokensToIds[$token->content] = $token->id; 90 | $this->model->vocab[$token->id] = $token->content; 91 | 92 | if($token->special) { 93 | $this->specialTokens[] = $token->content; 94 | $this->allSpecialIds[] = $token->id; 95 | } 96 | } 97 | 98 | //Update additional special tokens 99 | $this->additionalSpecialTokens = $tokenizerConfig['additional_special_tokens'] ?? []; 100 | $this->specialTokens = array_merge($this->specialTokens, $this->additionalSpecialTokens); 101 | $this->specialTokens = array_unique($this->specialTokens); 102 | 103 | if(!empty($this->decoder)) { 104 | // Slight hack, but it prevents code duplication 105 | $this->decoder->addedTokens = $this->addedTokens; 106 | 107 | // Another slight hack to add `end_of_word_suffix` (if present) to the decoder 108 | // This is needed for cases where BPE model and ByteLevel decoder are used 109 | // For more information, see https://github.com/xenova/transformers.js/issues/74 110 | // TODO: save this to the decoder when exporting? 111 | $this->decoder->endOfWordSuffix = $this->model->endOfWordSuffix; 112 | } 113 | 114 | if (count($this->addedTokens) > 0) { 115 | $regexParts = array_map(function ($token) { 116 | $lstrip = $token->lstrip ? '\\s*' : ''; 117 | $rstrip = $token->rstrip ? '\\s*' : ''; 118 | $content = preg_quote($token->content, '/'); 119 | 120 | return $lstrip . '(' . $content . ')' . 
$rstrip; 121 | }, $this->addedTokens); 122 | 123 | $this->addedTokensRegex = '/' . implode('|', $regexParts) . '/'; 124 | } else { 125 | $this->addedTokensRegex = null; 126 | } 127 | 128 | $this->maskToken = $this->getToken('mask_token'); 129 | $this->maskTokenId = $this->model->tokensToIds[$this->maskToken]; 130 | 131 | $this->padToken = $this->getToken('pad_token', 'eos_token'); 132 | $this->padTokenId = $this->model->tokensToIds[$this->padToken]; 133 | 134 | $this->sepToken = $this->getToken('sep_token'); 135 | $this->sepTokenId = $this->model->tokensToIds[$this->sepToken]; 136 | 137 | $this->unkToken = $this->getToken('unk_token'); 138 | $this->unkTokenId = $this->model->tokensToIds[$this->unkToken]; 139 | 140 | $this->modelMaxLength = $tokenizerConfig['model_max_length'] ?? 512; 141 | 142 | $this->removeSpace = $tokenizerConfig['remove_space'] ?? false; 143 | 144 | $this->cleanUpTokenizationSpaces = $tokenizerConfig['clean_up_tokenization_spaces'] ?? true; 145 | $this->doLowercaseAndRemoveAccent = $tokenizerConfig['do_lowercase_and_remove_accent'] ?? false; 146 | 147 | $this->paddingSide = 'right'; 148 | 149 | $this->legacy = false; 150 | 151 | $this->chatTemplate = $tokenizerConfig['chat_template'] ?? null; 152 | $this->compiledTemplateCache = []; 153 | } 154 | 155 | /** 156 | * Returns the value of the first matching key in the tokenizer config object. 157 | * @param ...$keys string keys to search for. 158 | * @return mixed|null The value of the first matching key, or null if no key is found. 159 | * @throws \Exception If an object is found for a matching key and its __type property is not 'AddedToken'. 
160 | */ 161 | public function getToken(...$keys): mixed 162 | { 163 | foreach ($keys as $key) { 164 | if (!isset($this->tokenizerConfig[$key])) { 165 | continue; 166 | } 167 | 168 | $item = $this->tokenizerConfig[$key]; 169 | 170 | if (is_array($item)) { 171 | if (isset($item['__type']) && $item['__type'] === 'AddedToken') { 172 | return $item['content']; 173 | } else { 174 | throw new \Exception("Unknown token: " . json_encode($item)); 175 | } 176 | } else { 177 | return $item; 178 | } 179 | } 180 | return null; 181 | } 182 | 183 | /** 184 | * This function can be overridden by a subclass to apply additional preprocessing steps to the inputs. 185 | * @param $inputs array The inputs to preprocess. 186 | * @return array The modified inputs. 187 | * @throws \Exception If input ids are not an array. 188 | */ 189 | public function prepareModelInputs($inputs): array 190 | { 191 | return $this->addTokenTypes($inputs); 192 | } 193 | 194 | /** 195 | * Helper method for adding `token_type_ids` to model inputs. 196 | * 197 | * @param array $inputs An associative array containing the input ids and attention mask. 198 | * @return array The prepared inputs array. 199 | * @throws \Exception If input ids are not an array. 
200 | */ 201 | private function addTokenTypes(array $inputs): array 202 | { 203 | if (!is_array($inputs['input_ids'])) { 204 | throw new \Exception('Input ids must be an array'); 205 | } 206 | 207 | if (is_array($inputs['input_ids'][0])) { 208 | // Input is batched, so batch the token_type_ids as well 209 | $inputs['token_type_ids'] = array_map(function($x) { 210 | return array_fill(0, count($x), 0); 211 | }, $inputs['input_ids']); 212 | } else { 213 | // Single input 214 | $inputs['token_type_ids'] = array_fill(0, count($inputs['input_ids']), 0); 215 | } 216 | 217 | return $inputs; 218 | } 219 | 220 | public function call(string|array $text, array $options = [ 221 | 'text_pair' => null, 222 | 'add_special_tokens' => true, 223 | 'padding' => false, 224 | 'truncation' => null, 225 | 'max_length' => null, 226 | 'return_tensor' => false, // Different to HF 227 | ]): array 228 | { 229 | $textPair = $options['text_pair'] ?? null; 230 | $addSpecialTokens = $options['add_special_tokens'] ?? true; 231 | $padding = $options['padding'] ?? false; 232 | $truncation = $options['truncation'] ?? null; 233 | $maxLength = $options['max_length'] ?? null; 234 | $returnTensor = $options['return_tensor'] ?? 
true; 235 | 236 | $tokens = []; 237 | 238 | if(is_array($text)) { 239 | if(count($text) === 0) { 240 | throw new \Exception('Input is empty'); 241 | } 242 | 243 | if($textPair !== null) { 244 | if(!is_array($textPair)) { 245 | throw new \Exception('`text_pair` must be an array'); 246 | } else if(count($text) !== count($textPair)) { 247 | throw new \Exception('`text` and `text_pair` must have the same length'); 248 | } 249 | 250 | foreach ($text as $i => $t) { 251 | $tokens[] = $this->encode($t, $textPair[$i], $options); 252 | } 253 | } else { 254 | foreach ($text as $x) { 255 | $tokens[] = $this->encode($x, null, $options); 256 | } 257 | } 258 | } else { 259 | if($text === null) { 260 | throw new \Exception('text may not be null'); 261 | } 262 | 263 | if(is_array($textPair)) { 264 | throw new \Exception('When specifying `text_pair`, since `text` is a string, `text_pair` must also be a string (i.e., not an array).' ); 265 | } 266 | 267 | $tokens[] = $this->encode($text, $textPair, $options); 268 | } 269 | 270 | if($maxLength === null) { 271 | if($padding === 'max_length') { 272 | $maxLength = $this->modelMaxLength; 273 | } else { 274 | $maxLength = max(array_map(function($x) { return count($x); }, $tokens)); 275 | } 276 | } 277 | 278 | $maxLength = min($maxLength, $this->modelMaxLength); 279 | 280 | $attentionMask = []; 281 | if ($padding || $truncation) { 282 | for ($i = 0; $i < count($tokens); ++$i) { 283 | if (count($tokens[$i]) === $maxLength) { 284 | $attentionMask[] = array_fill(0, count($tokens[$i]), 1); 285 | } elseif (count($tokens[$i]) > $maxLength) { 286 | // Possibly truncate; each entry is an array of token ids, so slice it (substr() on an array throws a TypeError) 287 | if ($truncation) { 288 | $tokens[$i] = array_slice($tokens[$i], 0, $maxLength); 289 | } 290 | $attentionMask[] = array_fill(0, count($tokens[$i]), 1); 291 | } else { 292 | // Token length < max_length 293 | $diff = $maxLength - count($tokens[$i]); 294 | if ($padding) { 295 | if ($this->paddingSide === 'right') { 296 | $attentionMask[] = array_merge(array_fill(0, count($tokens[$i]), 
1), array_fill(0, $diff, 0)); 297 | for ($j = 0; $j < $diff; $j++) { 298 | $tokens[$i][] = $this->padTokenId; 299 | } 300 | } else { 301 | // Padding on the left 302 | $attentionMask[] = array_merge(array_fill(0, $diff, 0), array_fill(0, count($tokens[$i]), 1)); 303 | $paddingTokens = array_fill(0, $diff, $this->padTokenId); 304 | foreach ($paddingTokens as $paddingToken) { 305 | array_unshift($tokens[$i], $paddingToken); 306 | } 307 | } 308 | } else { 309 | $attentionMask[] = array_fill(0, count($tokens[$i]), 1); 310 | } 311 | } 312 | } 313 | } else { 314 | foreach ($tokens as $token) { 315 | $attentionMask[] = array_fill(0, count($token), 1); 316 | } 317 | } 318 | 319 | // Not going to bother with the `return_tensors` option for now 320 | // todo: add `return_tensors` option 321 | 322 | if(!is_array($text)) { 323 | $tokens = $tokens[0]; 324 | $attentionMask = $attentionMask[0]; 325 | } 326 | 327 | $modelInputs = [ 328 | 'input_ids' => $tokens, 329 | 'attention_mask' => $attentionMask, 330 | ]; 331 | 332 | return $this->prepareModelInputs($modelInputs); 333 | } 334 | 335 | /** 336 | * Helper function to remove accents from a string. 337 | * 338 | * @param string $text The text to remove accents from. 339 | * @return string The text with accents removed. 340 | */ 341 | private function removeAccents(string $text): string 342 | { 343 | return iconv('UTF-8', 'ASCII//TRANSLIT', $text); 344 | } 345 | 346 | /** 347 | * Helper function to lowercase a string and remove accents. 348 | * 349 | * @param string $text The text to lowercase and remove accents from. 350 | * @return string The lowercased text with accents removed. 
351 | */ 352 | private function lowercaseAndRemoveAccent(string $text): string 353 | { 354 | return $this->removeAccents(mb_strtolower($text, 'UTF-8')); 355 | } 356 | 357 | /** 358 | * Encodes a single text using the preprocessor pipeline of the tokenizer 359 | * @param $text string|null the text to encode 360 | * @return array|null the encoded tokens 361 | * @throws \Exception 362 | */ 363 | private function encodeText(?string $text): ?array { 364 | if ($text === null) return null; 365 | 366 | // Split text on the added-tokens regex, if available; DELIM_CAPTURE keeps the matched added tokens as their own sections 367 | $sections = $this->addedTokensRegex ? preg_split($this->addedTokensRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) : [$text]; 368 | $sections = array_filter($sections, static fn ($s): bool => $s !== ''); // Filter out empty strings only; a "0" section must be kept 369 | 370 | $tokens = []; 371 | foreach ($sections as $sectionIndex => $x) { 372 | $postFilter = array_filter($this->addedTokens, function($t) use ($x) { 373 | return $t->content === $x; 374 | }); 375 | $addedToken = reset($postFilter); 376 | if (!empty($addedToken)) { 377 | $tokens[] = $x; 378 | } else { 379 | // Process the section 380 | if ($this->removeSpace === true) { 381 | $x = preg_replace('/\s+/', ' ', trim($x)); 382 | } 383 | if ($this->doLowercaseAndRemoveAccent) { 384 | $x = $this->lowercaseAndRemoveAccent($x); 385 | } 386 | 387 | if ($this->normalizer !== null) { 388 | $x = $this->normalizer->normalize($x); 389 | } 390 | 391 | $sectionTokens = ($this->preTokenizer !== null) ? $this->preTokenizer->preTokenize($x) : [$x]; 392 | $modelTokens = $this->model->encode($sectionTokens); 393 | 394 | $tokens = array_merge($tokens, $modelTokens); 395 | } 396 | } 397 | 398 | return $tokens; 399 | } 400 | 401 | /** 402 | * Encodes a single text or a pair of texts using the model's tokenizer. 
/**
 * Encodes a single text, or a pair of texts, into token IDs.
 *
 * Both texts go through encodeText(). When a post-processor is configured and
 * `add_special_tokens` is enabled, the post-processor combines the two token lists;
 * otherwise they are simply concatenated. The combined tokens are then mapped to IDs
 * by the model.
 *
 * @param string $text The text to encode.
 * @param string|null $textPair The second text to encode; null or '' means "no pair".
 * @param array $options Encoding options. `add_special_tokens` (default true) controls
 *                       whether the post-processor is applied.
 * @return array The token IDs of the combined sequence.
 * @throws \Exception
 */
public function encode(string $text, ?string $textPair = null, array $options = []): array
{
    $addSpecialTokens = $options['add_special_tokens'] ?? true;

    $tokens = $this->encodeText($text);

    // Compare explicitly: empty() would also treat the valid pair "0" as missing.
    $hasPair = $textPair !== null && $textPair !== '';
    $tokens2 = $hasPair ? $this->encodeText($textPair) : [];

    // TODO: Improve `add_special_tokens` and ensure correctness
    if ($this->postProcessor !== null && $addSpecialTokens) {
        $combinedTokens = $this->postProcessor->postProcess($tokens, $tokens2);
    } else {
        $combinedTokens = array_merge($tokens ?? [], $tokens2 ?? []);
    }

    return $this->model->convertTokensToIds($combinedTokens);
}

/**
 * Decode a batch of tokenized sequences.
 *
 * Each sequence is decoded independently with decode(); results are returned in
 * iteration order as a freshly indexed list.
 *
 * @param array $batch List of tokenized input sequences.
 * @param array $decodeArgs Optional decoding arguments, forwarded unchanged to decode().
 * @return array List of decoded sequences.
 */
public function batchDecode($batch, $decodeArgs = []) {
    $decoded = [];
    foreach ($batch as $sequence) {
        $decoded[] = $this->decode($sequence, $decodeArgs);
    }
    return $decoded;
}

/**
 * Decodes a sequence of token IDs back to a string.
 *
 * @param array|int[] $tokenIds List of token IDs to decode.
 * @param array $decodeArgs {
 *     Optional. Arguments for decoding.
 *
 *     @type bool $skipSpecialTokens If true, special tokens are removed from the output string.
 *     @type bool $cleanUpTokenizationSpaces If true, spaces before punctuations are removed.
 * }
 * @return string The decoded string.
 * @throws \Exception If `tokenIds` is not a non-empty array of integers.
450 | */ 451 | public function decode(array $tokenIds, array $decodeArgs = []): string 452 | { 453 | if (!is_array($tokenIds) || count($tokenIds) === 0 || !$this->isIntegralNumber($tokenIds[0])) { 454 | throw new \Exception("tokenIds must be a non-empty array of integers."); 455 | } 456 | 457 | return $this->decodeSingle($tokenIds, $decodeArgs); // Assuming 'decodeSingle' is a method defined for decoding 458 | } 459 | 460 | /** 461 | * Check if a value is an integer. 462 | * 463 | * @param mixed $x The value to check. 464 | * @return bool True if the value is an integer, false otherwise. 465 | */ 466 | private function isIntegralNumber(mixed $x): bool 467 | { 468 | return is_int($x) || is_string($x) && ctype_digit($x); 469 | } 470 | 471 | /** 472 | * Decode a single list of token ids to a string. 473 | * 474 | * @param array $tokenIds List of token ids to decode. 475 | * @param array $decodeArgs { 476 | * Optional arguments for decoding. 477 | * 478 | * @type bool $skipSpecialTokens Whether to skip special tokens during decoding. 479 | * @type bool|null $cleanUpTokenizationSpaces Whether to clean up tokenization spaces during decoding. 480 | * } 481 | * @return string The decoded string. 482 | * @throws \Exception 483 | */ 484 | public function decodeSingle(array $tokenIds, array $decodeArgs = []): string 485 | { 486 | $skipSpecialTokens = $decodeArgs['skip_special_tokens'] ?? false; 487 | $cleanUpTokenizationSpaces = $decodeArgs['clean_up_tokenization_spaces'] ?? $this->cleanUpTokenizationSpaces ?? true; 488 | 489 | $tokens = $this->model->convertIdsToTokens($tokenIds); 490 | 491 | if ($skipSpecialTokens) { 492 | $tokens = array_filter($tokens, function($token) { 493 | return !in_array($token, $this->specialTokens); 494 | }); 495 | } 496 | 497 | $decoded = $this->decoder ? 
$this->decoder->decode($tokens) : implode(' ', $tokens); 498 | 499 | if ($this->decoder && $this->decoder->endOfWordSuffix) { 500 | $decoded = str_replace($this->decoder->endOfWordSuffix, ' ', $decoded); 501 | if ($skipSpecialTokens) { 502 | $decoded = trim($decoded); 503 | } 504 | } 505 | 506 | if ($cleanUpTokenizationSpaces) { 507 | $decoded = $this->cleanUpTokenization($decoded); 508 | } 509 | 510 | return $decoded; 511 | } 512 | 513 | /** 514 | * Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. 515 | * 516 | * @param string $text The text to clean up. 517 | * @return string The cleaned up text. 518 | */ 519 | private function cleanUpTokenization(string $text): string 520 | { 521 | $patterns = ['/ \./', '/ \?/', '/ \!/', '/ ,/', "/ ' /", "/ n't/", "/ 'm/", "/ 's/", "/ 've/", "/ 're/"]; 522 | $replacements = ['.', '?', '!', ',', "'", "n't", "'m", "'s", "'ve", "'re"]; 523 | 524 | return preg_replace($patterns, $replacements, $text); 525 | } 526 | 527 | /** 528 | * Get the default chat template. 529 | * 530 | * @return mixed The default chat template. 531 | */ 532 | public function getDefaultChatTemplate(): mixed 533 | { 534 | if (!$this->warnedAboutChatTemplate) { 535 | // Log the warning here (use your preferred logging method) 536 | error_log( 537 | "No chat template is defined for this tokenizer - using a default chat template " . 538 | "that implements the ChatML format. If the default is not appropriate for " . 539 | "your model, please set `tokenizer.chat_template` to an appropriate template. " . 540 | "See https://huggingface.co/docs/transformers/main/chat_templating for more information." 
541 | ); 542 | $this->warnedAboutChatTemplate = true; 543 | } 544 | 545 | return $this->defaultChatTemplate; 546 | } 547 | } -------------------------------------------------------------------------------- /src/Nlp/Decoder.php: -------------------------------------------------------------------------------- 1 | config = $config; 15 | 16 | $this->addedTokens = []; 17 | $this->endOfWordSuffix = null; 18 | $this->trimOffsets = $config['trim_offsets'] ?? false; 19 | } 20 | 21 | /** 22 | * Decode a list of tokens into a string. 23 | * @param array $tokens The list of tokens to decode. 24 | * @return string The decoded string. 25 | * @throws \Exception 26 | */ 27 | public function decode(array $tokens): string 28 | { 29 | $decoded = $this->decodeChain($tokens); 30 | return join('', $decoded); 31 | } 32 | 33 | /** 34 | * Apply the decoder to a list of tokens. 35 | * @param array $tokens 36 | * @return array The decoded list of tokens. 37 | * @throws \Exception 38 | */ 39 | public function decodeChain(array $tokens): array { 40 | throw new \Exception('Not implemented'); 41 | } 42 | } -------------------------------------------------------------------------------- /src/Nlp/Embedder.php: -------------------------------------------------------------------------------- 1 | model = new Model(__DIR__ . '/model_quantized.onnx'); 25 | 26 | // load tokenizer configuration 27 | $tokenizerConfig = json_decode(file_get_contents(__DIR__ . '/tokenizer_config.json'), true); 28 | $tokenizerJSON = json_decode(file_get_contents(__DIR__ . '/tokenizer.json'), true); 29 | 30 | // load BertTokenizer 31 | $this->tokenizer = new BertTokenizer($tokenizerJSON, $tokenizerConfig); 32 | } 33 | 34 | public function getInputs(): array { 35 | return $this->model->inputs(); 36 | } 37 | 38 | public function getOutputs(): array { 39 | return $this->model->outputs(); 40 | } 41 | 42 | /** 43 | * Returns the number of dimensions of the output vector. 
/**
 * Returns the number of dimensions of the output vector.
 *
 * Reads the third entry of the first model output's shape.
 * NOTE(review): presumably that shape is (batch, tokens, hidden) - confirm against the model.
 *
 * @return int
 */
public function getDimensions(): int
{
    return $this->model->outputs()[0]['shape'][2];
}

/**
 * Calculates the embedding of a batch of texts.
 *
 * @param array $text Batch of text to embed.
 * @param bool $prependQuery When true, each text is prefixed with the query instruction
 *                           (self::QUERY_INSTRUCTION followed by a space) before tokenizing.
 * @return array The model's `last_hidden_state` output for the batch.
 * @throws \Exception
 */
public function embed(array $text, bool $prependQuery = false): array
{
    if ($prependQuery) {
        // Add query instruction to text
        $text = array_map(
            static fn ($t) => self::QUERY_INSTRUCTION . ' ' . $t,
            $text
        );
    }

    // NOTE(review): the option key is spelled 'return_tensor' here - confirm the tokenizer
    // expects this spelling; it is kept exactly as it was.
    $tokens = $this->tokenizer->call($text, [
        'text_pair' => null,
        'add_special_tokens' => true,
        'padding' => true,
        'truncation' => true,
        'max_length' => null,
        'return_tensor' => false
    ]);

    $outputs = $this->model->predict($tokens, outputNames: ['last_hidden_state']);

    return $outputs['last_hidden_state'];
}

/**
 * Sum of the element-wise products of two vectors (paired by position, keys are ignored).
 *
 * @param array $a
 * @param array $b
 * @return float
 */
private function dotProduct(array $a, array $b): float
{
    return \array_sum(\array_map(static fn ($x, $y) => $x * $y, $a, $b));
}

/**
 * Euclidean (L2) norm of a vector.
 *
 * @param array $a
 * @return float
 */
private function l2Norm(array $a): float
{
    return \sqrt(\array_sum(\array_map(static fn ($x) => $x * $x, $a)));
}

/**
 * Cosine distance between two vectors: 1 - (a . b) / (|a| * |b|).
 *
 * @param array $a
 * @param array $b
 * @return float 0.0 for vectors pointing the same way, 1.0 for orthogonal vectors, and
 *               1.0 when either vector has zero magnitude (direction undefined).
 */
private function cosine(array $a, array $b): float
{
    $denominator = $this->l2Norm($a) * $this->l2Norm($b);

    // Dividing by zero throws DivisionByZeroError in PHP 8; a zero-length vector has no
    // direction, so report maximal "no similarity" distance instead of crashing.
    if ($denominator === 0.0) {
        return 1.0;
    }

    return 1.0 - ($this->dotProduct($a, $b) / $denominator);
}

/**
 * Calculates the cosine similarity between two vectors.
101 | * @param array $a 102 | * @param array $b 103 | * @return float 104 | */ 105 | public function getCosineSimilarity(array $a, array $b): float { 106 | return 1.0 - $this->cosine($a, $b); 107 | } 108 | 109 | public function getMaxLength(): int { 110 | return $this->tokenizer->modelMaxLength; 111 | } 112 | } -------------------------------------------------------------------------------- /src/Nlp/PostProcessor.php: -------------------------------------------------------------------------------- 1 | config = $config; 11 | } 12 | 13 | public function postProcess(array $tokens, ...$args): array 14 | { 15 | throw new \Exception('Not implemented'); 16 | } 17 | } -------------------------------------------------------------------------------- /src/Nlp/PreTokenizer.php: -------------------------------------------------------------------------------- 1 | config = $config; 12 | } 13 | 14 | /** 15 | * Method that should be implemented by the child class 16 | * @param string $text 17 | * @param array $options 18 | * @return array 19 | * @throws \Exception 20 | */ 21 | public function preTokenizeText(string $text, array $options = []): array { 22 | throw new \Exception('Not implemented'); 23 | } 24 | 25 | /** 26 | * Tokenizes the given text into pre-tokens 27 | * @param string|array $text The text or array of text 28 | * @param array $options Options for the pre-tokenizer 29 | * @return array Array of pre-tokens 30 | * @throws \Exception 31 | */ 32 | public function preTokenize(string|array $text, array $options = []): array { 33 | $result = []; 34 | 35 | if (is_array($text)) { 36 | foreach ($text as $item) { 37 | $result[] = $this->preTokenizeText($item, $options); 38 | } 39 | } else { 40 | $result = $this->preTokenizeText($text, $options); 41 | } 42 | 43 | // Flatten the result if it's an array of arrays 44 | if (count($result) > 0 && is_array($result[0])) { 45 | $result = array_merge(...$result); 46 | } 47 | 48 | return $result; 49 | } 50 | } 
-------------------------------------------------------------------------------- /src/Nlp/TemplateProcessing.php: -------------------------------------------------------------------------------- 1 | single = $config['single'] ?? []; 17 | $this->pair = $config['pair'] ?? []; 18 | } 19 | 20 | /** 21 | * Replaces special tokens in the template with actual tokens. 22 | * @param array $tokens The list of tokens for the first sequence. 23 | * @param array ...$args The list of tokens for the second sequence. Optional. 24 | * @return array The list of tokens with replaced special tokens. 25 | */ 26 | public function postProcess(array $tokens, ...$args): array { 27 | $tokensPair = $args[0] ?? null; 28 | $type = empty($tokensPair) ? $this->single : $this->pair; 29 | 30 | $toReturn = []; 31 | foreach ($type as $item) { 32 | if (isset($item['SpecialToken'])) { 33 | $toReturn[] = $item['SpecialToken']['id']; 34 | 35 | } elseif (isset($item['Sequence'])) { 36 | if ($item['Sequence']['id'] === 'A') { 37 | $toReturn = array_merge($toReturn, $tokens); 38 | 39 | } elseif ($item['Sequence']['id'] === 'B') { 40 | $toReturn = array_merge($toReturn, $tokensPair); 41 | } 42 | } 43 | } 44 | return $toReturn; 45 | } 46 | } -------------------------------------------------------------------------------- /src/Nlp/TextNormalizer.php: -------------------------------------------------------------------------------- 1 | config = $config; 12 | } 13 | 14 | /** 15 | * Normalize the input text 16 | * @param string $text The text to normalize 17 | * @return string The normalized text 18 | * @throws \Exception If normalize is not implemented in a child class 19 | */ 20 | public function normalize(string $text): string { 21 | throw new \Exception("normalize should be implemented in a child class"); 22 | } 23 | 24 | protected function _call(string $text) { 25 | return $this->normalize($text); 26 | } 27 | } -------------------------------------------------------------------------------- 
/src/Nlp/TokenizerModel.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | public array $vocab; 13 | 14 | /** 15 | * A map from tokens to their ids. 16 | * @var array 17 | */ 18 | public array $tokensToIds; 19 | protected $unkTokenId; 20 | protected $unkToken; 21 | public $endOfWordSuffix; 22 | 23 | /** 24 | * Whether to fuse unknown tokens when encoding. Defaults to false. 25 | * @var bool 26 | */ 27 | protected mixed $fuseUnk; 28 | /** 29 | * @var int 30 | */ 31 | protected mixed $maxInputCharsPerWord; 32 | 33 | public function __construct(array $config = []) { 34 | $this->config = $config; 35 | 36 | $this->endOfWordSuffix = null; 37 | $this->fuseUnk = $config['fuse_unk'] ?? false; 38 | 39 | $this->tokensToIds = $this->objectToMap($config['vocab']); 40 | $this->unkTokenId = $this->tokensToIds[$config['unk_token']]; 41 | $this->unkToken = $config['unk_token']; 42 | $this->maxInputCharsPerWord = $config['max_input_chars_per_word'] ?? 100; 43 | 44 | /** 45 | * An array of tokens. 46 | * @var array $vocab 47 | */ 48 | $this->vocab = []; 49 | foreach ($this->tokensToIds as $key => $value) { 50 | $this->vocab[$value] = $key; 51 | } 52 | } 53 | 54 | /** 55 | * Encodes a list of tokens into a list of token IDs. 56 | * @param array $tokens 57 | * @return array 58 | * @throws \Exception 59 | */ 60 | public function encode(array $tokens): array { 61 | throw new \Exception('Not implemented'); 62 | } 63 | 64 | /** 65 | * Converts a list of tokens to a list of token IDs. 66 | * @param array $tokens The list of tokens to convert. 67 | * @return array The converted list of token IDs. 68 | */ 69 | public function convertTokensToIds(array $tokens): array { 70 | $ids = []; 71 | foreach ($tokens as $token) { 72 | $ids[] = $this->tokensToIds[$token] ?? $this->unkTokenId; 73 | } 74 | 75 | if($this->fuseUnk) { 76 | // Fuse unknown tokens. 
77 | $ids = $this->fuse($ids, $this->unkTokenId); 78 | } 79 | 80 | return $ids; 81 | } 82 | 83 | /** 84 | * Helper function to fuse consecutive values in an array equal to the specified value. 85 | * @param array $arr The array to fuse. 86 | * @param mixed $value The value to fuse on. 87 | * @return array 88 | */ 89 | protected function fuse(array $arr, mixed $value): array { 90 | $fused = []; 91 | $i = 0; 92 | $length = count($arr); 93 | 94 | while ($i < $length) { 95 | $fused[] = $arr[$i]; 96 | if ($arr[$i] !== $value) { 97 | ++$i; 98 | continue; 99 | } 100 | 101 | while ($i < $length && $arr[$i] === $value) { 102 | ++$i; 103 | } 104 | } 105 | 106 | return $fused; 107 | } 108 | 109 | /** 110 | * Converts a list of token IDs to a list of tokens. 111 | * @param array $ids The list of token IDs to convert. 112 | * @return array The converted list of tokens. 113 | */ 114 | public function convertIdsToTokens(array $ids): array { 115 | $tokens = []; 116 | foreach ($ids as $id) { 117 | $tokens[] = $this->vocab[$id] ?? $this->unkToken; 118 | } 119 | 120 | return $tokens; 121 | } 122 | 123 | protected function objectToMap($object): array { 124 | $map = []; 125 | foreach ($object as $key => $value) { 126 | $map[$key] = $value; 127 | } 128 | 129 | return $map; 130 | } 131 | } -------------------------------------------------------------------------------- /src/Nlp/WordPieceDecoder.php: -------------------------------------------------------------------------------- 1 | cleanup = $config['cleanup'] ?? true; 15 | } 16 | 17 | /** 18 | * Clean up a list of simple English tokenization artifacts. 
19 | * @param $text 20 | * @return string 21 | */ 22 | private function cleanUpTokenization($text): string { 23 | $text = preg_replace('/ \./', '.', $text); 24 | $text = preg_replace('/ \?/', '?', $text); 25 | $text = preg_replace('/ \!/', '!', $text); 26 | $text = preg_replace('/ ,/', ',', $text); 27 | $text = preg_replace('/ \' /', "'", $text); 28 | $text = preg_replace('/ n\'t/', "n't", $text); 29 | $text = preg_replace('/ \'m/', "'m", $text); 30 | $text = preg_replace('/ \'s/', "'s", $text); 31 | $text = preg_replace('/ \'ve/', "'ve", $text); 32 | $text = preg_replace('/ \'re/', "'re", $text); 33 | 34 | return $text; 35 | } 36 | 37 | public function decodeChain($tokens): array { 38 | return array_map(function($token, $i) { 39 | if ($i !== 0) { 40 | if (str_starts_with($token, $this->config['prefix'])) { 41 | // Replace only the first occurrence of prefix 42 | $token = substr_replace($token, '', 0, strlen($this->config['prefix'])); 43 | } else { 44 | $token = ' ' . $token; 45 | } 46 | } 47 | if ($this->cleanup) { 48 | $token = $this->cleanUpTokenization($token); // Assume cleanUpTokenization is a method in this class 49 | } 50 | 51 | return $token; 52 | }, $tokens, array_keys($tokens)); 53 | } 54 | } -------------------------------------------------------------------------------- /src/Nlp/WordpieceTokenizer.php: -------------------------------------------------------------------------------- 1 | $this->maxInputCharsPerWord) { 23 | $outputTokens[] = $this->unkToken; 24 | continue; 25 | } 26 | 27 | $isUnknown = false; 28 | $start = 0; 29 | $subTokens = []; 30 | 31 | while ($start < count($chars)) { 32 | $end = count($chars); 33 | $currentSubstring = null; 34 | while ($start < $end) { 35 | $substr = implode('', array_slice($chars, $start, $end - $start)); 36 | 37 | if ($start > 0) { 38 | $substr = $this->config['continuing_subword_prefix'] . 
$substr; 39 | } 40 | if (isset($this->tokensToIds[$substr])) { 41 | $currentSubstring = $substr; 42 | break; 43 | } 44 | 45 | --$end; 46 | } 47 | if ($currentSubstring === null) { 48 | $isUnknown = true; 49 | break; 50 | } 51 | $subTokens[] = $currentSubstring; 52 | $start = $end; 53 | } 54 | if ($isUnknown) { 55 | $outputTokens[] = $this->unkToken; 56 | } else { 57 | $outputTokens = array_merge($outputTokens, $subTokens); 58 | } 59 | } 60 | 61 | return $outputTokens; 62 | } 63 | } -------------------------------------------------------------------------------- /src/Nlp/model_quantized.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanpichardo/mysql-vector/ce020410968912a0523519065907679221d945bd/src/Nlp/model_quantized.onnx -------------------------------------------------------------------------------- /src/Nlp/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "clean_up_tokenization_spaces": true, 3 | "cls_token": "[CLS]", 4 | "do_basic_tokenize": true, 5 | "do_lower_case": true, 6 | "mask_token": "[MASK]", 7 | "model_max_length": 512, 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "sep_token": "[SEP]", 11 | "strip_accents": null, 12 | "tokenize_chinese_chars": true, 13 | "tokenizer_class": "BertTokenizer", 14 | "unk_token": "[UNK]" 15 | } 16 | -------------------------------------------------------------------------------- /src/VectorTable.php: -------------------------------------------------------------------------------- 1 | mysqli = $mysqli; 30 | $this->name = $name; 31 | $this->dimension = $dimension; 32 | $this->engine = $engine; 33 | $this->centroidCache = []; 34 | } 35 | 36 | public function getVectorTableName(): string 37 | { 38 | return sprintf('%s_vectors', $this->name); 39 | } 40 | 41 | protected function getCreateStatements(bool $ifNotExists = true): array { 42 | $binaryCodeLengthInBytes = 
ceil($this->dimension / 8); 43 | 44 | $vectorsQuery = 45 | "CREATE TABLE %s %s ( 46 | id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, 47 | vector JSON, 48 | normalized_vector JSON, 49 | magnitude DOUBLE, 50 | binary_code BINARY(%d), 51 | created TIMESTAMP DEFAULT CURRENT_TIMESTAMP 52 | ) ENGINE=%s;"; 53 | $vectorsQuery = sprintf($vectorsQuery, $ifNotExists ? 'IF NOT EXISTS' : '', $this->getVectorTableName(), $binaryCodeLengthInBytes, $this->engine); 54 | 55 | return [$vectorsQuery]; 56 | } 57 | 58 | /** 59 | * Convert an n-dimensional vector in to an n-bit binary code 60 | * @param array $vector 61 | * @return int 62 | */ 63 | public function vectorToHex(array $vector): string { 64 | $binary = ''; 65 | foreach($vector as $value) { 66 | $binary .= $value > 0 ? '1' : '0'; 67 | } 68 | 69 | $padded = str_pad($binary, ceil(strlen($binary) / 8) * 8, '0', STR_PAD_LEFT); 70 | 71 | return $this->binaryToHexadecimal($padded); 72 | } 73 | 74 | private function binaryToHexadecimal(string $binaryString): string { 75 | $hex = ''; 76 | foreach(str_split($binaryString, 4) as $char) { 77 | $hex .= strtoupper(dechex(bindec($char))); 78 | } 79 | $hex = str_pad($hex, ceil(strlen($hex) / 4) * 4, '0', STR_PAD_LEFT); 80 | return $hex; 81 | } 82 | 83 | /** 84 | * Create the tables required for storing vectors 85 | * @param bool $ifNotExists Whether to use IF NOT EXISTS in the CREATE TABLE statements 86 | * @return void 87 | * @throws \Exception If the tables could not be created 88 | */ 89 | public function initialize(bool $ifNotExists = true): void 90 | { 91 | $this->mysqli->begin_transaction(); 92 | foreach ($this->getCreateStatements($ifNotExists) as $statement) { 93 | $success = $this->mysqli->query($statement); 94 | if (!$success) { 95 | $e = new \Exception($this->mysqli->error); 96 | $this->mysqli->rollback(); 97 | throw $e; 98 | } 99 | } 100 | 101 | // Add COSIM function 102 | $this->mysqli->query("DROP FUNCTION IF EXISTS COSIM"); 103 | $res = 
$this->mysqli->query(self::SQL_COSIM_FUNCTION); 104 | 105 | if(!$res) { 106 | $e = new \Exception($this->mysqli->error); 107 | $this->mysqli->rollback(); 108 | throw $e; 109 | } 110 | 111 | // Drop the index if it exists. 112 | $tableName = $this->getVectorTableName(); 113 | $query = " 114 | SELECT COUNT(1) index_exists 115 | FROM information_schema.statistics 116 | WHERE table_name=? AND index_name='idx_binary_code' 117 | "; 118 | $stmt = $this->mysqli->prepare($query); 119 | $stmt->bind_param('s', $tableName); 120 | $stmt->execute(); 121 | $result = $stmt->get_result(); 122 | $row = $result->fetch_assoc(); 123 | if ($row['index_exists'] > 0) { 124 | $this->mysqli->query("DROP INDEX idx_binary_code ON " . $tableName); 125 | } 126 | $stmt->close(); 127 | 128 | $binaryCodeLengthInBytes = ceil($this->dimension / 8); 129 | $this->mysqli->query("CREATE INDEX idx_binary_code ON " . $tableName . " (binary_code($binaryCodeLengthInBytes))"); 130 | 131 | $this->mysqli->commit(); 132 | } 133 | 134 | /** 135 | * Compute the cosine similarity between two normalized vectors 136 | * @param array $v1 The first vector 137 | * @param array $v2 The second vector 138 | * @return float The cosine similarity between the two vectors [0, 1] 139 | * @throws \Exception 140 | */ 141 | public function cosim(array $v1, array $v2): float 142 | { 143 | $statement = $this->mysqli->prepare("SELECT COSIM(?, ?)"); 144 | 145 | if(!$statement) { 146 | $e = new \Exception($this->mysqli->error); 147 | $this->mysqli->rollback(); 148 | throw $e; 149 | } 150 | 151 | $v1 = json_encode($v1); 152 | $v2 = json_encode($v2); 153 | 154 | $statement->bind_param('ss', $v1, $v2); 155 | $statement->execute(); 156 | $statement->bind_result($similarity); 157 | $statement->fetch(); 158 | $statement->close(); 159 | 160 | return $similarity; 161 | } 162 | 163 | /** 164 | * Insert or update a vector 165 | * @param array $vector The vector to insert or update 166 | * @param int|null $id Optional ID of the vector to update 
167 | * @return int The ID of the inserted or updated vector 168 | * @throws \Exception If the vector could not be inserted or updated 169 | */ 170 | public function upsert(array $vector, ?int $id = null): int 171 | { 172 | $magnitude = $this->getMagnitude($vector); 173 | $normalizedVector = $this->normalize($vector, $magnitude); 174 | $binaryCode = $this->vectorToHex($normalizedVector); 175 | $tableName = $this->getVectorTableName(); 176 | 177 | $insertQuery = empty($id) ? 178 | "INSERT INTO $tableName (vector, normalized_vector, magnitude, binary_code) VALUES (?, ?, ?, UNHEX(?))" : 179 | "UPDATE $tableName SET vector = ?, normalized_vector = ?, magnitude = ?, binary_code = UNHEX(?) WHERE id = $id"; 180 | 181 | $statement = $this->mysqli->prepare($insertQuery); 182 | if(!$statement) { 183 | $e = new \Exception($this->mysqli->error); 184 | $this->mysqli->rollback(); 185 | throw $e; 186 | } 187 | 188 | $vector = json_encode($vector); 189 | $normalizedVector = json_encode($normalizedVector); 190 | 191 | $statement->bind_param('ssds', $vector, $normalizedVector, $magnitude, $binaryCode); 192 | 193 | $success = $statement->execute(); 194 | if(!$success) { 195 | throw new \Exception($statement->error); 196 | } 197 | 198 | $id = $statement->insert_id; 199 | $statement->close(); 200 | 201 | return $id; 202 | } 203 | 204 | /** 205 | * Insert multiple vectors in a single query 206 | * @param array $vectorArray Array of vectors to insert 207 | * @return array Array of ids of the inserted vectors 208 | * @throws \Exception 209 | */ 210 | public function batchInsert(array $vectorArray): array { 211 | $tableName = $this->getVectorTableName(); 212 | 213 | $statement = $this->getConnection()->prepare("INSERT INTO $tableName (vector, normalized_vector, magnitude, binary_code) VALUES (?, ?, ?, UNHEX(?))"); 214 | if(!$statement) { 215 | throw new \Exception("Prepare failed: " . 
$this->getConnection()->error); 216 | } 217 | 218 | $ids = []; 219 | $this->getConnection()->begin_transaction(); 220 | try { 221 | foreach ($vectorArray as $vector) { 222 | $magnitude = $this->getMagnitude($vector); 223 | $normalizedVector = $this->normalize($vector, $magnitude); 224 | $binaryCode = $this->vectorToHex($normalizedVector); 225 | $vectorJson = json_encode($vector); 226 | $normalizedVectorJson = json_encode($normalizedVector); 227 | 228 | $statement->bind_param('ssds', $vectorJson, $normalizedVectorJson, $magnitude, $binaryCode); 229 | 230 | if (!$statement->execute()) { 231 | throw new \Exception("Execute failed: " . $statement->error); 232 | } 233 | 234 | $ids[] = $statement->insert_id; 235 | } 236 | 237 | $this->getConnection()->commit(); 238 | } catch (\Exception $e) { 239 | $this->getConnection()->rollback(); 240 | throw $e; 241 | } finally { 242 | $statement->close(); 243 | } 244 | 245 | return $ids; 246 | } 247 | 248 | /** 249 | * Select one or more vectors by id 250 | * @param \mysqli $mysqli The mysqli connection 251 | * @param array $ids The ids of the vectors to select 252 | * @return array Array of vectors 253 | */ 254 | public function select(array $ids): array { 255 | $tableName = $this->getVectorTableName(); 256 | 257 | $placeholders = implode(', ', array_fill(0, count($ids), '?')); 258 | $statement = $this->mysqli->prepare("SELECT id, vector, normalized_vector, magnitude, binary_code FROM $tableName WHERE id IN ($placeholders)"); 259 | $types = str_repeat('i', count($ids)); 260 | 261 | $refs = []; 262 | foreach ($ids as $key => $id) { 263 | $refs[$key] = &$ids[$key]; 264 | } 265 | 266 | call_user_func_array([$statement, 'bind_param'], array_merge([$types], $refs)); 267 | $statement->execute(); 268 | $statement->bind_result($vectorId, $vector, $normalizedVector, $magnitude, $binaryCode); 269 | 270 | $result = []; 271 | while ($statement->fetch()) { 272 | $result[] = [ 273 | 'id' => $vectorId, 274 | 'vector' => json_decode($vector, 
true),
                'normalized_vector' => json_decode($normalizedVector, true),
                'magnitude' => $magnitude,
                'binary_code' => $binaryCode
            ];
        }

        $statement->close();

        return $result;
    }

    /**
     * Returns every vector stored in the table.
     * @return array List of rows with the keys id, vector, normalized_vector, magnitude and binary_code
     * @throws \Exception If the statement cannot be prepared
     */
    public function selectAll(): array {
        $tableName = $this->getVectorTableName();

        $statement = $this->mysqli->prepare("SELECT id, vector, normalized_vector, magnitude, binary_code FROM $tableName");

        if (!$statement) {
            // Capture the error text before rollback() can overwrite it.
            $e = new \Exception($this->mysqli->error);
            $this->mysqli->rollback();
            throw $e;
        }

        $statement->execute();
        $statement->bind_result($vectorId, $vector, $normalizedVector, $magnitude, $binaryCode);

        $result = [];
        while ($statement->fetch()) {
            $result[] = [
                'id' => $vectorId,
                'vector' => json_decode($vector, true),
                'normalized_vector' => json_decode($normalizedVector, true),
                'magnitude' => $magnitude,
                'binary_code' => $binaryCode
            ];
        }

        $statement->close();

        return $result;
    }


    /**
     * Computes the dot product of two vectors.
     * Positions that are missing (or null) in $vectorB are skipped.
     * @param array $vectorA The first vector
     * @param array $vectorB The second vector
     * @return float The dot product
     */
    private function dotProduct(array $vectorA, array $vectorB): float {
        // Start from a float so the declared return type holds even for empty input.
        $product = 0.0;

        foreach ($vectorA as $position => $value) {
            if (isset($vectorB[$position])) {
                $product += $value * $vectorB[$position];
            }
        }

        return $product;
    }

    /**
     * Returns the number of vectors stored in the database
     * @return int The number of vectors
     * @throws \Exception If the statement cannot be prepared
     */
    public function count(): int {
        $tableName = $this->getVectorTableName();
        $statement = $this->mysqli->prepare("SELECT COUNT(id) FROM $tableName");

        // prepare() returns false on failure; fail with the driver error instead of
        // calling a method on a boolean.
        if (!$statement) {
            $e = new \Exception($this->mysqli->error);
            $this->mysqli->rollback();
            throw $e;
        }

        $statement->execute();
        $statement->bind_result($count);
        $statement->fetch();
        $statement->close();

        return (int) $count;
    }

    /**
     * Computes the Euclidean length of a vector.
     * @param array $vector The vector to measure
     * @return float The square root of the sum of squared components
     */
    private function getMagnitude(array $vector): float
    {
        $sum = 0.0;
        foreach ($vector as $value) {
            $sum += $value * $value;
        }

        return sqrt($sum);
    }

    /**
     * Finds the vectors that are most similar to the given vector
     *
     * The search runs in two phases: candidates are first selected by Hamming
     * distance between binary codes, then reranked with COSIM() on the
     * normalized vectors.
     *
     * @param array $vector The vector to query for
     * @param int $n The number of results to return
     * @return array Array of results containing the id, vector, normalized_vector, magnitude and similarity.
     *               Empty when $n is less than 1 or when no candidates are found.
     * @throws \Exception
     */
    public function search(array $vector, int $n = 10): array {
        // LIMIT needs a positive row count; there is nothing to return otherwise.
        if ($n < 1) {
            return [];
        }

        $tableName = $this->getVectorTableName();
        $normalizedVector = $this->normalize($vector);
        $binaryCode = $this->vectorToHex($normalizedVector);

        // Initial search using binary codes
        $statement = $this->mysqli->prepare("SELECT id, BIT_COUNT(binary_code ^ UNHEX(?)) AS hamming_distance FROM $tableName ORDER BY hamming_distance LIMIT $n");

        // Check the result of prepare() before using the statement.
        if (!$statement) {
            $e = new \Exception($this->mysqli->error);
            $this->mysqli->rollback();
            throw $e;
        }

        $statement->bind_param('s', $binaryCode);
        $statement->execute();
        // The distance column is only used for ordering, but both selected
        // columns must be bound.
        $statement->bind_result($vectorId, $hammingDistance);

        $candidates = [];
        while ($statement->fetch()) {
            $candidates[] = $vectorId;
        }
        $statement->close();

        // An empty candidate list would produce "IN ()", which is invalid SQL.
        if ($candidates === []) {
            return [];
        }

        // Rerank candidates using cosine similarity
        $placeholders = implode(',', array_fill(0, count($candidates), '?'));
        $sql = "
        SELECT id, vector, normalized_vector, magnitude, COSIM(normalized_vector, ?) AS similarity
        FROM $tableName
        WHERE id IN ($placeholders)
        ORDER BY similarity DESC
        LIMIT $n";

        $statement = $this->mysqli->prepare($sql);

        if (!$statement) {
            $e = new \Exception($this->mysqli->error);
            $this->mysqli->rollback();
            throw $e;
        }

        $normalizedVector = json_encode($normalizedVector);

        // One string parameter for the query vector, then one integer per candidate id.
        $types = str_repeat('i', count($candidates));
        $statement->bind_param('s' . $types, $normalizedVector, ...$candidates);

        $statement->execute();

        $statement->bind_result($id, $v, $nv, $mag, $sim);

        $results = [];
        while ($statement->fetch()) {
            $results[] = [
                'id' => $id,
                'vector' => json_decode($v, true),
                'normalized_vector' => json_decode($nv, true),
                'magnitude' => $mag,
                'similarity' => $sim
            ];
        }

        $statement->close();

        return $results;
    }

    /**
     * Normalize a vector
     * @param array $vector The vector to normalize
     * @param float|null $magnitude The magnitude of the vector. If not provided (or zero), it will be calculated.
     * @param float $epsilon The value used in place of a zero magnitude to avoid division by zero
     * @return array The normalized vector
     */
    private function normalize(array $vector, ?float $magnitude = null, float $epsilon = 1e-10): array {
        // A missing or zero magnitude is recomputed from the vector itself.
        if ($magnitude === null || $magnitude == 0.0) {
            $magnitude = $this->getMagnitude($vector);
        }

        // The zero vector has no direction; divide by epsilon instead of zero.
        if ($magnitude == 0.0) {
            $magnitude = $epsilon;
        }

        foreach ($vector as $key => $value) {
            $vector[$key] = $value / $magnitude;
        }

        return $vector;
    }

    /**
     * Remove a vector from the database
     * @param int $id The id of the vector to remove
     * @return void
     * @throws \Exception If the statement cannot be prepared or executed
     */
    public function delete(int $id): void {
        $tableName = $this->getVectorTableName();
        $statement = $this->mysqli->prepare("DELETE FROM $tableName WHERE id = ?");

        if (!$statement) {
            throw new \Exception($this->mysqli->error);
        }

        $statement->bind_param('i', $id);
        $success = $statement->execute();

        // Read the error before closing so the statement is released on both paths.
        $error = $statement->error;
        $statement->close();

        if (!$success) {
            throw new \Exception($error);
        }
    }

    /**
     * Returns the mysqli connection this table operates on.
     * @return \mysqli The underlying connection
     */
    public function getConnection(): \mysqli {
        return $this->mysqli;
    }
}

--------------------------------------------------------------------------------
/tests/Nlp/BertNormalizerTest.php:
-------------------------------------------------------------------------------- 1 | true]); 15 | $this->assertEquals('你 好', $normalizer->normalize('你好')); 16 | } 17 | 18 | public function testStripAccents() 19 | { 20 | $normalizer = new BertNormalizer(['strip_accents' => true]); 21 | $this->assertEquals('cafe', $normalizer->normalize('café')); 22 | } 23 | 24 | public function testIsControlCharacter() 25 | { 26 | $normalizer = new BertNormalizer(); 27 | $reflection = new ReflectionClass($normalizer); 28 | $method = $reflection->getMethod('isControlCharacter'); 29 | $method->setAccessible(true); 30 | 31 | $this->assertFalse($method->invokeArgs($normalizer, [" "])); 32 | $this->assertFalse($method->invokeArgs($normalizer, ["\n"])); 33 | $this->assertFalse($method->invokeArgs($normalizer, ["\r"])); 34 | $this->assertFalse($method->invokeArgs($normalizer, ["\t"])); 35 | $this->assertTrue($method->invokeArgs($normalizer, ["\x00"])); 36 | } 37 | 38 | public function testCleanText() 39 | { 40 | $normalizer = new BertNormalizer(['clean_text' => true]); 41 | $this->assertEquals('text', $normalizer->normalize("te\x00xt")); 42 | } 43 | 44 | public function testNormalize() 45 | { 46 | $normalizer = new BertNormalizer(['clean_text' => true, 'handle_chinese_chars' => true, 'strip_accents' => true, 'lowercase' => true]); 47 | $this->assertEquals('hello world, 世 界!', $normalizer->normalize("Hèllo WOrld, 世界!")); 48 | } 49 | } -------------------------------------------------------------------------------- /tests/Nlp/BertPreTokenizerTest.php: -------------------------------------------------------------------------------- 1 | preTokenizeText($text, []); 17 | 18 | $this->assertEquals($expectedResult, $result); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/Nlp/BertTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenizer = new BertTokenizer($tokenizerJSON, 
$tokenizerConfig); 24 | } 25 | 26 | public function testCall() { 27 | // Test the encode method with sample data 28 | $text = "Hello how are U tday?"; 29 | $encoded = $this->tokenizer->call($text); 30 | $decoded = $this->tokenizer->decode($encoded['input_ids']); 31 | 32 | // Assert that the output is as expected 33 | $this->assertIsArray($encoded['input_ids']); 34 | $this->assertEquals("[CLS] hello how are u tday? [SEP]", $decoded); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/Nlp/EmbedderTest.php: -------------------------------------------------------------------------------- 1 | embed(['Hello world!', 'This is a test.', 'Hi world', 'Snow is white.', 'Hello world']); 13 | $this->assertIsArray($embeddings); 14 | $this->assertCount(5, $embeddings); 15 | $this->assertCount($embedder->getDimensions(), $embeddings[0][0]); 16 | 17 | $this->assertGreaterThan(0.99, $embedder->getCosineSimilarity($embeddings[0][0], $embeddings[0][0])); 18 | $this->assertGreaterThan(0.89, $embedder->getCosineSimilarity($embeddings[0][0], $embeddings[4][0])); 19 | $this->assertLessThan(0.6, $embedder->getCosineSimilarity($embeddings[0][0], $embeddings[1][0])); 20 | $this->assertGreaterThan(0.7, $embedder->getCosineSimilarity($embeddings[0][0], $embeddings[2][0])); 21 | $this->assertLessThan(0.6, $embedder->getCosineSimilarity($embeddings[0][0], $embeddings[3][0])); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tests/Nlp/TokenizerModelTest.php: -------------------------------------------------------------------------------- 1 | ['hello' => 1, 'world' => 2, '[UNK]' => 0], 14 | 'unk_token' => '[UNK]', 15 | 'unk_token_id' => 0, 16 | 'fuse_unk' => false 17 | ]; 18 | $tokenizer = new TokenizerModel($config); 19 | $tokens = ['hello', 'world', 'unknown']; 20 | 21 | $expectedResult = [1, 2, 0]; 22 | $result = $tokenizer->convertTokensToIds($tokens); 23 | 24 | 
$this->assertEquals($expectedResult, $result); 25 | } 26 | 27 | public function testConvertIdsToTokens() { 28 | $config = [ 29 | 'vocab' => ['hello' => 1, 'world' => 2, '[UNK]' => 0], 30 | 'unk_token' => '[UNK]', 31 | 'unk_token_id' => 0, 32 | 'fuse_unk' => false 33 | ]; 34 | $tokenizer = new TokenizerModel($config); 35 | $ids = [1, 2, 0]; 36 | 37 | $expectedResult = ['hello', 'world', '[UNK]']; 38 | $result = $tokenizer->convertIdsToTokens($ids); 39 | 40 | $this->assertEquals($expectedResult, $result); 41 | } 42 | 43 | public function testFuse() { 44 | $config = [ 45 | 'vocab' => ['hello' => 1, 'world' => 2, '[UNK]' => 0], 46 | 'unk_token' => '[UNK]', 47 | 'unk_token_id' => 0, 48 | 'fuse_unk' => false 49 | ]; 50 | $tokenizer = new TokenizerModel($config); 51 | $reflection = new ReflectionClass($tokenizer); 52 | $method = $reflection->getMethod('fuse'); 53 | $method->setAccessible(true); 54 | 55 | $arr = [0, 0, 1, 0, 0]; 56 | $expectedResult = [0, 1, 0]; 57 | $result = $method->invokeArgs($tokenizer, [$arr, 0]); 58 | 59 | $this->assertEquals($expectedResult, $result); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /tests/Nlp/WordPieceDecoderTest.php: -------------------------------------------------------------------------------- 1 | getMethod('cleanUpTokenization'); 15 | $method->setAccessible(true); 16 | 17 | $this->assertEquals("Hello world!", $method->invokeArgs($decoder, ["Hello world !"])); 18 | } 19 | 20 | public function testDecodeChain() { 21 | $config = ['prefix' => '##', 'cleanup' => true]; 22 | $decoder = new WordPieceDecoder($config); 23 | $tokens = ["Hello", "##world", "!"]; 24 | 25 | $this->assertEquals(["Hello", "world", "!"], $decoder->decodeChain($tokens)); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/Nlp/WordpieceTokenizerTest.php: -------------------------------------------------------------------------------- 1 | ['hello' => 1, 
'world' => 2, '[UNK]' => 0], 13 | 'max_input_chars_per_word' => 100, 14 | 'continuing_subword_prefix' => '##', 15 | 'unk_token' => '[UNK]' 16 | ]; 17 | $tokenizer = new WordpieceTokenizer($config); 18 | $tokens = ['hello', 'world', '!']; 19 | 20 | $expectedResult = ['hello', 'world', '[UNK]']; // Expected result might vary based on actual implementation 21 | $result = $tokenizer->encode($tokens); 22 | 23 | $this->assertEquals($expectedResult, $result); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/PerformanceBenchmarkTest.php: -------------------------------------------------------------------------------- 1 | connect_error) { 20 | die("Connection failed: " . $mysqli->connect_error); 21 | } 22 | 23 | // Setup VectorTable for testing 24 | $this->vectorTable = new VectorTable($mysqli, 'test_table', $this->dimension); 25 | 26 | // Create required tables for testing 27 | $this->vectorTable->initialize(); 28 | } 29 | 30 | public function testSearchPerformance() { 31 | $this->vectorTable->getConnection()->begin_transaction(); 32 | 33 | // Let's insert a known vector 34 | $targetVector = array_fill(0, $this->dimension, 0.5); 35 | $this->vectorTable->upsert($targetVector); 36 | 37 | $vecs = $this->getRandomVectors(100, $this->dimension); 38 | $this->vectorTable->batchInsert($vecs); 39 | 40 | // Now, we search for this vector 41 | echo "Searching for 1 vector among 100...\n"; 42 | $time = microtime(true); 43 | $results = $this->vectorTable->search($targetVector); 44 | $time = microtime(true) - $time; 45 | echo sprintf("Search completed in %.2f seconds\n", $time); 46 | 47 | $vecs = $this->getRandomVectors(900, $this->dimension); 48 | $this->vectorTable->batchInsert($vecs); 49 | 50 | // Now, we search for this vector 51 | echo "Searching for 1 vector among 1000...\n"; 52 | $time = microtime(true); 53 | $results = $this->vectorTable->search($targetVector); 54 | $time = microtime(true) - $time; 55 | echo 
sprintf("Search completed in %.2f seconds\n", $time); 56 | 57 | $vecs = $this->getRandomVectors(9000, $this->dimension); 58 | $this->vectorTable->batchInsert($vecs); 59 | 60 | // Now, we search for this vector 61 | echo "Searching for 1 vector among 10000...\n"; 62 | $time = microtime(true); 63 | $results = $this->vectorTable->search($targetVector, 10); 64 | $time = microtime(true) - $time; 65 | echo sprintf("Search completed in %.2f seconds\n", $time); 66 | 67 | $vecs = $this->getRandomVectors(90000, $this->dimension); 68 | $this->vectorTable->batchInsert($vecs); 69 | 70 | // Now, we search for this vector 71 | echo "Searching for 1 vector among 100000...\n"; 72 | $time = microtime(true); 73 | $results = $this->vectorTable->search($targetVector, 10); 74 | $time = microtime(true) - $time; 75 | echo sprintf("Search completed in %.2f seconds\n", $time); 76 | 77 | $vecs = $this->getRandomVectors(900000, $this->dimension); 78 | $this->vectorTable->batchInsert($vecs); 79 | 80 | // Now, we search for this vector 81 | echo "Searching for 1 vector among 1000000...\n"; 82 | $time = microtime(true); 83 | $results = $this->vectorTable->search($targetVector, 10); 84 | $time = microtime(true) - $time; 85 | echo sprintf("Search completed in %.2f seconds\n", $time); 86 | 87 | $vecs = $this->getRandomVectors(9000000, $this->dimension); 88 | $this->vectorTable->batchInsert($vecs); 89 | 90 | // Now, we search for this vector 91 | echo "Searching for 1 vector among 10000000...\n"; 92 | $time = microtime(true); 93 | $results = $this->vectorTable->search($targetVector, 10); 94 | $time = microtime(true) - $time; 95 | echo sprintf("Search completed in %.2f seconds\n", $time); 96 | 97 | $this->vectorTable->getConnection()->rollback(); 98 | } 99 | 100 | private function getRandomVectors($count, $dimension) { 101 | $vecs = []; 102 | for ($i = 0; $i < $count; $i++) { 103 | for($j = 0; $j < $dimension; $j++) { 104 | $vecs[$i][$j] = 2 * (mt_rand(0, 1000) / 1000) - 1; 105 | } 106 | } 107 | 
return $vecs;
    }

    /**
     * Drops the test table and the COSIM function once all tests in the class have run.
     */
    public static function tearDownAfterClass(): void
    {
        // Clean up the database and close connection
        $mysqli = new \mysqli('localhost', 'root', '', 'mysql-vector');
        $vectorTable = new VectorTable($mysqli, 'test_table', 3);
        $mysqli->query("DROP TABLE IF EXISTS " . $vectorTable->getVectorTableName());
        $mysqli->query("DROP FUNCTION IF EXISTS COSIM");
        $mysqli->close();
    }

    /**
     * Drops the test table and the COSIM function after each test and closes the connection.
     */
    protected function tearDown(): void
    {
        // Clean up the database and close connection
        // The trailing space after EXISTS is required; without it the keyword and
        // the table name run together and the statement is invalid.
        $this->vectorTable->getConnection()->query("DROP TABLE IF EXISTS " . $this->vectorTable->getVectorTableName());
        $this->vectorTable->getConnection()->query("DROP FUNCTION IF EXISTS COSIM");
        $this->vectorTable->getConnection()->close();
    }
}

--------------------------------------------------------------------------------
/tests/VectorTableTest.php:
--------------------------------------------------------------------------------
1 | connect_error) { 22 | die("Connection failed: " . 
$mysqli->connect_error); 23 | } 24 | 25 | // Setup VectorTable for testing 26 | $this->vectorTable = new VectorTable($mysqli, 'test_table', $this->dimension); 27 | 28 | // Create required tables for testing 29 | $this->vectorTable->initialize(); 30 | } 31 | 32 | private function getRandomVectors($count, $dimension) { 33 | $vecs = []; 34 | for ($i = 0; $i < $count; $i++) { 35 | for($j = 0; $j < $dimension; $j++) { 36 | $vecs[$i][$j] = 2 * (mt_rand(0, 1000) / 1000) - 1; 37 | } 38 | } 39 | return $vecs; 40 | } 41 | 42 | public function testGetVectorTableName() 43 | { 44 | $tableName = $this->vectorTable->getVectorTableName(); 45 | $this->assertEquals('test_table_vectors', $tableName); 46 | } 47 | 48 | public function testUpsertSingle() { 49 | $this->vectorTable->getConnection()->begin_transaction(); 50 | 51 | $vecs = $this->getRandomVectors(1, $this->dimension); 52 | 53 | $ids = []; 54 | 55 | echo "Inserting 1 vector...\n"; 56 | $time = microtime(true); 57 | foreach ($vecs as $vec) { 58 | $ids[] = $this->vectorTable->upsert($vec); 59 | } 60 | $time = microtime(true) - $time; 61 | echo "Elapsed time: " . sprintf("%.2f", $time) . " seconds\n"; 62 | 63 | $this->assertEquals(count($vecs), $this->vectorTable->count()); 64 | $this->vectorTable->getConnection()->rollback(); 65 | } 66 | 67 | public function testUpsert() { 68 | $this->vectorTable->getConnection()->begin_transaction(); 69 | 70 | $lastId = 0; 71 | $vecArray = []; 72 | echo "Inserting $this->testVectorAmount vectors one-at-a-time...\n"; 73 | $time = microtime(true); 74 | for($i = 0; $i < $this->testVectorAmount; $i++) { 75 | $vec = $this->getRandomVectors(1, $this->dimension)[0]; 76 | $lastId = $this->vectorTable->upsert($vec); 77 | $vecArray[] = $vec; 78 | } 79 | 80 | $time = microtime(true) - $time; 81 | echo "Elapsed time: " . sprintf("%.2f", $time) . 
" seconds\n"; 82 | 83 | $this->assertEquals($this->testVectorAmount, count($this->vectorTable->selectAll())); 84 | 85 | echo "Inserting another $this->testVectorAmount vectors in a batch...\n"; 86 | $time = microtime(true); 87 | $this->vectorTable->batchInsert($vecArray); 88 | 89 | $time = microtime(true) - $time; 90 | echo "Elapsed time: " . sprintf("%.2f", $time) . " seconds\n"; 91 | 92 | $this->assertEquals($this->testVectorAmount * 2, $this->vectorTable->count()); 93 | 94 | $id = $lastId; 95 | $newVec = $this->getRandomVectors(1, $this->dimension)[0]; 96 | $this->vectorTable->upsert($newVec, $id); 97 | $r = $this->vectorTable->select([$id]); 98 | $this->assertCount(1, $r); 99 | $this->assertEqualsWithDelta($newVec, $r[0]['vector'], 0.00001); 100 | 101 | $this->vectorTable->getConnection()->rollback(); 102 | } 103 | 104 | public function testCosim() { 105 | $this->vectorTable->getConnection()->begin_transaction(); 106 | 107 | $vecs = $this->getRandomVectors(2, $this->dimension); 108 | $dotProduct = 0; 109 | for ($i = 0; $i < count($vecs[0]); $i++) { 110 | $dotProduct += $vecs[0][$i] * $vecs[1][$i]; 111 | } 112 | 113 | $this->assertEqualsWithDelta($dotProduct, $this->vectorTable->cosim($vecs[0], $vecs[1]), 0.0001); 114 | } 115 | 116 | public function testSelectAll() { 117 | $this->vectorTable->getConnection()->begin_transaction(); 118 | 119 | $vecs = $this->getRandomVectors(10, $this->dimension); 120 | foreach ($vecs as $vec) { 121 | $this->vectorTable->upsert($vec); 122 | } 123 | 124 | $results = $this->vectorTable->selectAll(); 125 | $this->assertSameSize($vecs, $results); 126 | 127 | $i = 0; 128 | foreach ($results as $result) { 129 | $this->assertEqualsWithDelta($vecs[$i], $result['vector'], 0.00001); 130 | $i++; 131 | } 132 | 133 | $this->vectorTable->getConnection()->rollback(); 134 | } 135 | 136 | public function testVectorToHex() { 137 | $hex = $this->vectorTable->vectorToHex([0.5, 0.5, 0, 0, 0, 0.5]); 138 | $this->assertEqualsIgnoringCase('0031', $hex); 
139 | 140 | $hex = $this->vectorTable->vectorToHex([0.5, 0.5, 0.5, 0.5, 0,0,0,0,0,0,0,0,0,0,0,0]); 141 | $this->assertEqualsIgnoringCase('f000', $hex); 142 | 143 | $hex = $this->vectorTable->vectorToHex([0.5, 0.5, 0.5, 0.5, 0,0,0,0,0,0,0,0,0,0,0,1]); 144 | $this->assertEqualsIgnoringCase('f001', $hex); 145 | 146 | $hex = $this->vectorTable->vectorToHex([0.5, 0.5, 0.5, 0.5, 1,0,0,0,0,0,0,0,0,0,0,1]); 147 | $this->assertEqualsIgnoringCase('F801', $hex); 148 | } 149 | 150 | public function testSearch() { 151 | $multiples = 1; 152 | $this->vectorTable->getConnection()->begin_transaction(); 153 | 154 | // Insert $this->testVectorAmount random vectors 155 | for($i = 0; $i < $multiples; $i++) { 156 | $vecs = $this->getRandomVectors($this->testVectorAmount, $this->dimension); 157 | $this->vectorTable->batchInsert($vecs); 158 | } 159 | 160 | // Let's insert a known vector 161 | $targetVector = array_fill(0, $this->dimension, 0.5); 162 | $this->vectorTable->upsert($targetVector); 163 | 164 | // Now, we search for this vector 165 | $searchAmount = $this->testVectorAmount * $multiples; 166 | echo "Searching for 1 vector among ($searchAmount) with binary quantization...\n"; 167 | $time = microtime(true); 168 | $results = $this->vectorTable->search($targetVector); 169 | $time = microtime(true) - $time; 170 | // print time in format 00:00:00.000 171 | echo sprintf("Search completed in %.2f seconds\n", $time); 172 | 173 | // At least the first result should be our target vector or very close 174 | $firstResultVector = $results[0]['vector']; 175 | $firstResultSimilarity = $results[0]['similarity']; 176 | 177 | $this->assertEqualsWithDelta($targetVector, $firstResultVector, 0.00001, "The most similar vector should be the target vector itself"); 178 | $this->assertEqualsWithDelta(1.0, $firstResultSimilarity, 0.001, "The similarity of the most similar vector should be the highest possible value"); 179 | 180 | $this->vectorTable->getConnection()->rollback(); 181 | } 182 | 183 | public 
function testDelete(): void { 184 | $this->vectorTable->getConnection()->begin_transaction(); 185 | 186 | $ids = []; 187 | $vecs = $this->getRandomVectors(10, $this->dimension); 188 | foreach ($vecs as $vec) { 189 | $ids[] = $this->vectorTable->upsert($vec); 190 | } 191 | 192 | $this->assertEquals(count($ids), $this->vectorTable->count()); 193 | 194 | foreach ($ids as $id) { 195 | $this->vectorTable->delete($id); 196 | } 197 | 198 | $this->assertEquals(0, $this->vectorTable->count()); 199 | 200 | $this->vectorTable->getConnection()->rollback(); 201 | } 202 | 203 | public static function tearDownAfterClass(): void 204 | { 205 | // Clean up the database and close connection 206 | $mysqli = new \mysqli('db', 'db', 'db', 'db', 3306); 207 | $vectorTable = new VectorTable($mysqli, 'test_table', 3); 208 | $mysqli->query("DROP TABLE IF EXISTS " . $vectorTable->getVectorTableName()); 209 | $mysqli->query("DROP FUNCTION IF EXISTS COSIM"); 210 | $mysqli->close(); 211 | } 212 | 213 | protected function tearDown(): void 214 | { 215 | // Clean up the database and close connection 216 | $this->vectorTable->getConnection()->query("DROP TABLE IF EXISTS " . $this->vectorTable->getVectorTableName()); 217 | $this->vectorTable->getConnection()->query("DROP FUNCTION IF EXISTS COSIM"); 218 | $this->vectorTable->getConnection()->close(); 219 | } 220 | 221 | } 222 | --------------------------------------------------------------------------------