├── .github
└── workflows
│ ├── build_wheel.yml
│ └── build_wheel
│ └── Dockerfile
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
├── .nojekyll
├── LICENSE.md
├── README.md
├── _sidebar.md
├── index.html
├── modelconnector
│ ├── framworks.md
│ ├── introduction.md
│ ├── ld_preload.md
│ └── python_apis.md
└── torchconnector
│ ├── configuration.md
│ ├── examples.md
│ ├── installation.md
│ └── introduction.md
├── oss-model-connector
├── ossmodelconnector
│ ├── __init__.py
│ ├── _oss_connector
│ │ ├── __init__.py
│ │ └── oss_model_connector.pyi
│ └── oss_model_connector.py
├── pyproject.toml
└── setup.py
└── oss-torch-connector
├── osstorchconnector
├── __init__.py
├── _oss_bucket_iterable.py
├── _oss_client.py
├── _oss_connector
│ ├── __init__.py
│ └── oss_connector.pyi
├── _oss_tar_iterable.py
├── oss_checkpoint.py
├── oss_iterable_dataset.py
└── oss_map_dataset.py
├── pyproject.toml
├── setup.py
└── tools
└── generate_tar_archive.py
/.github/workflows/build_wheel.yml:
--------------------------------------------------------------------------------
# CI workflow: builds manylinux wheels for the osstorchconnector /
# ossmodelconnector packages inside Docker when a release tag is pushed,
# then publishes the collected wheels as a GitHub release.
# NOTE(review): YAML indentation appears collapsed by this dump; the original
# nesting is not visible here.
1 | name: Build Wheel
2 |
# Trigger only on release tags of the form "<package>/v<version>".
3 | on:
4 | push:
5 | tags:
6 | - "osstorchconnector/v*"
7 | - "ossmodelconnector/v*"
8 |
9 | jobs:
10 | build:
11 | name: "Build Release"
12 | runs-on: ubuntu-latest
13 | strategy:
# fail-fast disabled so one Python version failing does not cancel the others.
14 | fail-fast: false
15 | matrix:
# Quoted versions keep "3.10" from being parsed as the float 3.1.
16 | python: ["3.8", "3.9", "3.10", "3.11", "3.12"]
17 | steps:
18 | - name: Checkout
19 | uses: actions/checkout@v3
# Log in to ghcr.io so buildx can pull the prebuilt connector_builder image.
20 | - name: Login ghcr.io
21 | uses: docker/login-action@v2
22 | with:
23 | registry: ghcr.io
24 | username: ${{ github.actor }}
25 | password: ${{ secrets.GITHUB_TOKEN }}
26 | - name: Setup buildx instance
27 | uses: docker/setup-buildx-action@v2
28 | with:
29 | use: true
30 | - name: Build
31 | shell: bash
32 | run: |
# Derive PACKAGE_NAME and RELEASE_VERSION from the tag
# "refs/tags/<package>/v<version>".
33 | REFS=${{ github.ref }}
34 | REFS=${REFS#refs/tags/}
35 | PACKAGE_NAME=${REFS%%/*}
36 | echo "PACKAGE_NAME=${PACKAGE_NAME}"
37 | RELEASE_VERSION="${REFS#*/v}"
38 | echo "RELEASE_VERSION=${RELEASE_VERSION}"
# Strip the dot ("3.11" -> "311") to form the CPython ABI tag used in the image.
39 | PYTHON_VERSION=${{ matrix.python }}
40 | PYTHON_VERSION=${PYTHON_VERSION//./}
41 | echo "PYTHON_VERSION=${PYTHON_VERSION}"
# Pinned manylinux2014 builder image; RELEASE_IMAGE carries the prebuilt C++ lib.
42 | BUILD_IMAGE="quay.io/pypa/manylinux2014_x86_64:2024-03-10-4935fcc"
43 | echo "BUILD_IMAGE=${BUILD_IMAGE}"
44 | RELEASE_IMAGE="ghcr.io/${GITHUB_REPOSITORY,,}/connector_builder:${PACKAGE_NAME}-${RELEASE_VERSION}"
45 | echo "RELEASE_IMAGE=${RELEASE_IMAGE}"
# NOTE(review): if the tag matches neither package name, PACKAGE_DIR stays
# unset and the sed below targets "/pyproject.toml" — consider an else+exit.
46 | if [[ "${PACKAGE_NAME}" == "osstorchconnector" ]]; then
47 | PACKAGE_DIR="oss-torch-connector"
48 | elif [[ "${PACKAGE_NAME}" == "ossmodelconnector" ]]; then
49 | PACKAGE_DIR="oss-model-connector"
50 | fi
# NOTE(review): "s/version.*/" rewrites EVERY line containing "version" from
# that word onward — confirm pyproject.toml has no other such lines
# (e.g. "requires-python" is safe, but "version" in a comment is not).
51 | sed -i -e "s/version.*/version = \"${RELEASE_VERSION}\"/g" ${PACKAGE_DIR}/pyproject.toml
# "-o dist/" exports the final (scratch) image stage contents to dist/.
52 | docker buildx build --build-arg BUILD_IMAGE=${BUILD_IMAGE} --build-arg RELEASE_IMAGE=${RELEASE_IMAGE} --build-arg PYTHON_VERSION=${PYTHON_VERSION} --build-arg PACKAGE_DIR=${PACKAGE_DIR} -f .github/workflows/build_wheel/Dockerfile -o dist/ .
53 | ls -l dist/
# One artifact per matrix entry; merged by the release job below.
54 | - name: Upload
55 | uses: actions/upload-artifact@v4
56 | with:
57 | name: dist-${{ strategy.job-index }}
58 | path: dist/oss*
59 |
# Publishes a GitHub release containing the wheels from all matrix builds.
60 | release:
61 | name: "Tagged Release"
62 | runs-on: ubuntu-latest
63 | needs: [build]
64 | steps:
65 | - name: Download builds and release notes
66 | uses: actions/download-artifact@v4
67 | with:
68 | pattern: dist-*
69 | merge-multiple: true
70 | path: dist
71 | - name: Display downloaded files
72 | shell: bash
73 | run: |
74 | ls -l dist
# RELEASE_TAG is the full "<package>/v<version>" tag (slash included).
75 | REFS=${{ github.ref }}
76 | REFS=${REFS#refs/tags/}
77 | echo "RELEASE_TAG=${REFS}" >> $GITHUB_ENV
78 | - name: Create Release
# NOTE(review): "@latest" is a floating tag for a third-party action —
# consider pinning to a release tag or commit SHA for reproducibility.
79 | uses: "marvinpinto/action-automatic-releases@latest"
80 | with:
81 | repo_token: "${{ secrets.GITHUB_TOKEN }}"
82 | automatic_release_tag: "${{ env.RELEASE_TAG }}"
83 | prerelease: false
84 | files: dist/oss*
85 |
--------------------------------------------------------------------------------
/.github/workflows/build_wheel/Dockerfile:
--------------------------------------------------------------------------------
# Multi-stage build: takes the prebuilt native connector out of RELEASE_IMAGE,
# builds the Python wheel in a manylinux BUILD_IMAGE, and exports only the
# wheel artifacts through a final scratch stage (consumed via `buildx -o`).
1 | ARG RELEASE_IMAGE
2 | ARG BUILD_IMAGE
3 | FROM ${RELEASE_IMAGE} AS release
4 |
5 | FROM ${BUILD_IMAGE} AS builder
6 | WORKDIR /libconnector
# Prebuilt C++ connector artifacts (.so files and oss-connector-lib-* packages).
7 | COPY --from=release /libconnector .
# PACKAGE_DIR is "oss-torch-connector" or "oss-model-connector" (set by CI).
8 | ARG PACKAGE_DIR
9 | COPY ${PACKAGE_DIR} .
# NOTE(review): with multiple sources Docker documents that the destination
# must be a directory ending in "/" — "./" would be safer than "."; confirm
# this builds on the BuildKit version in use.
10 | COPY README.md LICENSE MANIFEST.in .
# PYTHON_VERSION arrives dot-free (e.g. "311"), forming the cpXYZ-cpXYZ ABI tag
# that names both the manylinux interpreter dir and the prebuilt .so suffix.
11 | ARG PYTHON_VERSION
12 | ENV PY_VER=${PYTHON_VERSION}
13 | ENV PY_ABI_TAG=cp${PY_VER}-cp${PY_VER}
# Select the matching manylinux Python, point setup.py at the prebuilt .so via
# a generated setup.cfg, build the wheel, then auditwheel-repair it for
# manylinux compliance; also stage the oss-connector-lib-* packages alongside.
14 | RUN export PATH="/opt/python/${PY_ABI_TAG}/bin:$PATH" && \
15 | python3 -V && \
16 | libconnector=$(find . -type f -name "oss_*.cpython-${PY_VER}-x86_64-linux-gnu.so") && \
17 | chmod +x ${libconnector} && \
18 | echo -e "[build_ext]\nlibrary_path=${libconnector}" > setup.cfg && \
19 | cat setup.cfg && \
20 | python3 -u setup.py bdist_wheel && \
21 | auditwheel repair dist/oss*.whl -w repaired_wheel && \
22 | find . -type f -name "oss-connector-lib-*" -exec cp {} repaired_wheel/ \;
23 |
# Scratch stage holds only the release artifacts, so `-o dist/` exports
# nothing but the wheels and oss-connector-lib files.
24 | FROM scratch
25 | COPY --from=builder /libconnector/repaired_wheel/oss* /
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | build/
3 | dist/
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 aliyun.com
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
4 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
6 | permit persons to whom the Software is furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
9 | Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
12 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
13 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
14 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE README.md
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OSS Connector for AI/ML
2 |
3 | [ossconnector.github.io](https://ossconnector.github.io/)
4 |
5 | ## Overview
6 |
7 | OSS Connector for AI/ML contains some high-performance Python libraries specifically designed for AI and ML scenarios, tailored to work with [Alibaba Cloud OSS (Object Storage Service)](https://www.alibabacloud.com/en/product/object-storage-service).
8 |
9 | Currently, the OSS connector is composed of two libraries: OSS Model Connector and OSS Torch Connector.
10 |
11 | - [OSS Torch Connector](https://aliyun.github.io/oss-connector-for-ai-ml/#/torchconnector/introduction) is dedicated to AI training scenarios, including loading [datasets](https://pytorch.org/docs/stable/data.html#dataset-types) from OSS and loading/saving checkpoints from/to OSS.
12 |
13 | - [OSS Model Connector](https://aliyun.github.io/oss-connector-for-ai-ml/#/modelconnector/introduction) focuses on AI inference scenarios, loading large model files from OSS into local AI inference frameworks.
14 |
15 | The core component of the OSS Connector for AI/ML is implemented in C++ using [PhotonLibOS](https://github.com/alibaba/PhotonLibOS) and is provided as dynamic link libraries within wheel packages. This repository only contains the code of Python.
16 |
17 | For details, please refer to [ossconnector.github.io](https://ossconnector.github.io/) or [aliyun.github.io/oss-connector-for-ai-ml](https://aliyun.github.io/oss-connector-for-ai-ml).
18 |
19 |
20 | ## Related
21 |
22 | [OSS Connector for AI/ML 中文文档](https://help.aliyun.com/zh/oss/developer-reference/oss-connector-for-ai-ml)
23 |
24 | ## License
25 |
26 | This project is licensed under the terms of the [MIT License](LICENSE).
--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/oss-connector-for-ai-ml/0945da1942b9afee5efef2d733db370472af5afa/docs/.nojekyll
--------------------------------------------------------------------------------
/docs/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 aliyun.com
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
4 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
6 | permit persons to whom the Software is furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
9 | Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
12 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
13 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
14 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # OSS Connector for AI/ML
2 |
3 | OSS Connector for AI/ML contains some high-performance Python libraries specifically designed for AI and ML scenarios, tailored to work with [Alibaba Cloud OSS (Object Storage Service)](https://www.alibabacloud.com/en/product/object-storage-service).
4 |
5 | Currently, the OSS connector is composed of two libraries: OSS Model Connector and OSS Torch Connector.
6 |
7 | - [OSS Torch Connector](https://aliyun.github.io/oss-connector-for-ai-ml/#/torchconnector/introduction) is dedicated to AI training scenarios, including loading [datasets](https://pytorch.org/docs/stable/data.html#dataset-types) from OSS and loading/saving checkpoints from/to OSS.
8 |
9 | - [OSS Model Connector](https://aliyun.github.io/oss-connector-for-ai-ml/#/modelconnector/introduction) focuses on AI inference scenarios, loading large model files from OSS into local AI inference frameworks.
10 |
11 | The core component of the OSS Connector for AI/ML is implemented in C++ using [PhotonLibOS](https://github.com/alibaba/PhotonLibOS) and is provided as dynamic link libraries within wheel packages. This repository only contains the code of Python.
12 |
13 |
14 | ## License
15 |
16 | This project is licensed under the terms of the [MIT License](LICENSE.md).
--------------------------------------------------------------------------------
/docs/_sidebar.md:
--------------------------------------------------------------------------------
1 | - [Home](/)
2 |
3 | - OSS Model Connector
4 |
5 | - [Introduction](/modelconnector/introduction.md)
6 | - [Python APIs](/modelconnector/python_apis.md)
7 | - [Inference Frameworks](/modelconnector/framworks.md)
8 | - [LD_PRELOAD](/modelconnector/ld_preload.md)
9 |
10 | - OSS Torch Connector
11 |
12 | - [Introduction](/torchconnector/introduction.md)
13 | - [Installation](/torchconnector/installation.md)
14 | - [Configuration](/torchconnector/configuration.md)
15 | - [Examples](/torchconnector/examples.md)
16 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | OSS Connector for AI/ML
10 |
14 |
15 |
16 |
17 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/docs/modelconnector/framworks.md:
--------------------------------------------------------------------------------
1 | # Work with AI frameworks
2 |
3 | ## Overview
4 |
5 | Mainstream AI inference frameworks, such as vllm and transformers, load models from a local directory. The number of files in the model directory is not large, comprising several small files and multiple larger model files. For example, the directory below shows the model directory for Qwen2.5-72B, including 37 large safetensors files and several small files.
6 |
7 | ```bash
8 | # ll -lh /root/Qwen2.5-72B
9 | total 136G
10 | -rw-r--r-- 1 root root 664 Sep 25 12:23 config.json
11 | -rw-r--r-- 1 root root 2 Sep 25 12:23 configuration.json
12 | -rw-r--r-- 1 root root 138 Sep 25 12:23 generation_config.json
13 | -rw-r--r-- 1 root root 6.8K Sep 25 12:23 LICENSE
14 | -rw-r--r-- 1 root root 1.6M Sep 25 12:23 merges.txt
15 | -rw-r--r-- 1 root root 3.6G Sep 25 12:28 model-00001-of-00037.safetensors
16 | -rw-r--r-- 1 root root 3.8G Sep 25 12:33 model-00002-of-00037.safetensors
17 | -rw-r--r-- 1 root root 3.6G Sep 25 12:39 model-00003-of-00037.safetensors
18 | -rw-r--r-- 1 root root 3.8G Sep 25 12:44 model-00004-of-00037.safetensors
19 | -rw-r--r-- 1 root root 3.8G Sep 25 12:50 model-00005-of-00037.safetensors
20 | -rw-r--r-- 1 root root 3.8G Sep 25 12:55 model-00006-of-00037.safetensors
21 | -rw-r--r-- 1 root root 3.6G Sep 25 13:00 model-00007-of-00037.safetensors
22 | -rw-r--r-- 1 root root 3.8G Sep 25 13:06 model-00008-of-00037.safetensors
23 | -rw-r--r-- 1 root root 3.8G Sep 25 13:11 model-00009-of-00037.safetensors
24 | -rw-r--r-- 1 root root 3.8G Sep 25 13:17 model-00010-of-00037.safetensors
25 | -rw-r--r-- 1 root root 3.6G Sep 25 13:22 model-00011-of-00037.safetensors
26 | -rw-r--r-- 1 root root 3.8G Sep 25 13:28 model-00012-of-00037.safetensors
27 | -rw-r--r-- 1 root root 3.8G Sep 25 13:33 model-00013-of-00037.safetensors
28 | -rw-r--r-- 1 root root 3.8G Sep 25 13:39 model-00014-of-00037.safetensors
29 | -rw-r--r-- 1 root root 3.6G Sep 25 13:44 model-00015-of-00037.safetensors
30 | -rw-r--r-- 1 root root 3.8G Sep 25 13:49 model-00016-of-00037.safetensors
31 | -rw-r--r-- 1 root root 3.8G Sep 25 13:55 model-00017-of-00037.safetensors
32 | -rw-r--r-- 1 root root 3.8G Sep 25 14:00 model-00018-of-00037.safetensors
33 | -rw-r--r-- 1 root root 3.6G Sep 25 14:06 model-00019-of-00037.safetensors
34 | -rw-r--r-- 1 root root 3.8G Sep 25 14:11 model-00020-of-00037.safetensors
35 | -rw-r--r-- 1 root root 3.8G Sep 25 14:17 model-00021-of-00037.safetensors
36 | -rw-r--r-- 1 root root 3.8G Sep 25 14:22 model-00022-of-00037.safetensors
37 | -rw-r--r-- 1 root root 3.6G Sep 25 14:27 model-00023-of-00037.safetensors
38 | -rw-r--r-- 1 root root 3.8G Sep 25 14:33 model-00024-of-00037.safetensors
39 | -rw-r--r-- 1 root root 3.8G Sep 25 14:38 model-00025-of-00037.safetensors
40 | -rw-r--r-- 1 root root 3.8G Sep 25 14:44 model-00026-of-00037.safetensors
41 | -rw-r--r-- 1 root root 3.6G Sep 25 14:49 model-00027-of-00037.safetensors
42 | -rw-r--r-- 1 root root 3.8G Sep 25 14:55 model-00028-of-00037.safetensors
43 | -rw-r--r-- 1 root root 3.8G Sep 25 15:00 model-00029-of-00037.safetensors
44 | -rw-r--r-- 1 root root 3.8G Sep 25 15:05 model-00030-of-00037.safetensors
45 | -rw-r--r-- 1 root root 3.6G Sep 25 15:11 model-00031-of-00037.safetensors
46 | -rw-r--r-- 1 root root 3.8G Sep 25 15:16 model-00032-of-00037.safetensors
47 | -rw-r--r-- 1 root root 3.8G Sep 25 15:22 model-00033-of-00037.safetensors
48 | -rw-r--r-- 1 root root 3.8G Sep 25 15:27 model-00034-of-00037.safetensors
49 | -rw-r--r-- 1 root root 3.6G Sep 25 15:32 model-00035-of-00037.safetensors
50 | -rw-r--r-- 1 root root 3.8G Sep 25 15:38 model-00036-of-00037.safetensors
51 | -rw-r--r-- 1 root root 3.3G Sep 25 15:43 model-00037-of-00037.safetensors
52 | -rw-r--r-- 1 root root 78K Sep 25 15:43 model.safetensors.index.json
53 | -rw-r--r-- 1 root root 3.8K Sep 25 15:43 README.md
54 | -rw-r--r-- 1 root root 7.1K Sep 25 15:43 tokenizer_config.json
55 | -rw-r--r-- 1 root root 6.8M Sep 25 15:43 tokenizer.json
56 | -rw-r--r-- 1 root root 2.7M Sep 25 15:43 vocab.json
57 | ```
58 |
59 | Another common scenario is like the Stable Diffusion web UI, where a large number of models are stored in one or several folders, and there might be situations where models need to be switched during use.
60 |
61 | The OssModelConnector offers a method to directly pass in an OSS directory to the inference frameworks and read the models directly from OSS.
62 |
63 | Compared to the FUSE-based mounting solution, OssModelConnector has a significant performance advantage. Compared to downloading before loading to frameworks, the OssModelConnector allows for simultaneous downloading and loading, achieving faster model deployment speeds.
64 |
65 | ## Usage
66 |
67 | Before starting inference frameworks like vllm and transformers, call `connector.prepare_directory(oss_dir, model_dir)`, and then pass model_dir to the inference framework.
68 |
69 | The `oss_dir` is the directory in OSS where the model files are stored, formatted as a URL, for example, `oss://ai-testset/qwen/qwen2.5-72B/`.
70 |
71 | The `model_dir` is the local model directory. During the process, the connector will download some temporary data into model_dir, which can be deleted afterward.
72 |
73 | After prepare_directory is called, the OssModelConnector begins downloading and prefetching data. Smaller files will be downloaded to the `model_dir` concurrently, while larger model files start prefetching into memory in alphabetical order. To avoid being corrupted by dirty data, the OssModelConnector will clean the contents of the `model_dir` before running.
74 |
75 | ## Examples
76 |
77 | ### Transformers
78 | ```python
79 | from transformers import AutoModelForCausalLM, AutoTokenizer
80 | from ossmodelconnector import OssModelConnector
81 |
82 | # initialize OssModelConnector
83 | connector = OssModelConnector(...)
84 |
85 | # prepare_directory
86 | oss_path = "oss://ai-testset/qwen/Qwen25-75B"
87 | model_dir = '/root/abc/'
88 | connector.prepare_directory(oss_path, model_dir)
89 |
90 | # pass model_dir to transformer
91 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
92 | model = AutoModelForCausalLM.from_pretrained(
93 | model_dir,
94 | device_map="cpu",
95 | trust_remote_code=True,
96 | ).eval()
97 |
98 | # close to release resource
99 | connector.close()
100 |
101 | # do inference
102 | ```
103 |
104 | ### Vllm
105 |
106 | ```python
107 | from transformers import AutoTokenizer
108 | from vllm import LLM, SamplingParams
109 | from ossmodelconnector import OssModelConnector
110 |
111 | # initialize OssModelConnector
112 | connector = OssModelConnector(...)
113 |
114 | # prepare_directory
115 | oss_path = "oss://ai-testset/qwen/Qwen25-75B"
116 | model_dir = '/root/abc/'
117 | connector.prepare_directory(oss_path, model_dir)
118 |
119 | # pass model_dir to vllm
120 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
121 | sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)
122 | llm = LLM(model=model_dir, trust_remote_code=True)
123 |
124 | # close to release resource
125 | connector.close()
126 |
127 | # do inference
128 | ```
129 |
130 | # Stable Diffusion web UI
131 |
132 | Edit launch.py to initialize and configure OssModelConnector.
133 |
134 | ```python
135 | from modules import launch_utils
136 |
137 | import oss2
138 | from oss2.credentials import EnvironmentVariableCredentialsProvider
139 | from ossmodelconnector import OssModelConnector
140 |
141 | ...
142 |
143 | def main():
144 | ...
145 |
146 |
147 | if __name__ == "__main__":
148 | connector = OssModelConnector(endpoint='oss-cn-beijing-internal.aliyuncs.com',
149 | cred_provider=EnvironmentVariableCredentialsProvider(),
150 | config_path='/etc/connector.json')
151 | connector.prepare_directory('oss://ai-testset/Stable-diffusion/', '/root/stable-diffusion-webui/models/Stable-diffusion')
152 |
153 | main()
154 | ```
155 |
156 | Currently, prepare_directory() loads all models into memory, which can put pressure on memory and even cause crashes in scenarios with a large number of models. In the future, prepare_directory() will support lazy loading, downloading models only when switching to or opening them, and it will include a garbage collection feature to release memory for unused models after a specified time.
157 |
--------------------------------------------------------------------------------
/docs/modelconnector/introduction.md:
--------------------------------------------------------------------------------
1 |
2 | # OSS Model Connector
3 |
4 | ## Overview
5 |
6 | Storing (large) models on a lower-cost object storage (like Alibaba Cloud OSS) is a cost-effective option. The OSS Model Connector provides high-performance methods for loading (large) model files from OSS in AI inference scenarios.
7 |
8 | Currently, the memory of computing nodes for AI inference is generally large. The common practice is to first load the model from network storage or local disk into the node's memory before applying it for subsequent use.
9 | The primary function of the OSS Model Connector is to fully leverage local memory to accelerate the process of downloading models from OSS.
10 | In our testing environment, the download speed can exceed 15GB/s, approaching 20GB/s.
11 |
12 | The OSS Model Connector mainly offers 3 usage methods.
13 |
14 | - The first method is using the Python interface, allowing users to open OSS objects and read their contents through list stream api.
15 | We also provide an interface for listing objects on OSS, as well as an implementation called 'fast list', which can complete the listing of a million objects within several seconds.
16 |
17 | - The second method is utilizing the libraries for loading models in inference frameworks such as transformers or vllm. This method enables the integration of model file downloading and loading, optimizing the model deployment time.
18 |
19 | - The third method is to use LD_PRELOAD to address scenarios that the second method cannot handle, such as multi-process environments. The advantage of this approach is that it does not require modifying the code, configuration alone is sufficient.
20 |
21 | ## Features
22 |
23 | Compared to other solutions for loading OSS data, the OSS Model Connector is more focused, simpler, and high-performance.
24 |
25 | - Focus
26 |
27 | Unlike [ossfs](https://github.com/aliyun/ossfs), which provides a generic POSIX interface, the OSS Model Connector is more focused on AI inference scenarios. In this context, only data reading is involved, so there is no need to implement complex write operations. Additionally, memory resources are usually more abundant in these scenarios, allowing for the use of large amounts of memory for caching to accelerate the speed of data downloading from OSS.
28 |
29 | - Simpler
30 |
31 | The OSS Model Connector is used as an SDK, implemented entirely in user space, without the need for kernel or FUSE modules, resulting in a simpler I/O path.
32 |
33 | - High-performance
34 |
35 | Thanks to the simpler I/O path and efficient C++ implementation, the OSS Model Connector can achieve better performance. The C++ code is implemented based on the high-performance [PhotonLibOS](https://github.com/alibaba/PhotonLibOS), which includes features such as coroutines and an HTTP client. In our testing environment, the model loading speed can exceed 15GB/s, approaching 20GB/s, achieving the maximum bandwidth of the OSS server configuration.
36 |
--------------------------------------------------------------------------------
/docs/modelconnector/ld_preload.md:
--------------------------------------------------------------------------------
1 | # Loading Models via LD_PRELOAD
2 |
3 | ## Overview
4 | In multi-process scenarios, the OSSModelConnector configuration initialized via the Python interface may be lost in Python sub-processes, causing OSS data to fail to load. For example, `vllm.entrypoints.openai.api_server`, where the main process is the API server and model inference happens in sub-processes; or in multi-GPU scenarios, where different processes load models onto different GPUs.
5 |
6 | In such cases, you can start the OSSModelConnector using the `LD_PRELOAD` method, passing configuration parameters via environment variables. Compared to initializing with Python, this `LD_PRELOAD` method generally does not require code modifications.
7 |
8 | ## Installation
9 |
10 | Download the installation package `oss-connector-lib` from [Release](https://github.com/aliyun/oss-connector-for-ai-ml/releases)
11 |
12 | For example, download the `oss-connector-lib-1.0.0rc8` and install.
13 |
14 | rpm:
15 |
16 | ```shell
17 | yum install -y https://github.com/aliyun/oss-connector-for-ai-ml/releases/download/ossmodelconnector%2Fv1.0.0rc8/oss-connector-lib-1.0.0rc8.x86_64.rpm
18 | ```
19 |
20 | deb:
21 | ```shell
22 | wget https://github.com/aliyun/oss-connector-for-ai-ml/releases/download/ossmodelconnector%2Fv1.0.0rc8/oss-connector-lib-1.0.0rc8.x86_64.deb
23 | dpkg -i oss-connector-lib-1.0.0rc8.x86_64.deb
24 | ```
25 |
26 | **After installation, check `/usr/local/lib/libossc_preload.so`.**
27 |
28 |
29 | ## Usage Method
30 |
31 | ### Configuration File
32 |
33 | The configuration file path is `/etc/oss-connector/config.json`. The installation package **already includes** a default configuration file as follows:
34 |
35 | ```json
36 | {
37 | "logLevel": 1,
38 | "logPath": "/var/log/oss-connector/connector.log",
39 | "auditPath": "/var/log/oss-connector/audit.log",
40 | "prefetch": {
41 | "vcpus": 16,
42 | "workers": 16
43 | }
44 | }
45 | ```
46 |
47 | The main performance-related parameters are:
48 |
49 | - `prefetch.vcpus`: Number of vCPUs (CPU cores) to prefetch, default value is 16.
50 | - `prefetch.workers`: Number of coroutines per prefetched vCPU, default value is 16.
51 |
52 | ### Configure Environment Variables
53 |
54 | | Environment Variable KEY | Environment Variable VALUE Description |
55 | | --- | --- |
56 | | OSS_ACCESS_KEY_ID | OSS access key |
57 | | OSS_ACCESS_KEY_SECRET | OSS access key secret |
58 | | OSS_SESSION_TOKEN | Optional, STS token |
59 | | OSS_ENDPOINT | Endpoint for OSS, e.g., `http://oss-cn-beijing-internal.aliyuncs.com`, default HTTP schema is `http` |
60 | | OSS_PATH | OSS model directory, e.g., `oss://example-bucket/example-model-path/` |
61 | | MODEL_DIR | Local model directory, passed to vLLM or other inference frameworks. To avoid interference from dirty data, it is recommended to clear this directory first. Temporary data will be downloaded during use, and it can be deleted afterward. |
62 | | LD_PRELOAD | `/usr/local/lib/libossc_preload.so` |
63 | | **ENABLE_CONNECTOR** | `1`, **Enable Connector, must be set for the main process** |
64 |
65 | ### Start Python Program
66 |
67 | ```shell
68 | LD_PRELOAD=/usr/local/lib/libossc_preload.so ENABLE_CONNECTOR=1 OSS_ACCESS_KEY_ID=${akid} OSS_ACCESS_KEY_SECRET=${aksecret} OSS_ENDPOINT=${endpoint} OSS_PATH=oss://${bucket}/${path}/ MODEL_DIR=/tmp/model python3 -m vllm.entrypoints.openai.api_server --model /tmp/model --trust-remote-code --tensor-parallel-size 1 --disable-custom-all-reduce
69 | ```
70 |
71 | ### Note!
72 |
73 | 1. `MODEL_DIR` must be consistent with the model dir for AI framework, e.g., vLLM's `--model`.
74 |
75 | 2. `ENABLE_CONNECTOR=1` must be set for the entrypoint process. `LD_PRELOAD` is recommended to be set for the entrypoint process but can also be directly set for the container.
76 |
77 | 3. Currently, when starting the OSSModelConnector via `LD_PRELOAD`, additional memory used for caching will be released with a delay, currently set at 120 seconds.
78 |
79 | 4. If using `nohup` to start, do not configure the environment variables for `nohup`. Instead, encapsulate the environment variables and startup command into a script and execute `nohup` on the script.
80 |
81 | 5. For now, try to use this method in single-machine scenarios. In multi-machine setups, there might be repeated loading or other unknown issues.
82 |
--------------------------------------------------------------------------------
/docs/modelconnector/python_apis.md:
--------------------------------------------------------------------------------
1 | # Python API
2 |
3 | ## Overview
4 |
5 | Users can create an OssModelConnector in Python and call its provided methods to access data on OSS. The OssModelConnector provides methods for read-only access to OSS, such as list, open, and read, but does not offer any write methods for now.
6 |
7 | ## Key Features
8 |
9 | - List and FastList
10 |
11 | In addition to offering a normal list implementation, a faster method called "FastList" is also provided to significantly enhance the efficiency of listing a large number of objects. FastList achieves this by concurrently sending list requests and more intelligently handling the segmentation of lists, allowing the listing of millions of objects to be completed within seconds.
12 |
13 | - Data Prefetching
14 |
15 | This optimization is specifically designed for large models. After the open api is called, the ModelConnector performs high-concurrency data prefetching according to the order of opening to fully leverage the bandwidth advantages of OSS. It temporarily stores the data in memory, allowing users to quickly load data from memory when reading.
16 |
17 | ## Installation
18 |
19 | ### Requirements
20 |
21 | - OS: Linux x86-64
22 | - glibc: >= 2.17
23 | - Python: 3.8-3.12
24 | - PyTorch: >= 2.0
25 |
26 | ### Install latest version
27 |
28 | Download the latest OSSModelConnector package from [Release](https://github.com/aliyun/oss-connector-for-ai-ml/releases) and use pip to install it.
29 |
30 | For example, download the `ossmodelconnector/v1.0.0rc8` for Python 3.11 and install:
31 |
32 | ```bash
33 | wget https://github.com/aliyun/oss-connector-for-ai-ml/releases/download/ossmodelconnector%2Fv1.0.0rc8/ossmodelconnector-1.0.0rc8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
34 |
35 | pip install ossmodelconnector-1.0.0rc8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
36 | ```
37 |
38 | ## Configuration
39 |
40 | ### Credential
41 |
42 | When initializing the OssModelConnector, it is necessary to specify the authentication information required to access OSS.
43 |
44 | Two methods are supported: Credentials provider and Credentials file.
45 |
46 | #### Credentials Provider
47 |
48 | OssModelConnector supports all authentication configuration methods of the OSS Python SDK.
49 | Please refer to the documentation:
50 | [How to configure access credentials for OSS SDK for Python](https://www.alibabacloud.com/help/en/oss/developer-reference/python-configuration-access-credentials) /
51 | [如何为OSS Python SDK配置访问凭证](https://help.aliyun.com/zh/oss/developer-reference/python-configuration-access-credentials)
52 |
53 | When using it, simply pass the `credentials_provider` to the constructor of the OssModelConnector.
54 |
55 | The following is an example of configuring authentication from environment variables.
56 |
57 | ```bash
58 | export OSS_ACCESS_KEY_ID=
59 | export OSS_ACCESS_KEY_SECRET=
60 | export OSS_SESSION_TOKEN=
61 | ```
62 |
63 | ```python
64 | import oss2
65 | from oss2.credentials import EnvironmentVariableCredentialsProvider
66 | from ossmodelconnector import OssModelConnector
67 |
68 | connector = OssModelConnector(endpoint=ENDPOINT,
69 | cred_provider=EnvironmentVariableCredentialsProvider(),
70 | config_path=CONFIG_PATH)
71 | ```
72 |
73 | The following is an example of user-custom credentials.
74 |
75 | ```python
76 | from oss2 import CredentialsProvider
77 | from oss2.credentials import Credentials
78 | from ossmodelconnector import OssModelConnector
79 |
80 | class CredentialProviderWrapper(CredentialsProvider):
81 | def get_credentials(self):
82 | return Credentials('', '')
83 |
84 |
85 | credentials_provider = CredentialProviderWrapper()
86 | connector = OssModelConnector(endpoint=ENDPOINT,
87 | cred_provider=credentials_provider,
88 | config_path=CONFIG_PATH)
89 | ```
90 |
91 |
92 | #### Credentials File
93 |
94 | For now only JSON format credential file is supported.
95 |
96 | ```bash
97 | mkdir -p /root/.alibabacloud/
98 | cat <<-EOF | tee /root/.alibabacloud/credentials
99 | {
100 | "AccessKeyId": "",
101 | "AccessKeySecret": "",
102 | "SecurityToken": "",
103 | "Expiration": "2024-08-02T15:04:05Z"
104 | }
105 | EOF
106 | ```
107 | `SecurityToken` and `Expiration` are optional.
108 | The credential file must be updated before expiration to avoid authorization errors.
109 |
110 | ```python
111 | from ossmodelconnector import OssModelConnector
112 |
113 | connector = OssModelConnector(endpoint=ENDPOINT,
114 | cred_path='/root/.alibabacloud/credentials',
115 | config_path='/tmp/config.json')
116 | ```
117 |
118 |
119 | ### Config File
120 |
121 | The configuration file is responsible for setting parameters such as logging and concurrency. Below is an example.
122 |
123 | ```bash
124 | mkdir -p /etc/oss-connector/
125 | cat <<-EOF | tee /etc/oss-connector/config.json
126 | {
127 | "logLevel": 1,
128 | "logPath": "/var/log/oss-connector/connector.log",
129 | "auditPath": "/var/log/oss-connector/audit.log",
130 | "prefetch": {
131 | "vcpus": 24,
132 | "workers": 32
133 | },
134 | "fastList": {
135 | "vcpus": 2,
136 | "workers": 16
137 | }
138 | }
139 | EOF
140 | ```
141 |
142 | Pass the path to `config_path` when initializing OssModelConnector.
143 |
144 | ```python
145 | import oss2
146 | from oss2.credentials import EnvironmentVariableCredentialsProvider
147 | from ossmodelconnector import OssModelConnector
148 |
149 | connector = OssModelConnector(endpoint=ENDPOINT,
150 | cred_provider=EnvironmentVariableCredentialsProvider(),
151 | config_path='/etc/oss-connector/config.json')
152 | ```
153 |
154 | Below is an explanation of each configuration item.
155 |
156 | | Field | Description |
157 | |---------------|-------------------------------------------------------------------------------------------------------|
158 | | logLevel | The log level for log file, 0 - DEBUG, 1 - INFO, 2 - WARN, 3 - ERROR, 1 is the default value. |
159 | | logPath | The path for log file, `/var/log/oss-connector/connector.log` is the default value. |
160 | | auditPath | The path for audit file, `/var/log/oss-connector/audit.log` is the default value. |
161 | | prefetch.vcpus | The vcpu number for prefetching data. 16 is the default value. |
162 | | prefetch.workers | The worker number for prefetching data in each vcpu. 16 is the default value. |
163 | | fastList.vcpus | The vcpu number for doing fast list. 1 is the default value. |
164 | | fastList.workers | The worker number for doing fast list in each vcpu. 16 is the default value. |
165 |
166 |
167 | ## Main APIs
168 |
169 | - Initialization
170 |
171 | To initialize an OssModelConnector, please refer to [configuration](./configuration.md)
172 |
173 | ```python
174 | connector = OssModelConnector(endpoint=ENDPOINT,
175 | cred_provider=EnvironmentVariableCredentialsProvider(),
176 | config_path='/tmp/config.json')
177 | ```
178 |
179 | - List objects
180 |
181 | By passing in the `bucket` and `prefix`, users can obtain a list of all objects that meet the criteria, including name and size of objects.
182 |
183 | ```python
184 | objs = connector.list('ai-testset', "geonet/images/DISC/DISC.01/2022.001")
185 | for obj in objs:
186 | print(obj.key)
187 | print(obj.size)
188 | ```
189 |
190 | Do FastList by passing True in the third parameter, which works faster for a large number of objects.
191 | The order of objects obtained by FastList is not guaranteed. If a specific order is required, users can sort the result based on `key`.
192 |
193 | ```python
194 | objs = connector.list('ai-testset', "geonet/images/DISC/DISC.01/2022.001", True)
195 | ```
196 |
197 | - Open object
198 |
199 | Open an object through a URI. The URI format is `oss://{bucket}/{name}`. For example, `oss://ai-testset/dir1/obj1` represents an object named `dir1/obj1` in the `ai-testset` bucket.
200 |
201 | The open function accepts two parameters: the first is the URI, and the second is binary, which is of type bool and defaults to True, indicating that the file will be opened in binary mode. If set to False, it will be opened in text mode.
202 |
203 | ```python
204 | # open as binary mode
205 | obj = connector.open('oss://ai-testset/dir1/obj1')
206 |
207 | # open as text mode
208 | obj1 = connector.open('oss://ai-testset/dir1/obj1', False)
209 | ```
210 |
211 | After calling open, OssModelConnector will start prefetching in the order of the open calls. For scenarios involving loading large model files in shards (e.g. model-00001-of-00038.safetensors to model-00038-of-00038.safetensors), we recommend making sequential batch calls to open first, and then reading each one individually.
212 |
213 | - Read object data
214 |
215 | Read, readinto, seek methods are provided and they follow the standard usage of Python streams.
216 |
217 | When a read call is made, if the data has already been prefetched into memory, it is returned directly from memory. Otherwise, a request is sent to OSS to retrieve and return the data.
218 |
219 | ```python
220 | # read whole data
221 | data = obj.read()
222 |
223 | # read a specified amount of data
224 | data = obj.read(4*1024*1024)
225 |
226 | # read into buffer
227 | buf = bytearray(4 * 1024 * 1024)
228 | obj.readinto(buf)
229 |
230 | # seek to a position
231 | obj.seek(0)
232 | ```
233 |
234 | - Destroy object
235 |
236 | Destroying an object will release its occupied memory resources. Users can rely on Python's GC to handle it automatically, or perform manual destruction in memory-sensitive scenarios.
237 |
238 |
239 | ## Example
240 |
241 | Below is a sample code for loading a model in multiple shards. First, open them to initiate prefetching, and then read them sequentially.
242 |
243 | ```python
244 | import oss2
245 | from oss2.credentials import EnvironmentVariableCredentialsProvider
246 | from ossmodelconnector import OssModelConnector
247 |
248 | connector = OssModelConnector(endpoint=ENDPOINT,
249 | cred_provider=EnvironmentVariableCredentialsProvider(),
250 | config_path='/tmp/config.json')
251 |
252 | objs = []
253 | for i in range(1, 39): # 1-38
254 | name = f"oss://ai-testset/qwen/Qwen1.5-72B-Chat/model-{i:05d}-of-00038.safetensors"
255 | obj = connector.open(name)
256 | objs.append(obj)
257 |
258 | # using read
259 | for i in range(0, 38): # 0-37
260 | while True:
261 | data = objs[i].read(4*1024*1024)
262 | if not data:
263 | print("read object done ", i+1)
264 | break
265 |
266 | # or using readinto (recommended)
267 | buf = bytearray(4 * 1024 * 1024)
268 | for i in range(0, 38): # 0-37
269 | objs[i].seek(0)
270 | while True:
271 | n = objs[i].readinto(buf)
272 | if n == 0:
273 | print("readinto object done ", i+1)
274 | break
275 | ```
276 |
--------------------------------------------------------------------------------
/docs/torchconnector/configuration.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | ## Credential
4 |
5 | When initializing the OssTorchConnector components (OssMapDataset, OssIterableDataset, OssCheckpoint ...), it is necessary to specify the authentication information required to access OSS.
6 |
7 | Two methods are supported: Credentials provider and Credentials file.
8 |
9 | ### Credentials Provider
10 |
11 | OssTorchConnector supports all authentication configuration methods of the OSS Python SDK.
12 | Please refer to the documentation:
13 | [How to configure access credentials for OSS SDK for Python](https://www.alibabacloud.com/help/en/oss/developer-reference/python-configuration-access-credentials) /
14 | [如何为OSS Python SDK配置访问凭证](https://help.aliyun.com/zh/oss/developer-reference/python-configuration-access-credentials)
15 |
16 | When using it, simply pass the `credentials_provider` to the constructor of the OssTorchConnector components.
17 |
18 | The following is an example of configuring authentication from environment variables.
19 |
20 | ```bash
21 | export OSS_ACCESS_KEY_ID=
22 | export OSS_ACCESS_KEY_SECRET=
23 | export OSS_SESSION_TOKEN=
24 | ```
25 |
26 | ```python
27 | import oss2
28 | from oss2.credentials import EnvironmentVariableCredentialsProvider
29 | from osstorchconnector import OssMapDataset
30 |
31 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT,
32 | cred_provider=EnvironmentVariableCredentialsProvider(),
33 | config_path=CONFIG_PATH)
34 | ```
35 |
36 | The following is an example of user-custom credentials.
37 |
38 | ```python
39 | from oss2 import CredentialsProvider
40 | from oss2.credentials import Credentials
41 | from osstorchconnector import OssMapDataset
42 |
43 | class CredentialProviderWrapper(CredentialsProvider):
44 | def get_credentials(self):
45 | return Credentials('', '')
46 |
47 |
48 | credentials_provider = CredentialProviderWrapper()
49 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT,
50 | cred_provider=credentials_provider,
51 | config_path=CONFIG_PATH)
52 | ```
53 |
54 |
55 | ### Credentials File
56 |
57 | For now only JSON format credential file is supported.
58 |
59 | ```bash
60 | mkdir -p /root/.alibabacloud/
61 | cat <<-EOF | tee /root/.alibabacloud/credentials
62 | {
63 | "AccessKeyId": "",
64 | "AccessKeySecret": "",
65 | "SecurityToken": "",
66 | "Expiration": "2024-08-02T15:04:05Z"
67 | }
68 | EOF
69 | ```
70 | `SecurityToken` and `Expiration` are optional.
71 | The credential file must be updated before expiration to avoid authorization errors.
72 |
73 | ```python
74 | from osstorchconnector import OssMapDataset
75 |
76 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT,
77 | cred_path='/root/.alibabacloud/credentials',
78 | config_path=CONFIG_PATH)
79 | ```
80 |
81 |
82 | ## Config
83 |
84 | The configuration file is responsible for setting parameters such as logging and concurrency. Below is an example.
85 |
86 | ```bash
87 | mkdir -p /etc/oss-connector/
88 | cat <<-EOF | tee /etc/oss-connector/config.json
89 | {
90 | "logLevel": 1,
91 | "logPath": "/var/log/oss-connector/connector.log",
92 | "auditPath": "/var/log/oss-connector/audit.log",
93 | "datasetConfig": {
94 | "prefetchConcurrency": 24,
95 | "prefetchWorker": 2
96 | },
97 | "checkpointConfig": {
98 | "prefetchConcurrency": 24,
99 | "prefetchWorker": 4,
100 | "uploadConcurrency": 64
101 | }
102 | }
103 | EOF
104 | ```
105 |
106 | Pass the path to `config_path` when initializing OssTorchConnector components.
107 |
108 | ```python
109 | import oss2
110 | from oss2.credentials import EnvironmentVariableCredentialsProvider
111 | from osstorchconnector import OssMapDataset
112 |
113 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT,
114 | cred_provider=EnvironmentVariableCredentialsProvider(),
115 | config_path='/etc/oss-connector/config.json')
116 | ```
117 |
118 | | Field | Description |
119 | |---------------|-------------------------------------------------------------------------------------------------------|
120 | | logLevel | The log level for log file, 0 - DEBUG, 1 - INFO, 2 - WARN, 3 - ERROR |
121 | | logPath | The path for log file, `/var/log/oss-connector/connector.log` is the default value. |
122 | | auditPath | The path for audit file, `/var/log/oss-connector/audit.log` is the default value. |
123 | | datasetConfig.prefetchConcurrency | The concurrency for prefetching data from Dataset. 24 is the default value. |
124 | | datasetConfig.prefetchWorker | The vcpu number for prefetching data from Dataset. 2 is the default value. |
125 | | datasetConfig.enableFastList | Flag to enable or disable FastList. false is the default value. |
126 | | datasetConfig.listConcurrency | The concurrency for FastList. 16 is the default value. |
127 | | datasetConfig.listWorker | The vcpu number for FastList. 1 is the default value. |
128 | | checkpointConfig.prefetchConcurrency | The concurrency for prefetching checkpoint. 24 is the default value. |
129 | | checkpointConfig.prefetchWorker | The vcpu number for prefetching checkpoint. 4 is the default value. |
130 | | checkpointConfig.uploadConcurrency | The concurrency for uploading checkpoint. 64 is the default value. |
--------------------------------------------------------------------------------
/docs/torchconnector/examples.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | ## Dataset
4 |
5 | ### IterableDataset
6 |
7 | ```py
8 | from osstorchconnector import OssIterableDataset
9 |
10 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
11 | CONFIG_PATH = "/etc/oss-connector/config.json"
12 | CRED_PATH = "/root/.alibabacloud/credentials"
13 | OSS_URI = "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/"
14 |
15 | # 1) from_prefix
16 | iterable_dataset = OssIterableDataset.from_prefix(OSS_URI, endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
17 | for item in iterable_dataset:
18 | print(item.key)
19 | print(item.size)
20 | content = item.read()
21 | print(len(content))
22 | item.close()
23 |
24 |
25 | # 2) from_objects
26 | uris = [
27 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00001.png",
28 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00002.png",
29 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00003.png"
30 | ]
31 |
32 | iterable_dataset = OssIterableDataset.from_objects(uris, endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
33 | for item in iterable_dataset:
34 | print(item.key)
35 | print(item.size)
36 | content = item.read()
37 | print(len(content))
38 | item.close()
39 | ```
40 |
41 | ### MapDataset
42 |
43 | ```py
44 | from osstorchconnector import OssMapDataset
45 |
46 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
47 | CONFIG_PATH = "/etc/oss-connector/config.json"
48 | CRED_PATH = "/root/.alibabacloud/credentials"
49 | OSS_URI = "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/"
50 |
51 | # 1) from_prefix
52 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
53 | # random access
54 | item = map_dataset[0]
55 | print(item.key)
56 | content = item.read()
57 | print(item.size)
58 | print(len(content))
59 | item.close()
60 |
61 | # or
62 | with map_dataset[5] as item:
63 | print(item.key)
64 | content = item.read()
65 | print(item.size)
66 | print(len(content))
67 |
68 | # iterable
69 | for item in map_dataset:
70 | print(item.key)
71 | print(item.size)
72 | content = item.read()
73 | print(len(content))
74 | item.close()
75 |
76 |
77 | # 2) from_objects
78 | uris = [
79 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00001.png",
80 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00002.png",
81 | "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/img001-00003.png"
82 | ]
83 |
84 | map_dataset = OssMapDataset.from_objects(uris, endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
85 | # random access
86 | item = map_dataset[1]
87 | print(item.key)
88 | print(item.size)
89 | content = item.read()
90 | print(len(content))
91 | item.close()
92 |
93 | # iterable
94 | for item in map_dataset:
95 | print(item.key)
96 | print(item.size)
97 | content = item.read()
98 | print(len(content))
99 | item.close()
100 | ```
101 |
102 | Please note that OssMapDataset performs an OSS list objects operation under the given prefix first (which may take some time).
103 |
104 | ### Manifest file
105 |
106 | Manifest file contains objects name (and label) of OSS objects.
107 | Building datasets with manifest file can reduce the overhead of listing objects in OSS, making it suitable for datasets with a large number of objects and repeated dataset loading.
108 |
109 | A manifest file must be constructed in advance, and a method for parsing it must be provided during use.
110 | Below are examples of manifest files and loading a dataset with manifest file.
111 |
112 | Example manifest file with object name:
113 | ```
114 | Img/BadImag/Bmp/Sample001/img001-00001.png
115 | Img/BadImag/Bmp/Sample001/img001-00002.png
116 | Img/BadImag/Bmp/Sample001/img001-00003.png
117 | ```
118 |
119 | Example manifest file with object name and label:
120 | ```
121 | Img/BadImag/Bmp/Sample001/img001-00001.png label1
122 | Img/BadImag/Bmp/Sample001/img001-00002.png label2
123 | Img/BadImag/Bmp/Sample001/img001-00003.png label3
124 | ```
125 |
126 | ```py
127 | from osstorchconnector import OssIterableDataset
128 |
129 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
130 | CONFIG_PATH = "/etc/oss-connector/config.json"
131 | CRED_PATH = "/root/.alibabacloud/credentials"
132 | OSS_URI = "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/"
133 |
134 | # manifest_parser
135 | def manifest_parser(reader: io.IOBase) -> Iterable[Tuple[str, str]]:
136 | lines = reader.read().decode("utf-8").strip().split("\n")
137 | for i, line in enumerate(lines):
138 | try:
139 | items = line.strip().split(' ')
140 | if len(items) >= 2:
141 | key = items[0]
142 | label = items[1]
143 | yield (key, label)
144 | elif len(items) == 1:
145 | key = items[0]
146 | yield (key, '')
147 | else:
148 | raise ValueError("format error")
149 | except ValueError as e:
150 | raise e
151 |
152 | # from local manifest_file
153 | iterable_dataset = OssIterableDataset.from_manifest_file("manifest_file", manifest_parser, "oss://ossconnectorbucket/EnglistImg/", endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
154 | for item in iterable_dataset:
155 | print(item.key)
156 | print(item.size)
157 | print(item.label)
158 | content = item.read()
159 | print(len(content))
160 | item.close()
161 |
162 | # manifest_file on oss
163 | iterable_dataset = OssIterableDataset.from_manifest_file("oss://ossconnectorbucket/manifest_file/EnglistImg/manifest_file", manifest_parser, "oss://ossconnectorbucket/EnglistImg/", endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
164 | ```
165 |
166 | ### Dataset and transform
167 |
168 | ```py
169 | import sys
170 | import io
171 | import torchvision.transforms as transforms
172 | from PIL import Image
173 |
174 | from osstorchconnector import OssIterableDataset, OssMapDataset
175 |
176 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
177 | CONFIG_PATH = "/etc/oss-connector/config.json"
178 | CRED_PATH = "/root/.alibabacloud/credentials"
179 | OSS_URI = "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/"
180 |
181 | trans = transforms.Compose([
182 | transforms.Resize(256),
183 | transforms.CenterCrop(224),
184 | transforms.ToTensor(),
185 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
186 | ])
187 |
188 | def transform(data):
189 | try:
190 | img = Image.open(io.BytesIO(data.read())).convert('RGB')
191 | val = trans(img)
192 | except Exception as e:
193 | raise e
194 | return val, data.label
195 |
196 | iterable_dataset = OssIterableDataset.from_prefix(OSS_URI, endpoint=ENDPOINT, transform=transform, cred_path=CRED_PATH, config_path=CONFIG_PATH)
197 |
198 | for item in iterable_dataset:
199 | print(item[0])
200 | print(item[1])
201 | ```
202 |
203 | ### Pytorch dataloader
204 | ```py
205 | import sys
206 | import io
207 | import torch
208 | import torchvision.transforms as transforms
209 | from PIL import Image
210 | from osstorchconnector import OssIterableDataset, OssMapDataset
211 |
212 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
213 | CONFIG_PATH = "/etc/oss-connector/config.json"
214 | CRED_PATH = "/root/.alibabacloud/credentials"
215 | OSS_URI = "oss://ossconnectorbucket/EnglistImg/Img/BadImag/Bmp/Sample001/"
216 |
217 |
218 | trans = transforms.Compose([
219 | transforms.Resize(256),
220 | transforms.CenterCrop(224),
221 | transforms.ToTensor(),
222 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
223 | ])
224 |
225 | def transform(data):
226 | try:
227 | img = Image.open(io.BytesIO(data.read())).convert('RGB')
228 | val = trans(img)
229 | except Exception as e:
230 | raise e
231 | return val, data.key, data.label
232 |
233 | # OssIterableDataset
234 | iterable_dataset = OssIterableDataset.from_prefix(OSS_URI, endpoint=ENDPOINT, transform=transform, cred_path=CRED_PATH, config_path=CONFIG_PATH)
235 | loader = torch.utils.data.DataLoader(iterable_dataset, batch_size=256, num_workers=32, prefetch_factor=2)
236 | for i, (datas, keys, labels) in enumerate(loader):
237 | print(datas)
238 | print(keys)
239 |
240 | # OssMapDataset with shuffle
241 | map_dataset = OssMapDataset.from_prefix(OSS_URI, endpoint=ENDPOINT, transform=transform, cred_path=CRED_PATH, config_path=CONFIG_PATH)
242 | loader = torch.utils.data.DataLoader(map_dataset, batch_size=256, num_workers=32, prefetch_factor=2, shuffle=True)
243 | for i, (datas, keys, labels) in enumerate(loader):
244 | print(datas)
245 | print(keys)
246 | ```
247 |
248 | When used with a DataLoader, the main DataLoader worker is responsible for listing objects from OSS or receiving objects via from_prefix/from_manifest_file; all workers obtain their assigned objects from the main worker.
249 | This approach avoids issues of redundant listing and data reading (which may exist in other connectors), allowing better performance from multiple workers. When testing data download speed (excluding transform and other CPU-bound workload) with a large number of small files (e.g., ImageNet), it can exceed 10GB/s.
250 |
251 | OssIterableDataset includes prefetch optimization by increasing concurrency. When the DataLoader is configured with multiple workers, the iteration order may not be deterministic (local order might be disrupted).
252 |
253 | ## Checkpoint
254 |
255 | ```py
256 | import torch
257 | from osstorchconnector import OssCheckpoint
258 |
259 | ENDPOINT = "http://oss-cn-beijing-internal.aliyuncs.com"
260 | CONFIG_PATH = "/etc/oss-connector/config.json"
261 | CRED_PATH = "/root/.alibabacloud/credentials"
262 |
263 | checkpoint = OssCheckpoint(endpoint=ENDPOINT, cred_path=CRED_PATH, config_path=CONFIG_PATH)
264 |
265 | # read checkpoint
266 | CHECKPOINT_READ_URI = "oss://ossconnectorbucket/checkpoint/epoch.0"
267 | with checkpoint.reader(CHECKPOINT_READ_URI) as reader:
268 | state_dict = torch.load(reader)
269 |
270 | # write checkpoint
271 | CHECKPOINT_WRITE_URI = "oss://ossconnectorbucket/checkpoint/epoch.1"
272 | with checkpoint.writer(CHECKPOINT_WRITE_URI) as writer:
273 | torch.save(state_dict, writer)
274 | ```
275 |
276 | OssCheckpoint can be used for checkpoints, and also for high-speed uploading and downloading of arbitrary objects. In our testing environment, the download speed can exceed 15GB/s.
--------------------------------------------------------------------------------
/docs/torchconnector/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Requirements
4 |
5 | - OS: Linux x86-64
6 | - glibc: >= 2.17
7 | - Python: 3.8-3.12
8 | - PyTorch: >= 2.0
9 |
10 | ## Install
11 |
12 | ### Install stable version
13 |
14 | ```bash
15 | pip install osstorchconnector
16 | ```
17 |
18 | ### Install latest version
19 |
20 | Download the latest osstorchconnector package from [Release](https://github.com/aliyun/oss-connector-for-ai-ml/releases) and use pip to install it.
21 |
22 | For example, download the `osstorchconnector/v1.1.0rc1` for Python 3.11 and install:
23 |
24 | ```bash
25 | wget https://github.com/aliyun/oss-connector-for-ai-ml/releases/download/osstorchconnector%2Fv1.1.0rc1/osstorchconnector-1.1.0rc1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
26 |
27 | pip install osstorchconnector-1.1.0rc1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
28 | ```
--------------------------------------------------------------------------------
/docs/torchconnector/introduction.md:
--------------------------------------------------------------------------------
1 |
2 | # OSS Torch Connector
3 |
4 | ## Overview
5 |
6 | OSS Torch Connector provides both [Map-style and Iterable-style datasets](https://pytorch.org/docs/stable/data.html#dataset-types) for loading datasets from OSS.
7 | And also provides a method for loading and saving checkpoints from and to OSS.
8 |
9 | The core part of OSS Connector for AI/ML is implemented in C++ using [PhotonLibOS](https://github.com/alibaba/PhotonLibOS). This repository contains only the Python code.
10 |
11 |
12 | ## Related
13 |
14 | [OSS Connector for AI/ML 中文文档](https://help.aliyun.com/zh/oss/developer-reference/oss-connector-for-ai-ml)
15 |
--------------------------------------------------------------------------------
/oss-model-connector/ossmodelconnector/__init__.py:
--------------------------------------------------------------------------------
1 | from ._oss_connector import (
2 | DataObject,
3 | DataObjectInfo,
4 | Connector,
5 | new_oss_connector,
6 | )
7 | from .oss_model_connector import OssModelConnector
8 |
9 | __all__ = ["DataObject", "DataObjectInfo", "Connector", "new_oss_connector", "OssModelConnector"]
10 |
--------------------------------------------------------------------------------
/oss-model-connector/ossmodelconnector/_oss_connector/__init__.py:
--------------------------------------------------------------------------------
1 | from .oss_model_connector import (
2 | DataObject,
3 | DataObjectInfo,
4 | Connector,
5 | new_oss_connector
6 | )
7 |
8 | __all__ = ["DataObject", "DataObjectInfo", "Connector", "new_oss_connector"]
9 |
--------------------------------------------------------------------------------
/oss-model-connector/ossmodelconnector/_oss_connector/oss_model_connector.pyi:
--------------------------------------------------------------------------------
1 | from typing import List, Union, Any
2 |
3 | class DataObject:
4 | key: str
5 |
6 | def __enter__(self) -> DataObject: ...
7 | def __exit__(self, exc_type, exc_val, exc_tb): ...
8 | def tell(self) -> int: ...
9 | def seek(self, offset: int, whence: int) -> int: ...
10 | def read(self, size: int): ...
11 | def readline(self, size: int): ...
12 | def readinto(self, buf) -> int: ...
13 | def mmap(self) -> int: ...
14 | def close(self): ...
15 | def size(self) -> int: ...
16 |
17 |
18 | class DataObjectInfo:
19 | key: str
20 | size: int
21 |
22 |
23 | class Connector:
24 | def open(uri: str, prefetch: bool, userfault: bool, binary: bool) -> DataObject: ...
25 | def prepare_directory(uri: str, dir: str, libc: bool) -> int: ...
26 | def list(bucket: str, prefix: str, fast: bool) -> List[DataObjectInfo]: ...
27 |
28 |
29 | def new_oss_connector(endpoint: str, cred: Union[str, Any], config_path: str) -> Connector:
30 | ...
31 |
--------------------------------------------------------------------------------
/oss-model-connector/ossmodelconnector/oss_model_connector.py:
--------------------------------------------------------------------------------
1 | from ._oss_connector import new_oss_connector, Connector
2 | import ctypes
3 | import torch
4 | import builtins
5 | import pathlib
6 | from typing import Any
7 |
8 |
9 | class UntypedStorageEx:
10 | def __init__(self, file, size):
11 | self.file = file
12 | self.addr = memoryview((ctypes.c_ubyte * size).from_address(self.file.mmap()))
13 |
14 | def untyped(self):
15 | return self
16 |
17 | def __getitem__(self, idx):
18 | return self.addr[idx]
19 |
20 | class OssModelConnector:
21 | """
22 | A connector class for interfacing with OSS for model loading,
23 | providing high-performance methods to load models/objects/files for AI inference.
24 | """
25 |
26 | def __init__(
27 | self,
28 | endpoint: str,
29 | cred_path: str = "",
30 | config_path: str = "",
31 | cred_provider: Any = None,
32 | ):
33 | """
34 | Initializes the connector with endpoint and optional credential information.
35 |
36 | Args:
37 | endpoint(str): The OSS endpoint to connect to.
38 | cred_path(str, optional): Path to the credential file. Defaults to "".
39 | config_path(str, optional): Path to the configuration file. Defaults to "".
40 | cred_provider(Any, optional): Credential provider. Defaults to None.
41 |
42 | Raises:
43 | ValueError: If endpoint or credential is not provided.
44 | """
45 | if not endpoint:
46 | raise ValueError("endpoint must be non-empty")
47 | if cred_provider is None and not cred_path:
48 | raise ValueError("Either cred_path or cred_provider must be provided")
49 |
50 | self._endpoint = endpoint
51 | if not cred_path:
52 | self._cred_path = ""
53 | else:
54 | self._cred_path = cred_path
55 | if not config_path:
56 | self._config_path = ""
57 | else:
58 | self._config_path = config_path
59 | self._cred_provider = cred_provider
60 |
61 | self._real_connector = None
62 | self._hook_dir = ''
63 | self._origin_from_file = torch.UntypedStorage.from_file
64 | self._origin_open = builtins.open
65 |
66 | def __del__(self):
67 | self.close()
68 | @property
69 | def _connector(self):
70 | if self._real_connector is None:
71 | if self._cred_provider is not None:
72 | self._real_connector = new_oss_connector(self._endpoint, self._cred_provider, self._config_path)
73 | else:
74 | self._real_connector = new_oss_connector(self._endpoint, self._cred_path, self._config_path)
75 |
76 | return self._real_connector
77 |
78 | def close(self):
79 | """
80 | Close the connector and release resources.
81 | """
82 | try:
83 | if self._hook_dir:
84 | self._hook_dir = ''
85 |
86 | if builtins.open == self._connector_open:
87 | builtins.open = self._origin_open
88 |
89 | if torch.UntypedStorage.from_file == self._from_file_helper:
90 | torch.UntypedStorage.from_file = self._origin_from_file
91 |
92 | if self._real_connector is not None:
93 | del self._real_connector
94 | self._real_connector = None
95 | except:
96 | print("exception in close, ignore")
97 |
98 | def open(self, uri, binary = True):
99 | """
100 | Opens an object from OSS storage.
101 |
102 | Args:
103 | uri(str): The uri (oss://{bucket}/{object_name}) of the object to open.
104 | binary(bool): Flag indicating whether to open in binary mode or not.
105 |
106 | Returns:
107 | Stream-like object of the opened OSS object.
108 | """
109 | return self._connector.open(uri, True, True, binary)
110 |
111 | def _from_file_helper(self, filename, shared, nbytes):
112 | if self._hook_dir and filename.startswith(self._hook_dir):
113 | file = self._connector.open(filename, True, True)
114 | return UntypedStorageEx(file, nbytes)
115 | else:
116 | return self._origin_from_file(filename, shared, nbytes)
117 |
118 | def _connector_open(self, file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None):
119 | if isinstance(file, pathlib.Path):
120 | file = str(file)
121 | if self._hook_dir and file.startswith(self._hook_dir):
122 | binary = False
123 | if 'b' in mode:
124 | binary = True
125 | try:
126 | return self.open(file, binary)
127 | except:
128 | return self._origin_open(file, mode, buffering, encoding, errors, newline, closefd, opener)
129 | else:
130 | return self._origin_open(file, mode, buffering, encoding, errors, newline, closefd, opener)
131 |
    def prepare_directory(self, uri: str, dir: str, libc_hook: bool = False):
        """
        Prepare the directory from OSS storage, which can be used as directory 'dir' in vllm/transformers or other frameworks.

        Args:
            uri(str): The URI (oss://{bucket}/{directory}) of the OSS directory.
            dir(str): The local directory used for vllm/transformers or other frameworks.
            libc_hook (bool): Flag to enable libc hooking; when False,
                Python-level hooks on builtins.open and
                torch.UntypedStorage.from_file are installed instead.

        Raises:
            RuntimeError: If prepare directory failed.
        """
        # Normalize with a trailing slash so the startswith() checks in the
        # hooks only match whole path components.
        if not dir.endswith('/'):
            dir += '/'
        self._connector.prepare_directory(uri, dir, libc_hook)
        if not libc_hook:
            # Install the Python-level hooks; _hook_dir is assigned last so
            # the hooks stay inert until everything is patched.
            builtins.open = self._connector_open
            torch.UntypedStorage.from_file = self._from_file_helper
            self._hook_dir = dir
151 |
152 | def list(self, bucket: str, prefix: str, fast: bool = False):
153 | """
154 | Lists objects in a specified OSS bucket with a given prefix.
155 |
156 | Args:
157 | bucket(str): The OSS bucket name.
158 | prefix(str): The prefix filter for object listing.
159 | fast (bool): If true, enables fast list mode.
160 |
161 | Returns:
162 | List: A list of objects matching the bucket and prefix criteria.
163 | """
164 | return self._connector.list(bucket, prefix, fast)
165 |
--------------------------------------------------------------------------------
/oss-model-connector/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "ossmodelconnector"
7 | version = "1.0.0rc1"
8 | description = "OSS model connector for AI/ML"
9 | requires-python = ">=3.8,<3.13"
10 | readme = "README.md"
11 | dependencies = [
12 | "torch >= 2.0",
13 | ]
14 | classifiers = [
15 | "Development Status :: 4 - Beta",
16 | "Intended Audience :: Developers",
17 | "Topic :: Utilities",
18 | "License :: OSI Approved :: MIT License",
19 | "Operating System :: POSIX :: Linux",
20 |
21 | "Programming Language :: Python :: 3",
22 | "Programming Language :: Python :: 3.8",
23 | "Programming Language :: Python :: 3.9",
24 | "Programming Language :: Python :: 3.10",
25 | "Programming Language :: Python :: 3.11",
26 | "Programming Language :: Python :: 3.12",
27 | ]
28 |
29 | [tool.setuptools.packages.find]
30 | where = ["."]
31 | include = ["ossmodelconnector", "ossmodelconnector._oss_connector"]
32 |
33 | [tool.setuptools.package-data]
34 | ossmodelconnector = ["_oss_connector/*.so"]
35 |
--------------------------------------------------------------------------------
/oss-model-connector/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension
2 | from setuptools.command.build_ext import build_ext
3 | import os
4 | import subprocess
5 | import shutil
6 |
7 |
class BuildExtension(Extension):
    """Extension stub whose 'source' is the directory holding a prebuilt
    native library; no compilation happens (see LibraryBuild)."""

    def __init__(self, name, source_dir=''):
        super().__init__(name, sources=[source_dir])
        self.source_dir = os.path.abspath(source_dir)
12 |
class LibraryBuild(build_ext):
    """Custom build_ext that ships a prebuilt native library instead of
    compiling sources: the .so given via --library-path is copied into the
    package's _oss_connector directory inside the build tree.
    """

    user_options = build_ext.user_options + [
        ('library-path=', None, 'oss_connector library path'),
    ]

    def initialize_options(self):
        super().initialize_options()
        self.library_path = None

    def run(self):
        # Validate the mandatory --library-path option before "building".
        if not self.library_path:
            raise RuntimeError("library path is not specified by '--library-path'")
        self.library_path = os.path.abspath(self.library_path)
        if os.path.exists(self.library_path):
            print('library path:', self.library_path)
        else:
            raise RuntimeError("invalid library path: " + self.library_path)
        for ext in self.extensions:
            self.build_extension(ext)

    def run_command(self, command, cwd):
        """Run `command` in `cwd`, surfacing stdout/stderr on failure.

        NOTE(review): this shadows distutils.cmd.Command.run_command(command),
        which takes a single command *name*; if setuptools internals ever call
        self.run_command('build_py') this will break -- consider renaming.

        Raises:
            RuntimeError: If the subprocess exits with a non-zero status.
        """
        try:
            subprocess.run(command, capture_output=True, text=True, check=True, cwd=cwd)
        except subprocess.CalledProcessError as e:
            print(f"Command '{' '.join(command)}' failed with exit code {e.returncode}")
            print(f"Stdout: {e.stdout}")
            print(f"Stderr: {e.stderr}")
            raise RuntimeError("Subprocess execution failed") from e

    def build_extension(self, ext):
        """Copy the prebuilt library into the build tree for packaging."""
        print('name:', ext.name)
        print('source path:', ext.source_dir)
        print('current dir:', os.getcwd())

        # copy .so into <build_lib>/ossmodelconnector/_oss_connector/
        library_file_name = os.path.basename(self.library_path)
        dest_so_path = os.path.abspath(
            os.path.join(self.build_lib, 'ossmodelconnector', '_oss_connector', library_file_name))
        # Fix: ensure the destination directory exists -- with build isolation
        # the package tree may not have been copied into build_lib yet, and
        # shutil.copy would fail with FileNotFoundError.
        os.makedirs(os.path.dirname(dest_so_path), exist_ok=True)
        print('copy %s to %s' % (self.library_path, dest_so_path))
        shutil.copy(self.library_path, dest_so_path)
51 |
52 |
# The Extension is a placeholder: LibraryBuild does not compile anything,
# it just copies the prebuilt oss_model_connector library supplied through
# the --library-path option into the package.
setup(
    ext_modules=[BuildExtension('oss_model_connector', '.')],
    cmdclass=dict(build_ext=LibraryBuild),
)
57 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/__init__.py:
--------------------------------------------------------------------------------
1 | from .oss_iterable_dataset import OssIterableDataset
2 | from .oss_map_dataset import OssMapDataset
3 | from .oss_checkpoint import OssCheckpoint
4 | from ._oss_client import OssClient
5 | from ._oss_bucket_iterable import imagenet_manifest_parser
6 | from ._oss_tar_iterable import generate_tar_archive
7 |
# Public API of the osstorchconnector package.
__all__ = [
    "OssIterableDataset",
    "OssMapDataset",
    "OssCheckpoint",
    "OssClient",
    "imagenet_manifest_parser",
    "generate_tar_archive",
]
16 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/_oss_bucket_iterable.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator, Iterable, Union, Tuple, Callable
2 | from ._oss_client import OssClient, DataObject
3 | from ._oss_connector import new_data_object
4 | import logging
5 | import io
6 |
7 | log = logging.getLogger(__name__)
8 |
def identity(obj: DataObject) -> DataObject:
    """Default transform: return a copy of the object, or None for None."""
    return obj.copy() if obj is not None else None
14 |
def parse_oss_uri(uri: str) -> Tuple[str, str]:
    """Split an OSS URI (oss://bucket/prefix) or absolute path (/bucket/prefix)
    into a (bucket, prefix) pair; prefix is "" when absent.

    Raises:
        ValueError: If the URI has no supported scheme or no bucket name.
    """
    if not uri or not (uri.startswith("oss://") or uri.startswith("/")):
        raise ValueError("Only oss:// URIs are supported")
    # Drop the scheme or the leading slash.
    remainder = uri[len("oss://"):] if uri.startswith("oss://") else uri[1:]
    if not remainder:
        raise ValueError("Bucket name must be non-empty")
    bucket, _, prefix = remainder.partition("/")
    if not bucket:
        raise ValueError("Bucket name must be non-empty")
    return bucket, prefix
33 |
def imagenet_manifest_parser(reader: io.IOBase) -> Iterable[Tuple[str, str]]:
    """Parse a TSV ImageNet-style manifest into (key, label) pairs.

    Each line is "key<TAB>label"; lines with no tab yield (key, "") and
    extra columns beyond the first two are ignored.  Malformed lines are
    logged and skipped rather than aborting the scan.

    Args:
        reader: Binary stream containing the UTF-8 manifest.

    Yields:
        Tuple[str, str]: (object key, label) per manifest line.
    """
    lines = reader.read().decode("utf-8").strip().split("\n")
    for i, line in enumerate(lines):
        try:
            items = line.strip().split('\t')
            if len(items) >= 2:
                yield (items[0], items[1])
            elif len(items) == 1:
                yield (items[0], '')
            else:
                # Defensive: str.split always returns at least one element,
                # so this branch should be unreachable; kept for safety.
                raise ValueError("format error")
        except ValueError as e:
            # Fix: use the module logger `log` (consistent with the rest of
            # this module) instead of the root logger via logging.error.
            log.error(f"Error: {e} for line {i}: {line}")
50 |
51 |
class OssBucketIterable:
    """Iterable over OSS objects, sourced from explicit URIs, a prefix
    listing, or a manifest file.

    Instances are created through the ``from_uris``, ``from_prefix`` or
    ``from_manifest_file`` factory methods; exactly one source is set.
    Iterating re-creates the underlying listing stream, so the iterable can
    be traversed multiple times.
    """

    def __init__(self, client: OssClient, *,
                 oss_uri: str = None,
                 object_uris: Iterable[str] = None,
                 preload: bool = False,
                 manifest_file_path: str = None,
                 manifest_parser: Callable[[io.IOBase], Iterable[Tuple[str, str]]] = None,
                 oss_base_uri: str = None):
        log.info("OssBucketIterable init")
        self._client = client
        self._oss_uri = oss_uri
        self._object_uris = object_uris
        self._preload = preload
        self._manifest_file_path = manifest_file_path
        self._manifest_parser = manifest_parser
        self._oss_base_uri = oss_base_uri
        self._data_objects: Iterable[DataObject] = None

    @classmethod
    def from_uris(cls, object_uris: Union[str, Iterable[str]], client: OssClient, preload: bool = False):
        """Build an iterable from one object URI or an iterable of URIs."""
        if not object_uris:
            raise ValueError("object_uris must be non-empty")
        if isinstance(object_uris, str):
            object_uris = [object_uris]
        return cls(client, object_uris=object_uris, preload=preload)

    @classmethod
    def from_prefix(cls, oss_uri: str, client: OssClient, preload: bool = False):
        """Build an iterable over all objects under an oss:// prefix."""
        if not oss_uri:
            raise ValueError("oss_uri must be non-empty")
        if not oss_uri.startswith("oss://"):
            raise ValueError("only oss:// uri are supported")
        return cls(client, oss_uri=oss_uri, preload=preload)

    @classmethod
    def from_manifest_file(cls, manifest_file_path: str, manifest_parser: Callable[[io.IOBase], Iterable[Tuple[str, str]]],
                           oss_base_uri: str, client: OssClient, preload: bool = False):
        """Build an iterable from a manifest file of (key, label) entries."""
        if not manifest_file_path:
            raise ValueError("manifest_file_path must be non-empty")
        if not manifest_parser:
            raise ValueError("manifest_parser must be non-empty")
        return cls(client, manifest_file_path=manifest_file_path, manifest_parser=manifest_parser,
                   oss_base_uri=oss_base_uri, preload=preload)

    def _get_data_object_by_manifest(self) -> Iterator[DataObject]:
        # The manifest itself may live either in OSS or on the local disk.
        if self._manifest_file_path.startswith("oss://"):
            ibucket, ikey = parse_oss_uri(self._manifest_file_path)
            with self._client.get_object(ibucket, ikey, type=0) as manifest_file:
                for key, label in self._manifest_parser(manifest_file):
                    yield new_data_object(self._oss_base_uri + key, 0, label)
        else:
            with open(self._manifest_file_path, "rb") as manifest_file:
                for key, label in self._manifest_parser(manifest_file):
                    yield new_data_object(self._oss_base_uri + key, 0, label)

    def __iter__(self) -> Iterator[DataObject]:
        # This allows us to iterate multiple times by re-creating the `_list_stream`
        if self._object_uris is not None:
            log.info("OssBucketIterable get iter by object uris")
            self._data_objects = [new_data_object(uri, 0, "") for uri in self._object_uris]
            return iter(OssBucketObjectsIterator(self._client, self._data_objects, self._preload))
        elif self._manifest_file_path is not None and self._manifest_parser is not None:
            log.info("OssBucketIterable get iter by manifest file: %s", self._manifest_file_path)
            self._data_objects = self._get_data_object_by_manifest()
            return iter(OssBucketObjectsIterator(self._client, self._data_objects, self._preload))
        elif self._oss_uri is not None:
            log.info("OssBucketIterable get iter by oss prefix: %s", self._oss_uri)
            return iter(OssBucketPrefixIterator(self._client, self._oss_uri, self._preload))
        else:
            # Fix: previously returned None, which made iteration fail with
            # an opaque "iter() returned non-iterator" TypeError; raise a
            # descriptive error instead.
            log.error("OssBucketIterable get iter failed")
            raise ValueError("OssBucketIterable has no data source configured")
123 |
124 |
class OssBucketObjectsIterator:
    """Iterator over a fixed collection of DataObjects, optionally preloaded."""

    def __init__(self, client: OssClient, objects: Iterable[DataObject], preload: bool):
        # Fix: removed the incorrect '-> Iterator[DataObject]' return
        # annotation from __init__ (initializers always return None).
        log.info("OssBucketObjectsIterator init")
        if preload:
            self._list_stream = iter(client.list_objects_from_uris_with_preload(objects))
        else:
            self._list_stream = iter(objects)  # map does not need pass objects to client for now

    def __iter__(self) -> Iterator[DataObject]:
        log.info("OssBucketObjectsIterator get iter")
        return self._list_stream
136 |
137 |
class OssBucketPrefixIterator:
    """Iterator over all objects found under a bucket/prefix listing."""

    def __init__(self, client: OssClient, oss_uri: str, preload: bool):
        log.info("OssBucketPrefixIterator init")
        bucket, prefix = parse_oss_uri(oss_uri)
        # Pick the preloading listing when requested, plain listing otherwise.
        lister = client.list_objects_with_preload if preload else client.list_objects
        self._list_stream = iter(lister(bucket, prefix))

    def __iter__(self) -> Iterator[DataObject]:
        log.info("OssBucketPrefixIterator get iter")
        return self._list_stream
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/_oss_client.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Iterator, Iterable, Any
3 | import logging
4 |
5 | log = logging.getLogger(__name__)
6 |
7 | from ._oss_connector import (
8 | DataSet,
9 | DataObject,
10 | new_oss_dataset
11 | )
12 |
13 | O_MULTI_PART = 0x40000000 # oss multi-part upload
14 |
15 | """
16 | _oss_client.py
17 | Internal client wrapper class on top of OSS client interface
18 | with multi-process support.
19 | """
20 |
class OssClient:
    """Lazy, fork-aware wrapper around the native OSS ``DataSet`` client.

    The native client does not survive ``fork()``; the ``_client`` property
    therefore rebuilds it whenever it is accessed from a process other than
    the one that created it (e.g. DataLoader worker processes).
    """

    def __init__(self, endpoint: str, cred_path: str = "", config_path: str = "", uuid: str = "", id: int = 0, total: int = 1, cred_provider: Any = None):
        # Connection parameters are stored only; the native client is
        # created lazily on first use (see the `_client` property).
        self._endpoint = endpoint
        self._cred_path = cred_path
        self._config_path = config_path
        self._uuid = uuid
        self._real_client = None   # native DataSet handle, built per process
        self._client_pid = None    # pid of the process that built it
        self._id = id              # this worker's shard index
        self._total = total        # total number of worker shards
        self._cred_provider = cred_provider

    @property
    def _client(self) -> DataSet:
        """Return the native client, (re)building it after a fork."""
        if self._client_pid is None or self._client_pid != os.getpid():
            # The OSS client does NOT survive forking -- rebuild in this process.
            if self._client_pid != os.getpid() and self._real_client is not None:
                log.info("OssClient delete dataset")
                # del self._real_client  -- intentionally left to GC; freeing
                # the parent's native handle from a forked child is unsafe.
            self._client_pid = os.getpid()
            self._real_client = self._client_builder()
        return self._real_client

    def _client_builder(self) -> DataSet:
        """Construct a fresh native DataSet for this process/shard."""
        log.info("OssClient new_oss_dataset, id %d, total %d", self._id, self._total)
        return new_oss_dataset(self._endpoint, self._cred_path, self._cred_provider, self._config_path, str(self._uuid), self._id, self._total)

    def get_object(self, bucket: str, key: str, size: int = 0, type: int = 0, label: str = "") -> DataObject:
        """Open a read-only object stream; `type` selects the native open mode."""
        return self._client.open_ro(bucket, key, size, type, label)

    def put_object(self, bucket: str, key: str) -> DataObject:
        """Open a write-only object stream for uploading to OSS."""
        return self._client.open_wo(bucket, key)

    def list_objects(self, bucket: str, prefix: str = "") -> Iterator[DataObject]:
        """List objects in `bucket` whose keys start with `prefix`."""
        log.debug("OssClient list_objects")
        return self._client.list(bucket, prefix)

    def list_objects_with_preload(self, bucket: str, prefix: str = "") -> Iterator[DataObject]:
        """List objects under `prefix`, preloading their contents."""
        log.debug("OssClient list_objects_with_preload")
        return self._client.list_with_preload(bucket, prefix)

    def list_objects_from_uris(self, object_uris: Iterable, prefetch: bool = False, include_errors: bool = False) -> Iterator[DataObject]:
        """Resolve an iterable of object URIs into DataObjects."""
        log.debug("OssClient list_objects_from_uris")
        return self._client.list_from_uris(object_uris, prefetch, include_errors)

    def list_objects_from_uris_with_preload(self, object_uris: Iterable) -> Iterator[DataObject]:
        """Resolve object URIs into DataObjects, preloading their contents."""
        log.debug("OssClient list_objects_from_uris_with_preload")
        return self._client.list_from_uris_with_preload(object_uris)

    def list_objects_from_tar(self, bucket: str, tar_key: str, index_key: str, chunks: Iterable = [], sizes: Iterable = [],
                              prefetch: bool = False, include_errors: bool = False) -> Iterator[DataObject]:
        """List the entries of an OSS tar archive through its index file.

        NOTE(review): `chunks`/`sizes` are mutable default arguments; they are
        never mutated here, but callers should not rely on their identity.
        """
        log.debug("OssClient list_objects_from_tar")
        return self._client.list_from_tar(bucket, tar_key, index_key, chunks, sizes, prefetch, include_errors)

    def gen_tar_archive(self, tar_path: str, index_path: str, source_path: str, index_only: bool = False) -> int:
        """Generate a tar archive and/or its index (paths may be OSS URIs or local)."""
        return self._client.gen_tar_archive(tar_path, index_path, source_path, index_only)
77 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/_oss_connector/__init__.py:
--------------------------------------------------------------------------------
1 | from .oss_connector import (
2 | DataSet,
3 | DataObject,
4 | new_oss_dataset,
5 | new_data_object
6 | )
7 |
# Names re-exported from the native oss_connector extension module.
__all__ = ["DataSet", "DataObject", "new_oss_dataset", "new_data_object"]
9 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/_oss_connector/oss_connector.pyi:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Iterator, Any
2 |
3 |
class DataObject:
    """Stub for the native file-like handle to a single OSS object.

    Supports the context-manager and binary-stream protocols so it can be
    passed to code expecting a file object (e.g. torch.load / torch.save).
    """
    key: str    # object key (URI/path within the dataset)
    size: int   # object size in bytes
    label: str  # optional label attached by dataset listings

    def __enter__(self) -> DataObject: ...
    def __exit__(self, exc_type, exc_val, exc_tb): ...
    def tell(self) -> int: ...
    def seek(self, offset: int, whence: int) -> int: ...
    def read(self, count: int) -> bytes: ...
    def readinto(self, buf) -> int: ...
    def write(self, data) -> int: ...
    def close(self) -> int: ...
    def flush(self) -> int: ...
    def err(self) -> int: ...
    def error_msg(self) -> str: ...
    def copy(self) -> DataObject: ...
21 |
22 |
class DataSet:
    """Stub for the native OSS dataset client: listing, opening, and
    tar-archive helpers (implemented in the C extension)."""

    def list(self, bucket: str, prefix: str) -> Iterator[DataObject]: ...
    def list_with_preload(self, bucket: str, prefix: str) -> Iterator[DataObject]: ...
    def list_from_uris(self, iter: Iterable, prefetch: bool, include_errors: bool) -> Iterator[DataObject]: ...
    def list_from_uris_with_preload(self, iter: Iterable) -> Iterator[DataObject]: ...
    def list_from_tar(self, bucket: str, tar_key: str, index_key: str, chunks: Iterable, sizes: Iterable,
                      prefetch: bool, include_errors: bool) -> Iterator[DataObject]: ...
    def open_ro(self, bucket: str, key: str, size: int, mmap: int, label: str) -> DataObject: ...
    def open_wo(self, bucket: str, key: str) -> DataObject: ...
    def gen_tar_archive(self, tar_path: str, index_path: str, source_path: str, index_only: bool) -> int: ...
33 |
34 |
def new_oss_dataset(endpoint: str, cred_path: str, cred_provider: Any, config_path: str, uuid: str, id: int, total: int) -> DataSet:
    """Create a native DataSet client; `id`/`total` select this worker's shard."""
    ...
37 |
38 |
def new_data_object(key: str, size: int, label: str) -> DataObject:
    """Create a detached DataObject descriptor (not yet opened for I/O)."""
    ...
41 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/_oss_tar_iterable.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator, List, Tuple, Any
2 | from ._oss_client import OssClient, DataObject
3 | from ._oss_bucket_iterable import parse_oss_uri
4 | import logging
5 |
6 | log = logging.getLogger(__name__)
7 |
class OssTarIterable:
    """Iterable over the entries of an OSS tar archive, located via its
    index file.  Created through :meth:`from_tar`.  Iterating re-creates the
    underlying listing stream, so the iterable can be traversed repeatedly.
    """

    def __init__(self, client: OssClient, *,
                 tar_uri: str = None,
                 tar_index_uri: str = None,
                 preload: bool = False,
                 chunks: List[Tuple[int, int]] = None):
        log.info("OssTarIterable init, preload: %s", preload)
        self._client = client
        self._tar_uri = tar_uri
        self._tar_index_uri = tar_index_uri
        self._preload = preload
        # Fix: the default was a shared mutable list (`chunks=[]`);
        # normalize a None sentinel to a fresh list per instance instead.
        self._chunks = chunks if chunks is not None else []
        self._list_stream = None

    @classmethod
    def from_tar(cls, tar_uri: str, tar_index_uri: str, client: OssClient, preload: bool = False,
                 chunks: List[Tuple[int, int]] = None):
        """Build an iterable from a tar archive and its index (both oss:// URIs).

        Args:
            tar_uri(str): OSS URI of the tar archive.
            tar_index_uri(str): OSS URI of the corresponding tar index file.
            client(OssClient): Client used to access OSS.
            preload(bool): Preload entry contents while listing.
            chunks(List[Tuple[int, int]]): Optional (start, size) ranges to read;
                None/empty means the whole archive.

        Raises:
            ValueError: If either URI is missing or not an oss:// URI.
        """
        if not tar_uri:
            raise ValueError("tar_uri must be non-empty")
        if not tar_uri.startswith("oss://"):
            raise ValueError("only oss:// uri are supported for tar_uri")
        if not tar_index_uri:
            raise ValueError("tar_index_uri must be non-empty")
        if not tar_index_uri.startswith("oss://"):
            raise ValueError("only oss:// uri are supported for tar_index_uri")
        return cls(client, tar_uri=tar_uri, tar_index_uri=tar_index_uri, preload=preload,
                   chunks=chunks)

    def _make_objects_iterator(self):
        # Single place that knows how to (re)build the listing stream;
        # shared by __iter__ and __len__.
        return OssTarObjectsIterator(self._client, self._tar_uri, self._tar_index_uri, self._preload,
                                     chunks=self._chunks)

    def __iter__(self) -> Iterator[DataObject]:
        # Re-create `_list_stream` so the dataset can be iterated repeatedly.
        self._list_stream = self._make_objects_iterator()
        return iter(self._list_stream)

    def __len__(self):
        if self._list_stream is None:
            self._list_stream = self._make_objects_iterator()
        return len(self._list_stream)
47 |
48 |
class OssTarObjectsIterator:
    """Iterator over tar-archive entries listed through the OSS client."""

    def __init__(self, client: OssClient, tar_uri: str, tar_index_uri: str, preload: bool,
                 chunks: List[Tuple[int, int]] = None):
        """Open the listing stream for the given tar archive and index.

        Raises:
            ValueError: If the tar and its index live in different buckets.
        """
        log.info("OssTarObjectsIterator init")
        tar_bucket, tar_key = parse_oss_uri(tar_uri)
        index_bucket, index_key = parse_oss_uri(tar_index_uri)
        if tar_bucket != index_bucket:
            raise ValueError("tar_uri and tar_index_uri must be in the same bucket")
        # Split (start, size) pairs into the parallel lists the client
        # expects.  Fix: default was a shared mutable list (`chunks=[]`);
        # a None/empty value means "the whole archive".
        starts = [start for start, _ in chunks] if chunks else []
        sizes = [size for _, size in chunks] if chunks else []
        self._list_stream = client.list_objects_from_tar(tar_bucket, tar_key, index_key, prefetch=preload,
                                                         chunks=starts, sizes=sizes)

    def __iter__(self) -> Iterator[DataObject]:
        log.info("OssTarObjectsIterator get iter")
        return iter(self._list_stream)

    def __len__(self):
        return len(self._list_stream)
68 |
69 |
def generate_tar_archive(endpoint: str, cred_path: str, config_path: str, tar_path: str,
                         index_path: str, source_path: str, index_only: bool = False,
                         cred_provider: Any = None):
    """Generate a tar archive and its index.

    Args:
        endpoint(str): Endpoint of the OSS bucket where the objects are stored.
        cred_path(str): Credential info of the OSS bucket where the objects are stored.
        config_path(str): Configuration file path of the OSS connector.
        tar_path(str): Path to the tar archive (OSS URI or local path).
        index_path(str): Path to the tar index (OSS URI or local path).
        source_path(str): Path to the source directory (OSS URI or local path).
        index_only(bool): When True, only build the index from the existing
            archive at `tar_path`; otherwise (default) build both the archive
            and its index from `source_path`.
        cred_provider: OSS credential provider (alternative to `cred_path`).

    Raises:
        ValueError: If `endpoint` is empty, or neither credential source is given.
    """
    if not endpoint:
        raise ValueError("endpoint must be non-empty")
    if not (cred_path or cred_provider):
        raise ValueError("neither cred_path nor cred_provider is specified")
    client = OssClient(endpoint, cred_path, config_path, cred_provider=cred_provider)
    return client.gen_tar_archive(tar_path, index_path, source_path, index_only)
93 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/oss_checkpoint.py:
--------------------------------------------------------------------------------
1 | from ._oss_bucket_iterable import parse_oss_uri
2 | from ._oss_client import OssClient, DataObject
3 | from ctypes import *
4 | from typing import Any
5 |
class OssCheckpoint:
    """Read and write checkpoints stored in OSS.

    ``reader``/``writer`` return ``DataObject`` streams addressed by an
    oss:// URI; the streams can be passed directly to ``torch.load`` and
    ``torch.save`` respectively.
    """

    def __init__(
        self,
        endpoint: str,
        cred_path: str = "",
        config_path: str = "",
        cred_provider: Any = None,
    ):
        if not endpoint:
            raise ValueError("endpoint must be non-empty")
        self._endpoint = endpoint
        # Empty strings stand in for "not configured".
        self._cred_path = cred_path or ""
        self._config_path = config_path or ""
        self._cred_provider = cred_provider
        self._client = OssClient(self._endpoint, self._cred_path, self._config_path, cred_provider=self._cred_provider)

    def reader(self, oss_uri: str):
        """Open the OSS object at `oss_uri` for reading.

        Args:
            oss_uri (str): A valid OSS URI (oss://{bucket}/{key}).

        Returns:
            DataObject: a read-only binary stream of the object's contents.
        """
        bucket, key = parse_oss_uri(oss_uri)
        return self._client.get_object(bucket, key, type=1)

    def writer(self, oss_uri: str) -> DataObject:
        """Open a write-only stream that saves its content to `oss_uri`.

        Args:
            oss_uri (str): A valid OSS URI (oss://{bucket}/{key}).

        Returns:
            DataObject: a write-only binary stream uploaded to OSS.
        """
        bucket, key = parse_oss_uri(oss_uri)
        return self._client.put_object(bucket, key)
60 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/oss_iterable_dataset.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | from typing import Iterator, Any, Union, Iterable, Callable, Tuple
3 | import io
4 | import torch.utils.data
5 | import uuid
6 | import logging
7 | import random
8 |
9 | from ._oss_client import OssClient, DataObject
10 | from ._oss_bucket_iterable import OssBucketIterable, identity
11 | from ._oss_tar_iterable import OssTarIterable
12 |
13 | log = logging.getLogger(__name__)
14 |
15 | class OssIterableDataset(torch.utils.data.IterableDataset):
16 | """An IterableStyle dataset created from OSS objects.
17 |
18 | To create an instance of OssIterableDataset, you need to use
19 | `from_prefix`, `from_objects`, `from_manifest_file` or `from_tar` methods.
20 | """
21 |
    def __init__(
        self,
        endpoint: str,
        cred_path: str,
        config_path: str,
        get_dataset_objects: Callable[[OssClient], Iterable[DataObject]],
        transform: Callable[[DataObject], Any] = identity,
        cred_provider: Any = None,
        from_tar: bool = False,
        shuffle: bool = False,
        shuffle_chunk_size: int = 1000,
    ):
        """Initialize the dataset.

        Args:
            endpoint: OSS endpoint; must be non-empty.
            cred_path: Credential info path ("" to rely on cred_provider).
            config_path: Connector configuration file path ("" for defaults).
            get_dataset_objects: Factory that, given an OssClient, yields the
                dataset's DataObjects (bound via functools.partial in the
                from_* classmethods).
            transform: Callable mapping each DataObject to a sample.
            cred_provider: OSS credential provider object.
            from_tar: Whether objects come from a tar archive.
            shuffle: Tar-only; shuffle over chunks of the archive.
            shuffle_chunk_size: Number of entries per shuffle chunk.

        Raises:
            ValueError: If `endpoint` is empty.
        """
        self._uuid = uuid.uuid4()
        self._endpoint = endpoint
        log.info("OssIterableDataset init, uuid: %s, endpoint: %s", self._uuid, self._endpoint)
        if not endpoint:
            raise ValueError("endpoint must be non-empty")
        if not cred_path:
            self._cred_path = ""
        else:
            self._cred_path = cred_path
        self._cred_provider = cred_provider
        if not config_path:
            self._config_path = ""
        else:
            self._config_path = config_path
        self._get_dataset_objects = get_dataset_objects
        self._transform = transform
        self._client = None
        self._from_tar = from_tar
        self._shuffle = shuffle
        self._chunk_size = shuffle_chunk_size
        if from_tar and shuffle:
            # Shuffling needs the dataset size up front: list once without
            # preload (the explicit preload=False overrides the keyword bound
            # in the partial), then let shuffle() build the chunk list.
            self._bucket_objects = self._get_dataset_objects(self._get_client(0, 1), preload=False)
            self._dataset_size = len(self._bucket_objects)
            self.shuffle()
        else:
            self._bucket_objects = None
60 |
    @classmethod
    def from_objects(
        cls,
        object_uris: Union[str, Iterable[str]],
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssIterableDataset using the OSS URI(s) provided.

        Args:
            object_uris(str | Iterable[str]): OSS URI of the object(s) desired.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssIterableDataset: An IterableStyle dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_objects")
        # The factory is bound now (with preload enabled) and invoked with a
        # client each time the dataset is iterated.
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_uris, object_uris, preload=True),
            transform=transform, cred_provider=cred_provider
        )
90 |
    @classmethod
    def from_prefix(
        cls,
        oss_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssIterableDataset using the OSS URI provided.

        Args:
            oss_uri(str): An OSS URI (prefix) of the object(s) desired. Objects matching the prefix will be included in the returned dataset.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssIterableDataset: An IterableStyle dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_prefix")
        # The factory is bound now (with preload enabled) and invoked with a
        # client each time the dataset is iterated.
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_prefix, oss_uri, preload=True),
            transform=transform, cred_provider=cred_provider
        )
120 |
    @classmethod
    def from_manifest_file(
        cls,
        manifest_file_path: str,
        manifest_parser: Callable[[io.IOBase], Iterable[Tuple[str, str]]],
        oss_base_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssIterableDataset using manifest file provided.

        Args:
            manifest_file_path(str): OSS URI or local path of manifest file.
            manifest_parser: A callable which takes an io.IOBase object and returns an iterable of (object_uri, label).
            oss_base_uri(str): The base URI prepended to each object key found in the manifest file.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssIterableDataset: An IterableStyle dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_manifest_file")
        # The factory is bound now (with preload enabled) and invoked with a
        # client each time the dataset is iterated.
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_manifest_file, manifest_file_path, manifest_parser, oss_base_uri, preload=True),
            transform=transform, cred_provider=cred_provider
        )
154 |
    @classmethod
    def from_tar(
        cls,
        tar_uri: str,
        tar_index_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
        shuffle: bool = False,
        shuffle_chunk_size: int = 1000,
    ):
        """Returns an instance of OssIterableDataset using tar file provided.

        Args:
            tar_uri(str): OSS URI of tar archive.
            tar_index_uri(str): OSS URI of tar index file corresponding to tar archive.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            shuffle(bool): Whether to shuffle the dataset.
            shuffle_chunk_size(int): Size of chunks to shuffle over.
            cred_provider: OSS credential provider.

        Returns:
            OssIterableDataset: An IterableStyle dataset created from tar file.
        """
        log.info(f"Building {cls.__name__} from_tar")
        # The factory is bound now (with preload enabled) and invoked with a
        # client each time the dataset is iterated.
        return cls(
            endpoint, cred_path, config_path, partial(OssTarIterable.from_tar, tar_uri, tar_index_uri, preload=True),
            transform=transform, cred_provider=cred_provider, from_tar=True, shuffle=shuffle, shuffle_chunk_size=shuffle_chunk_size
        )
190 |
    def _get_client(self, id, total):
        """Return the cached per-dataset OssClient, creating it on first use.

        `id`/`total` identify this worker's shard.  They are refreshed on the
        cached client because DataLoader workers may reuse the instance with
        a different worker id; the native client is rebuilt lazily and reads
        these fields at build time (see OssClient._client_builder).
        """
        if self._client is None:
            self._client = OssClient(self._endpoint, self._cred_path, self._config_path, self._uuid, id, total, cred_provider=self._cred_provider)
            log.info("OssIterableDataset new client")
        self._client._id = id
        self._client._total = total
        return self._client
198 |
199 | def _get_transformed_object(self, object: DataObject) -> Any:
200 | return self._transform(object)
201 |
202 | def __iter__(self) -> Iterator[Any]:
203 | worker_info = torch.utils.data.get_worker_info()
204 |
205 | if worker_info is None: # single-process data loading, return the full iterator
206 | log.info("OssIterableDataset get iter (single-process)")
207 | if self._from_tar and self._shuffle:
208 | if len(self._chunks) >= 1:
209 | chunks = self._chunks
210 | else:
211 | chunks = []
212 | log.info("OssIterableDataset chunk num: %d", len(chunks))
213 | worker_iter = self._get_dataset_objects(self._get_client(0, 1), chunks=chunks)
214 | else:
215 | worker_iter = self._get_dataset_objects(self._get_client(0, 1))
216 | else: # in a worker process, split workload
217 | num_workers = worker_info.num_workers
218 | worker_id = worker_info.id
219 | log.info("OssIterableDataset get iter (multi-process), num_workers: %d, worker id: %d", num_workers, worker_id)
220 | if self._from_tar and self._shuffle:
221 | if len(self._chunks) >= num_workers:
222 | chunks = [chunk for i, chunk in enumerate(self._chunks) if i % num_workers == worker_id]
223 | else:
224 | chunks = []
225 | log.info("OssIterableDataset chunk num: %d", len(chunks))
226 | worker_iter = self._get_dataset_objects(self._get_client(worker_id, num_workers), chunks=chunks)
227 | else:
228 | worker_iter = self._get_dataset_objects(self._get_client(worker_id, num_workers))
229 |
230 | return map(self._get_transformed_object, worker_iter)
231 |
232 | def shuffle(self, generator=None):
233 | if generator is None:
234 | seed = int(torch.empty((), dtype=torch.int64).random_().item())
235 | generator = torch.Generator()
236 | generator.manual_seed(seed)
237 | log.debug("OssIterableDataset shuffle seed: %d", seed)
238 | chunks = []
239 | index = 0
240 | while index < self._dataset_size:
241 | chunk_size = min(max(1, int(random.gauss(self._chunk_size, 10))), self._dataset_size - index)
242 | chunks.append((index, chunk_size))
243 | index += chunk_size
244 | random_sampler = torch.utils.data.SubsetRandomSampler(chunks, generator=generator)
245 | self._chunks = list(random_sampler)
246 | log.info("OssIterableDataset shuffle chunk indices, dataset size: %d, chunk num: %d",
247 | self._dataset_size, len(self._chunks))
248 |
--------------------------------------------------------------------------------
/oss-torch-connector/osstorchconnector/oss_map_dataset.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | from typing import List, Any, Callable, Iterable, Union, Tuple
3 | import io
4 | import torch.utils.data
5 | import uuid
6 | import logging
7 | import time
8 | import os
9 | import errno
10 |
11 | from ._oss_client import OssClient, DataObject
12 | from ._oss_bucket_iterable import OssBucketIterable, identity, parse_oss_uri
13 | from ._oss_tar_iterable import OssTarIterable
14 |
15 | log = logging.getLogger(__name__)
16 |
class OssMapDataset(torch.utils.data.Dataset):
    """A Map-Style dataset created from OSS objects.

    To create an instance of OssMapDataset, you need to use
    `from_prefix`, `from_objects`, `from_manifest_file` or `from_tar` methods.
    """

    def __init__(
        self,
        endpoint: str,
        cred_path: str,
        config_path: str,
        get_dataset_objects: Callable[[OssClient], Iterable[DataObject]],
        transform: Callable[[DataObject], Any] = identity,
        tar_uri: str = None,
        tar_index_uri: str = None,
        cred_provider: Any = None,
    ):
        """Initialize the dataset and eagerly list its objects.

        Args:
            endpoint: OSS endpoint; must be non-empty.
            cred_path: Credential file path ("" when using cred_provider).
            config_path: Connector configuration file path (may be "").
            get_dataset_objects: Callable producing the object iterable given a client.
            transform: Callable applied to each DataObject before returning it.
            tar_uri: OSS URI of a tar archive (tar-backed mode).
            tar_index_uri: OSS URI of the tar index (tar-backed mode).
            cred_provider: OSS credential provider object.

        Raises:
            ValueError: If endpoint is empty, or tar and index live in different buckets.
        """
        self._uuid = uuid.uuid4()
        self._endpoint = endpoint
        log.info("OssMapDataset init, uuid: %s, endpoint: %s", self._uuid, self._endpoint)
        init_time = time.time()
        if not endpoint:
            raise ValueError("endpoint must be non-empty")
        # Normalize optional paths to "" for the native client.
        if not cred_path:
            self._cred_path = ""
        else:
            self._cred_path = cred_path
        self._cred_provider = cred_provider
        if not config_path:
            self._config_path = ""
        else:
            self._config_path = config_path
        self._get_dataset_objects = get_dataset_objects
        self._transform = transform
        self._client = OssClient(self._endpoint, self._cred_path, self._config_path, self._uuid, cred_provider=self._cred_provider)
        # Remember the creating pid so forked DataLoader workers can be detected.
        self._client_pid = os.getpid()
        self._from_tar = False
        if tar_uri and tar_index_uri:
            tar_bucket, tar_key = parse_oss_uri(tar_uri)
            index_bucket, index_key = parse_oss_uri(tar_index_uri)
            if tar_bucket != index_bucket:
                raise ValueError("tar_uri and tar_index_uri must be in the same bucket")
            self._from_tar = True
            self._tar_bucket = tar_bucket
            self._tar_key = tar_key
            self._tar_index_key = index_key
            self._bucket_objects = self._get_dataset_objects(self._client)
        else:
            # Materialize the listing now so __len__/__getitem__ are O(1).
            self._bucket_objects = list(self._get_dataset_objects(self._client))
        log.info("OssMapDataset init done, uuid: %s, time cost: %.2f s", self._uuid, time.time() - init_time)


    @property
    def _dataset_bucket_objects(self) -> List[DataObject]:
        # Lazily (re)build the object listing if it was not materialized.
        if self._bucket_objects is None:
            self._bucket_objects = list(self._get_dataset_objects(self._get_client()))
            log.info("OssMapDataset get bucket objects")
        return self._bucket_objects

    @classmethod
    def from_objects(
        cls,
        object_uris: Union[str, Iterable[str]],
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssMapDataset using the OSS URI(s) provided.

        Args:
            object_uris(str | Iterable[str]): OSS URI of the object(s) desired.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssMapDataset: A Map-Style dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_objects")
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_uris, object_uris, preload=False),
            transform=transform, cred_provider=cred_provider
        )

    @classmethod
    def from_prefix(
        cls,
        oss_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssMapDataset using the OSS URI provided.

        Args:
            oss_uri(str): An OSS URI (prefix) of the object(s) desired. Objects matching the prefix will be included in the returned dataset.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssMapDataset: A Map-Style dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_prefix")
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_prefix, oss_uri, preload=False),
            transform=transform, cred_provider=cred_provider
        )

    @classmethod
    def from_manifest_file(
        cls,
        manifest_file_path: str,
        manifest_parser: Callable[[io.IOBase], Iterable[Tuple[str, str]]],
        oss_base_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssMapDataset using manifest file provided.

        Args:
            manifest_file_path(str): OSS URI or local path of manifest file.
            manifest_parser: A callable which takes an io.IOBase object and returns an iterable of (object_uri, label).
            oss_base_uri(str): The base URI of the OSS object in manifest file.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssMapDataset: A Map-Style dataset created from OSS objects.
        """
        log.info(f"Building {cls.__name__} from_manifest_file")
        return cls(
            endpoint, cred_path, config_path, partial(OssBucketIterable.from_manifest_file, manifest_file_path, manifest_parser, oss_base_uri, preload=False),
            transform=transform, cred_provider=cred_provider
        )

    @classmethod
    def from_tar(
        cls,
        tar_uri: str,
        tar_index_uri: str,
        endpoint: str,
        *,
        cred_path: str = "",
        cred_provider: Any = None,
        config_path: str = "",
        transform: Callable[[DataObject], Any] = identity,
    ):
        """Returns an instance of OssMapDataset using tar file provided.

        Args:
            tar_uri(str): OSS URI of tar archive.
            tar_index_uri(str): OSS URI of tar index file corresponding to tar archive.
            endpoint(str): Endpoint of the OSS bucket where the objects are stored.
            cred_path(str): Credential info of the OSS bucket where the objects are stored.
            config_path(str): Configuration file path of the OSS connector.
            transform: Optional callable which is used to transform a DataObject into the desired type.
            cred_provider: OSS credential provider.

        Returns:
            OssMapDataset: A Map-Style dataset created from tar file.
        """
        log.info(f"Building {cls.__name__} from_tar")
        return cls(
            endpoint, cred_path, config_path, partial(OssTarIterable.from_tar, tar_uri, tar_index_uri, preload=False),
            transform=transform, cred_provider=cred_provider, tar_uri=tar_uri, tar_index_uri=tar_index_uri
        )

    def _get_client(self):
        """Return the OSS client, recreating it if needed and refreshing worker ids after fork."""
        if self._client is None:
            # BUGFIX: previously the client was recreated WITHOUT the credential
            # provider, diverging from __init__ and from OssIterableDataset;
            # credentials supplied via cred_provider were silently dropped.
            self._client = OssClient(self._endpoint, self._cred_path, self._config_path, self._uuid,
                                     cred_provider=self._cred_provider)
            log.info("OssMapDataset new client")
        if self._client_pid != os.getpid():
            # We are in a forked DataLoader worker: retag the inherited client.
            worker_info = torch.utils.data.get_worker_info()
            if worker_info is not None:
                # reset client id
                self._client._id = worker_info.id
                self._client._total = worker_info.num_workers
            self._client_pid = os.getpid()
        return self._client

    def _get_transformed_object_safe(self, object: DataObject) -> Any:
        """Transform an object, mapping ENOENT to transform(None) and raising on other errors."""
        eno = object.err()
        if eno != 0:
            errstr = "failed to get next object, errno=%d(%s), msg=%s" % (eno, os.strerror(eno), object.error_msg())
            log.error("OssMapDataset get item %s failed: %s", object.key, errstr)
            if eno == errno.ENOENT:
                return self._transform(None)
            else:
                raise RuntimeError(errstr)
        return self._transform(object)

    def __getitem__(self, i: int) -> Any:
        """Fetch and transform the i-th object (tar mode indexes into the archive)."""
        if not self._from_tar:
            object = self._dataset_bucket_objects[i]
            log.debug("OssMapDataset get item [%d], key: %s, size: %d, label: %s", i, object.key, object.size, object.label)
            bucket, key = parse_oss_uri(object.key)
            if object.size <= 0:
                new_object = self._get_client().get_object(bucket, key, 0, label=object.label, type=2)  # mem
            else:
                new_object = self._get_client().get_object(bucket, key, object.size, label=object.label, type=0)  # basic
        else:
            new_object = self._get_client().get_object(bucket=self._tar_bucket, key=self._tar_key, size=i,
                                                       label=self._tar_index_key, type=3)  # tar
        return self._get_transformed_object_safe(new_object)

    def __getitems__(self, indices: List[int]) -> List[Any]:
        """Batch fetch; contiguous tar indices are requested as a single range."""
        log.debug("OssMapDataset get items %s", indices)
        if not self._from_tar:
            objects = [self._dataset_bucket_objects[i] for i in indices]
            objects_iter = self._get_client().list_objects_from_uris(objects, prefetch=True, include_errors=True)
            # should return list, default collate needs batch be subscriptable
            return [self._get_transformed_object_safe(object) for object in objects_iter]
        else:
            if self.is_continuous(indices):
                log.debug("OssMapDataset get items, start: %d, length: %d", indices[0], len(indices))
                objects_iter = self._get_client().list_objects_from_tar(self._tar_bucket, self._tar_key, self._tar_index_key,
                                                                       [indices[0]], [len(indices)], prefetch=True, include_errors=True)
                return [self._get_transformed_object_safe(object) for object in objects_iter]
            else:
                objects_iter = self._get_client().list_objects_from_tar(self._tar_bucket, self._tar_key, self._tar_index_key,
                                                                       indices, [], prefetch=True, include_errors=True)
                return [self._get_transformed_object_safe(object) for object in objects_iter]

    def __len__(self):
        """Number of objects in the dataset listing."""
        size = len(self._dataset_bucket_objects)
        log.info("OssMapDataset get len (%d)", size)
        return size

    def is_continuous(self, indices):
        """Return True if indices form a strictly consecutive ascending run (step of exactly 1)."""
        return all(b - a == 1 for a, b in zip(indices, indices[1:]))
269 |
--------------------------------------------------------------------------------
/oss-torch-connector/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "osstorchconnector"
7 | version = "1.0.0rc1"
8 | description = "OSS connector for AI/ML"
9 | requires-python = ">=3.8,<3.13"
10 | readme = "README.md"
11 | dependencies = [
12 | "torch >= 2.0",
13 | ]
14 | classifiers = [
15 | "Development Status :: 4 - Beta",
16 | "Intended Audience :: Developers",
17 | "Topic :: Utilities",
18 | "License :: OSI Approved :: MIT License",
19 | "Operating System :: POSIX :: Linux",
20 |
21 | "Programming Language :: Python :: 3",
22 | "Programming Language :: Python :: 3.8",
23 | "Programming Language :: Python :: 3.9",
24 | "Programming Language :: Python :: 3.10",
25 | "Programming Language :: Python :: 3.11",
26 | "Programming Language :: Python :: 3.12",
27 | ]
28 |
29 | [tool.setuptools.packages.find]
30 | where = ["."]
31 | include = ["osstorchconnector", "osstorchconnector._oss_connector"]
32 |
33 | [tool.setuptools.package-data]
34 | osstorchconnector = ["_oss_connector/*.so"]
35 |
--------------------------------------------------------------------------------
/oss-torch-connector/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension
2 | from setuptools.command.build_ext import build_ext
3 | import os
4 | import subprocess
5 | import shutil
6 |
7 |
class BuildExtension(Extension):
    """Extension stub that records where the prebuilt library sources live.

    No compilation happens here; LibraryBuild consumes `source_dir` and the
    --library-path option to copy a prebuilt shared object into the build tree.
    """

    def __init__(self, name, source_dir=''):
        super().__init__(name, sources=[source_dir])
        self.source_dir = os.path.abspath(source_dir)
12 |
class LibraryBuild(build_ext):
    """build_ext replacement that installs a prebuilt oss_connector library.

    Instead of compiling sources, it copies the shared library given via
    the ``--library-path`` option into osstorchconnector/_oss_connector
    inside the build tree.
    """

    user_options = build_ext.user_options + [
        ('library-path=', None, 'oss_connector library path'),
    ]

    def initialize_options(self):
        super().initialize_options()
        self.library_path = None

    def run(self):
        # --library-path is mandatory: fail fast with a clear message.
        if not self.library_path:
            raise RuntimeError("library path is not specified by '--library-path'")
        self.library_path = os.path.abspath(self.library_path)
        if os.path.exists(self.library_path):
            print('library path:', self.library_path)
        else:
            raise RuntimeError("invalid library path: " + self.library_path)
        for ext in self.extensions:
            self.build_extension(ext)

    # NOTE(review): this shadows distutils' Command.run_command(command),
    # which takes a single argument — confirm nothing in the build machinery
    # invokes the one-argument form on this command.
    def run_command(self, command, cwd):
        """Run a subprocess in `cwd`, surfacing stdout/stderr when it fails."""
        try:
            subprocess.run(command, capture_output=True, text=True, check=True, cwd=cwd)
        except subprocess.CalledProcessError as e:
            print(f"Command '{' '.join(command)}' failed with exit code {e.returncode}")
            print(f"Stdout: {e.stdout}")
            print(f"Stderr: {e.stderr}")
            raise RuntimeError("Subprocess execution failed") from e

    def build_extension(self, ext):
        """Copy the prebuilt .so named by --library-path into the build tree."""
        print('name:', ext.name)
        print('source path:', ext.source_dir)
        print('current dir:', os.getcwd())

        # copy .so
        library_file_name = os.path.basename(self.library_path)
        dest_so_path = os.path.abspath(
            os.path.join(self.build_lib, 'osstorchconnector', '_oss_connector', library_file_name))
        # BUGFIX: ensure the destination directory exists — build_py may not
        # have created it yet, and shutil.copy does not create directories.
        os.makedirs(os.path.dirname(dest_so_path), exist_ok=True)
        print('copy %s to %s' % (self.library_path, dest_so_path))
        shutil.copy(self.library_path, dest_so_path)
51 |
52 |
# Wire the custom build step in: the single extension is a stub whose real
# artifact is supplied at build time via `build_ext --library-path`.
setup(
    ext_modules=[BuildExtension('oss_connector', '.')],
    cmdclass={'build_ext': LibraryBuild},
)
57 |
--------------------------------------------------------------------------------
/oss-torch-connector/tools/generate_tar_archive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | Generate tar archive and its index
5 |
6 | This script is designed to generate a tar archive and its corresponding index.
7 | It can also generate an index from an existing tar archive.
8 | Both source and target path can be specified by OSS URI or local path.
9 |
Usage:
    1. Generate tar archive and its index from source:
       python generate_tar_archive.py --endpoint <endpoint> --cred-path <cred-path> --config-path <config-path> \
           --tar-path <tar-path> --index-path <index-path> --source-path <source-path>
    2. Generate tar index from existing tar archive:
       python generate_tar_archive.py --endpoint <endpoint> --cred-path <cred-path> --config-path <config-path> \
           --tar-path <tar-path> --index-path <index-path> --index-only
17 | """
18 |
19 | from osstorchconnector import generate_tar_archive
20 | import argparse
21 |
# Command-line interface. Every path option accepts either an OSS URI or a
# local filesystem path.
parser = argparse.ArgumentParser(description='Generate tar archive and its index')
parser.add_argument('-ep', '--endpoint', type=str, help='Endpoint of the OSS bucket where the objects are stored.')
parser.add_argument('--cred-path', type=str, help='Credential info of the OSS bucket where the objects are stored.')
parser.add_argument('--config-path', type=str, help='Configuration file path of the OSS connector.')
parser.add_argument('--tar-path', type=str, help='Path to the tar archive. (OSS URI or local path)')
parser.add_argument('--index-path', type=str, help='Path to the tar index. (OSS URI or local path)')
parser.add_argument('--source-path', type=str, help='Path to the source directory. (OSS URI or local path)')
# NOTE: options are optional here; missing values are passed through as None.
parser.add_argument('--index-only', action='store_true', help='''If True, generate tar index from tar archive specified by 'tar_path',
                    otherwise (by default) generate tar archive and its index from source directory specified by 'source_path'.''')
31 |
32 |
def main():
    """CLI entry point: parse arguments and run generate_tar_archive."""
    opts = parser.parse_args()
    generate_tar_archive(
        opts.endpoint,
        opts.cred_path,
        opts.config_path,
        opts.tar_path,
        opts.index_path,
        opts.source_path,
        opts.index_only,
    )


if __name__ == "__main__":
    main()
40 |
--------------------------------------------------------------------------------