├── .pycodestylerc ├── docs ├── images │ ├── dingtalk.png │ ├── architecture.png │ ├── performance.png │ └── wide-and-deep.png ├── requirements.txt ├── index.md ├── architecture.md ├── tutorial │ └── ranking │ │ ├── __init__.py │ │ ├── taobao │ │ └── data │ │ │ └── stats.py │ │ ├── optimization.py │ │ └── criteo │ │ └── data │ │ └── prep.py ├── introduction.md └── conf.py ├── .clang-format ├── .isort.cfg ├── .github ├── helm │ ├── Chart.yaml │ ├── values.yaml │ ├── .helmignore │ ├── upload │ └── templates │ │ └── tfjob.yaml ├── ISSUE_TEMPLATE │ ├── 30-other.md │ ├── 00-enhancement.md │ ├── 20-documentation.md │ └── 10-bug.md └── workflows │ ├── cpu.yaml │ ├── gpu.yaml │ ├── cpu-nightly.yaml │ └── gpu-nightly.yaml ├── .gitignore ├── .readthedocs.yaml ├── pyproject.toml ├── ADOPTERS.md ├── NOTICE ├── hybridbackend ├── common │ ├── __init__.py │ ├── profiler.h │ ├── macros.h │ ├── logging.h │ ├── Makefile │ ├── logging.cc │ ├── profiler.cc │ ├── env.h │ ├── atomic.cu.h │ ├── murmur3.cu.h │ └── arrow.h ├── tensorflow │ ├── benchmarks │ │ ├── __init__.py │ │ └── data_benchmark_csv.py │ ├── distribute │ │ ├── nccl │ │ │ ├── __init__.py │ │ │ └── nccl_get_id.cc │ │ ├── partition │ │ │ ├── __init__.py │ │ │ ├── modulo_functors.h │ │ │ └── dual_modulo_functors.h │ │ ├── ops.py │ │ ├── __init__.py │ │ └── tests │ │ │ └── broadcast_test.py │ ├── data │ │ ├── prefetch │ │ │ └── __init__.py │ │ ├── rebatch │ │ │ ├── __init__.py │ │ │ └── dataset.py │ │ ├── tabular │ │ │ ├── __init__.py │ │ │ ├── orc.h │ │ │ ├── parquet.h │ │ │ ├── table.cc │ │ │ └── dataset.py │ │ ├── sync │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── utils.py │ │ ├── deduplicate │ │ │ ├── __init__.py │ │ │ └── dataset.py │ │ ├── __init__.py │ │ └── tests │ │ │ ├── rebatch_dataset_seq_test.py │ │ │ ├── parquet_dataset_reshape_test.py │ │ │ ├── parquet_dataset_ragged_nested_test.py │ │ │ └── sync_replicas_dataset_test.py │ ├── framework │ │ ├── __init__.py │ │ ├── version.py │ │ ├── device.py │ │ └── 
config.py │ ├── common │ │ ├── __init__.py │ │ ├── pywrap.py │ │ ├── eigen.h │ │ ├── dataset.h │ │ ├── slice_sum.h │ │ ├── fusion_helper.cu.h │ │ ├── cast.h │ │ ├── stream.h │ │ └── fusion_helper.cu.cc │ ├── keras │ │ ├── __init__.py │ │ └── layers │ │ │ └── __init__.py │ ├── metrics │ │ ├── __init__.py │ │ └── gauc.py │ ├── pipeline │ │ └── __init__.py │ ├── embedding │ │ ├── __init__.py │ │ ├── lookup_functors.h │ │ ├── deeprecev.py │ │ └── tests │ │ │ └── deeprecev_test.py │ ├── estimator │ │ └── __init__.py │ ├── wraps.py │ ├── graph │ │ ├── common │ │ │ ├── linearization.h │ │ │ ├── pruning.h │ │ │ ├── replacing.h │ │ │ ├── relocation.h │ │ │ ├── helper.h │ │ │ ├── rewriting.h │ │ │ ├── packing.h │ │ │ └── linearization.cc │ │ ├── optimize_lookup.cc │ │ ├── op_optimization.h │ │ ├── optimize_memory.cc │ │ ├── optimize_floormod_shuffle.cc │ │ └── optimize_partition_by_modulo.cc │ ├── ops │ │ ├── __init__.py │ │ └── transfer │ │ │ └── functors.h │ ├── __init__.py │ ├── training │ │ ├── __init__.py │ │ ├── server.py │ │ └── variables.py │ └── Makefile ├── torch │ └── __init__.py └── __init__.py ├── CONTRIBUTING.md ├── CITATION.cff └── ROADMAP.md /.pycodestylerc: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | ignore = E501,E722,W503 3 | -------------------------------------------------------------------------------- /docs/images/dingtalk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/dingtalk.png -------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/performance.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/performance.png -------------------------------------------------------------------------------- /docs/images/wide-and-deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/wide-and-deep.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | myst-parser 4 | docutils==0.16 5 | hybridbackend-tf115-cpu 6 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # clang-format -i --style=google 2 | 3 | BasedOnStyle: Google 4 | DerivePointerAlignment: false 5 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Contents 2 | 3 | ```{toctree} 4 | :maxdepth: 2 5 | 6 | introduction 7 | architecture 8 | data 9 | distributed 10 | ``` 11 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=google 3 | indent=2 4 | src_paths=hybridbackend 5 | extra_standard_library=six 6 | known_third_party=tensorflow,torch 7 | known_first_party=hybridbackend 8 | -------------------------------------------------------------------------------- /.github/helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: hybridbackend-developer 3 | description: A Helm chart for HybridBackend developers 4 | 
type: application 5 | version: 0.1.0 6 | appVersion: "0.1.0" 7 | -------------------------------------------------------------------------------- /.github/helm/values.yaml: -------------------------------------------------------------------------------- 1 | image: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04 2 | port: 20000 3 | gpus: 2 4 | caps: ["SYS_ADMIN", "SYS_PTRACE"] 5 | build: "" 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/30-other.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Other Issues 3 | about: Use this template for any other non-support related issues 4 | 5 | --- 6 | This template is for miscellaneous issues not covered by the other issue categories. 7 | 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.so.* 4 | *.o 5 | *.d 6 | *.log 7 | build/lib.* 8 | build/bdist.* 9 | build/temp.* 10 | build/reports/ 11 | build/doc/ 12 | build/wheel/ 13 | build/release/ 14 | *.egg-info/ 15 | __pycache__/ 16 | .pylint.d/ 17 | cache/ 18 | outputs/ 19 | *_test.py.xml 20 | *_lib.c 21 | .config.mk 22 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | HybridBackend follows a shared-nothing architecture: A HybridBackend job 4 | consists of single-GPU workers. Workers share nothing and coordinate by 5 | collective communication. Each worker reads the environment variable `TF_CONFIG` 6 | for cluster information. 
7 | 8 | ![architecture](images/architecture.png) 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/00-enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New feature or request 3 | about: Use this template for raising a feature request. 4 | 5 | --- 6 | # User Story 7 | 8 | As a _, I want to _, so that _. 9 | 10 | # Detailed requirements 11 | 12 | - It should be _ 13 | 14 | # API Compatibility 15 | 16 | # Willing to contribute 17 | 18 | Yes 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/20-documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Needs help for installation or documentation 3 | about: Use this template for an installation/documentation issue. 4 | 5 | --- 6 | 7 | # Summary 8 | 9 | # Installation environment 10 | - GPU model and memory: 11 | - OS Platform: 12 | - Docker version: 13 | - GCC/CUDA/cuDNN version: 14 | - Python/conda version: 15 | - TensorFlow/PyTorch version: 16 | 17 | # Willing to contribute 18 | 19 | Yes 20 | 21 | -------------------------------------------------------------------------------- /.github/helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally declare the Python requirements required to build your docs 13 | python: 14 | version: "3.6" 15 | install: 16 | - requirements: docs/requirements.txt 17 | 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | select = ["F", "E", "W"] 3 | ignore = ["E501", "E722"] 4 | ignore-init-module-imports = true 5 | line-length = 80 6 | 7 | [tool.ruff.per-file-ignores] 8 | "__init__.py" = ["F401", "F403"] 9 | 10 | [tool.ruff.isort] 11 | force-single-line = true 12 | force-sort-within-sections = true 13 | single-line-exclusions = ["typing"] 14 | order-by-type = false 15 | known-third-party = ["tensorflow", "torch"] 16 | known-first-party = ["hybridbackend"] 17 | extra-standard-library = ["six"] 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/10-bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Something is not working 3 | about: Use this template for reporting a bug or a performance issue. 
4 | 5 | --- 6 | # Current behavior 7 | 8 | # Expected behavior 9 | 10 | # System information 11 | - GPU model and memory: 12 | - OS Platform: 13 | - Docker version: 14 | - GCC/CUDA/cuDNN version: 15 | - Python/conda version: 16 | - TensorFlow/PyTorch version: 17 | 18 | # Code to reproduce 19 | 20 | ```python 21 | ``` 22 | 23 | # Willing to contribute 24 | 25 | Yes 26 | 27 | -------------------------------------------------------------------------------- /ADOPTERS.md: -------------------------------------------------------------------------------- 1 | # List of Adopters 2 | 3 | The following are the adopters of HybridBackend. If you are using HybridBackend 4 | in your organization, please feel free to add the organization name into the 5 | following list by a pull request. 6 | 7 | | Organization | Phase | 8 | | ------------ | ----- | 9 | | | | 10 | 11 | ## Appendix 12 | 13 | ### Phases of Adoption 14 | 15 | | Phase Name | Description | 16 | | ---------- | ----------- | 17 | | **Evaluation** | Interested in HybridBackend | 18 | | **Testing** | Take HybridBackend as one of the candidates | 19 | | **Staging** | Decide to use HybridBackend | 20 | | **Production** | Already put HybridBackend into production | 21 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | HybridBackend 2 | ------------------------------------------------------------------------------- 3 | 4 | Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 5 | 6 | This product includes software developed by Alibaba Group Holding Limited. 7 | 8 | This product includes software developed by 9 | The Apache Software Foundation (http://www.apache.org/). 
10 | 11 | This product includes software from Apache Arrow, which includes the following 12 | in its NOTICE file: 13 | 14 | Apache Arrow 15 | Copyright 2016-2019 The Apache Software Foundation 16 | 17 | This product includes software developed at 18 | The Apache Software Foundation (http://www.apache.org/). 19 | 20 | -------------------------------------------------------------------------------- /hybridbackend/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Common utilities. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /docs/tutorial/ranking/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Sample ranking examples. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Benchmarks for hybridbackend. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/distribute/nccl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''NCCL related classes and functions. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/prefetch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Prefetching related classes and functions. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/rebatch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Rebatching related classes and functions. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tabular/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Tabular data related classes and functions. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/framework/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Framework related functions in hybridbackend. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/sync/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''SyncReplicasDataset related classes and functions. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''HybridBackend for PyTorch. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | # TODO Add pytorch support. 24 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/distribute/partition/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Partitioning related classes and functions. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/deduplicate/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Deduplication related classes and functions. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Common utilities. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from hybridbackend.tensorflow.common.pywrap import oplib 24 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/keras/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''HybridBackend Keras related modules. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from . import layers 24 | from .model import Model 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We appreciate all contributions to improve HybridBackend. You can create an 4 | [issue](https://github.com/alibaba/HybridBackend/issues) or send a 5 | [pull request](https://github.com/alibaba/HybridBackend/pulls). 6 | 7 | **Working on your first Pull Request?** You can learn how from this *free* 8 | series [How to Contribute to an Open Source Project on GitHub](https://kcd.im/pull-request) 9 | 10 | ## Code style 11 | 12 | Before any commits, please use below tools to format and check code style: 13 | 14 | ```bash 15 | build/run build/format 16 | build/run build/lint 17 | ``` 18 | 19 | Commit message style should follow below format: 20 | 21 | ```text 22 | [Module] Do something great. 23 | ``` 24 | 25 | `Module` could be `CI`, `IO` or other well-known abbreviations. 
26 | 27 | ## Building and testing 28 | 29 | Test your commit using default developer docker: 30 | 31 | ```bash 32 | build/run make -j8 33 | build/run make test 34 | ``` 35 | 36 | Also, CI builds would be triggered if a commit is pushed. 37 | -------------------------------------------------------------------------------- /hybridbackend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''HybridBackend entry file. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | __version__ = '1.0.0' 24 | __author__ = 'Alibaba Group Holding Limited' 25 | __copyright__ = '2021 Alibaba Group Holding Limited' 26 | -------------------------------------------------------------------------------- /.github/helm/upload: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
set -eo pipefail

# Usage: upload PODNAME
#
# Packs the current directory into a gzip-compressed tarball, waits until the
# pod PODNAME is ready, copies the tarball into the pod and unpacks it under
# /workspace.

# Fail early with a clear message instead of invoking kubectl with an empty
# pod name.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 PODNAME" >&2
  exit 1
fi

PODNAME=$1

mkdir -p /tmp
tar -czf /tmp/archive.tar.gz .
# Expansions are quoted so the pod name is always passed as a single word.
kubectl wait --for=condition=ready pod "${PODNAME}"
kubectl cp /tmp/archive.tar.gz "${PODNAME}:/workspace/archive.tar.gz"
kubectl exec -it "${PODNAME}" -- tar -xzf /workspace/archive.tar.gz -C /workspace
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from hybridbackend.tensorflow.metrics.accuracy import accuracy 24 | from hybridbackend.tensorflow.metrics.auc import auc 25 | from hybridbackend.tensorflow.metrics.gauc import gauc 26 | -------------------------------------------------------------------------------- /docs/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## Recommendation models 4 | 5 | Model-based recommendation systems play key roles in the internet industry, from 6 | social networks to e-commerce platforms. Recommendation models have been getting 7 | deeper in recent years, which makes training on GPUs a good choice. 8 | 9 | However, industrial-scale recommendation models are not only deeper, but also 10 | much wider. Training wide-and-deep recommendation models on GPUs with real-world 11 | datasets still suffers from low utilization and high cost. 12 | 13 | ![wide-and-deep](images/wide-and-deep.png) 14 | 15 | ## HybridBackend 16 | 17 | HybridBackend is a high-performance framework for training wide-and-deep 18 | recommendation models on heterogeneous clusters. 19 | 20 | HybridBackend provides the following features: 21 | 22 | - Memory-efficient loading of categorical data 23 | 24 | - GPU-efficient orchestration of embedding layers 25 | 26 | - Communication-efficient training and evaluation at scale 27 | 28 | - Easy to use with existing AI workflows 29 | 30 | HybridBackend speeds up training of wide-and-deep recommendation models 31 | dramatically: 32 | 33 | ![performance](images/performance.png) 34 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 
# Register the pipeline option on the shared context options when this package
# is imported. The call passes the option name, the value False and the
# environment variable name HB_PIPELINE_DENSE_GA_ENABLED.
# NOTE(review): presumably False is the default value and the environment
# variable overrides it -- confirm against the `register` implementation.
_ = (
  _ctx.get().options
  .register(
    'pipeline_dense_ga_enabled', False, env='HB_PIPELINE_DENSE_GA_ENABLED'))
# Load the compiled op library located via the relative data-file path
# '../libhybridbackend_tensorflow.so'. `oplib` is set to None when loading
# raises ImportError.
# NOTE(review): only ImportError is handled; confirm which exception type
# `load_op_library` raises for a missing or unloadable shared object, because
# any other exception propagates out of this module's import.
try:
  oplib = _load(
    _loader.get_path_to_datafile('../libhybridbackend_tensorflow.so'))
except ImportError:
  oplib = None
# pylint: disable=ungrouped-imports
# Best-effort import of the embedding backend modules: a failure to import
# them is ignored so that importing this package still succeeds. Only
# `Exception` is swallowed, so KeyboardInterrupt and SystemExit still
# propagate.
# NOTE(review): both imports share one `try`, so a failure of `.deeprecev`
# also skips `.variables` -- confirm this coupling is intended.
try:
  from .deeprecev import \
    ShardedEmbeddingWeightsRewritingForDeepRecEV as _patch_ev
  from .variables import \
    ShardedEmbeddingWeightsRewritingForVariables as _patch_var
except Exception:  # pylint: disable=broad-except
  pass
# pylint: enable=ungrouped-imports
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | try: 24 | from tensorflow_estimator.python.estimator.model_fn import EstimatorSpec 25 | except ImportError: 26 | from tensorflow.python.estimator.model_fn import EstimatorSpec 27 | 28 | from hybridbackend.tensorflow.estimator.estimator import Estimator 29 | from hybridbackend.tensorflow.estimator.estimator import RunConfig 30 | from hybridbackend.tensorflow.estimator.estimator import train_and_evaluate 31 | -------------------------------------------------------------------------------- /hybridbackend/common/profiler.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
// A profiling range described by a domain name and a message.
//
// When HYBRIDBACKEND_NVTX is enabled the object stores an NVTX domain handle
// and an NVTX range id; otherwise it has no data members.
// NOTE(review): presumably the range is opened in the constructor and closed
// in the destructor -- the implementation is not visible in this header.
class ProfilerRange {
 public:
  // Factory functions taking only a message and returning a pointer to a
  // ProfilerRange.
  // NOTE(review): the domain they use and the ownership of the returned
  // pointer are not visible here -- confirm against the implementation.
  static ProfilerRange* forSynch(const std::string& message);
  static ProfilerRange* forLookup(const std::string& message);

  ProfilerRange(const std::string& domain, const std::string& message);
  ~ProfilerRange();

 private:
#if HYBRIDBACKEND_NVTX
  nvtxDomainHandle_t domain_;
  nvtxRangeId_t range_;
#endif
};
class CollectiveOps(object):  # pylint: disable=useless-object-inheritance
  r'''Collective operations.

  Integer constants identifying each collective operation.
  '''
  SUM = 0
  PROD = 1
  MAX = 2
  MIN = 3
  AVG = 4


class Topology(object):  # pylint: disable=useless-object-inheritance
  r'''Communication topology.

  Integer constants identifying which set of GPUs takes part in a
  communication.
  '''
  ALL = 0  # Communication across all GPUs
  INTRA_NODE = 1  # Communication across all GPUs in the current node
  INTER_NODE = 2  # Communication across all GPUs with the same rank on every node
def wraps(cls):
  r'''Adapt a class so that it can be used with HybridBackend.

  Optimizer subclasses are passed through `wraps_optimizer`, Estimator
  subclasses through `wraps_estimator`; any other class is returned as is.
  '''
  # Checked in order: the first matching base class decides the wrapper.
  dispatch = (
    (optimizer.Optimizer, wraps_optimizer),
    (estimator.Estimator, wraps_estimator))
  for base_class, wrapper in dispatch:
    if issubclass(cls, base_class):
      return wrapper(cls)
  return cls
// CHECK_EIGEN_ALIGN(...) tests whether the given value is a multiple of
// EIGEN_MAX_ALIGN_BYTES. When EIGEN_MAX_ALIGN_BYTES is 0 the check always
// evaluates to true.
// NOTE(review): the target type of the reinterpret_cast below is missing in
// this copy of the file -- restore it from the original header.
#if EIGEN_MAX_ALIGN_BYTES == 0
#define CHECK_EIGEN_ALIGN(...) (true)
#else
#define CHECK_EIGEN_ALIGN(...) \
  (0 == reinterpret_cast(__VA_ARGS__) % EIGEN_MAX_ALIGN_BYTES)
#endif
// HB_HAS_BUILTIN(x) evaluates to __has_builtin(x) when the compiler provides
// __has_builtin, and to 0 otherwise.
#ifdef __has_builtin
#define HB_HAS_BUILTIN(x) __has_builtin(x)
#else
#define HB_HAS_BUILTIN(x) 0
#endif

// Branch prediction hints. Outside of NVCC, and when __builtin_expect is
// available, the condition is normalized with `!!(x)` before it is handed to
// __builtin_expect, so pointers and other contextually convertible values are
// accepted by both macros. Otherwise the macros expand to the condition
// itself.
#if (!defined(__NVCC__)) && \
    (HB_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3))
#define HB_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
#define HB_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
#else
#define HB_PREDICT_FALSE(x) (x)
#define HB_PREDICT_TRUE(x) (x)
#endif

// Deletes the copy constructor and copy assignment operator of TypeName.
// Intended to be placed inside the class definition.
#define HB_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete;         \
  void operator=(const TypeName&) = delete
// True when messages of level `lvl` are enabled, i.e. `lvl` is not greater
// than the value returned by ::hybridbackend::MinLogLevel().
#define HB_LOG_IS_ON(lvl) ((lvl) <= ::hybridbackend::MinLogLevel())

// Creates a temporary LogMessage for the current file and line when the level
// is enabled; the caller streams into it with `<<`.
// NOTE(review): the macro expands to an `if` without an `else`, so using it as
// the unbraced body of another `if ... else` binds that `else` to this `if`.
#define HB_LOG(lvl)                        \
  if (HB_PREDICT_FALSE(HB_LOG_IS_ON(lvl))) \
  ::hybridbackend::LogMessage(__FILE__, __LINE__)

namespace hybridbackend {

// Returns a mutable reference to the log level compared by HB_LOG_IS_ON.
int& MinLogLevel();

// Stream-style log message that records the source file name and line.
// NOTE(review): presumably the collected text is emitted in the destructor --
// the implementation is not visible in this header. The template argument of
// the base class is missing in this copy of the file.
class LogMessage : public std::basic_ostringstream {
 public:
  LogMessage(const char* fname, int line);
  ~LogMessage();

 private:
  const char* fname_;
  int line_;
};

}  // namespace hybridbackend
# Register communication-related options on the shared context options when
# this package is imported. Each `register` call passes an option name and a
# value; 'comm_default' additionally names the environment variable
# HB_COMM_DEFAULT.
# NOTE(review): presumably the second argument is the default value and `env`
# names an overriding environment variable -- confirm against the `register`
# implementation.
_ = (
  _ctx.get().options
  .register('comm_default', 'NCCL', env='HB_COMM_DEFAULT')
  .register('comm_pool_name', 'default')
  .register('comm_pool_capacity', 1)
  .register('comm_wire_dtype', None)
  .register('comm_gradient_wire_dtype', None))
// Graph transformation configured with an op type and an output index and
// applied to a graph through In().
// NOTE(review): the exact rewrite performed is defined in linearization.cc and
// is not visible in this header.
class LinearizeOutputs {
 public:
  // Stores the op type and the output index this transformation refers to.
  LinearizeOutputs(const string& op_type, const int32& op_output);
  // Applies the transformation to `graph` and reports the result as a Status.
  Status In(Graph* graph);

 private:
  string op_type_;
  int32 op_output_;

  TF_DISALLOW_COPY_AND_ASSIGN(LinearizeOutputs);
};
// Declaration of a graph pruning routine that operates on `graph` and reports
// its result as a Status.
// NOTE(review): the meaning of the target_* and op_types/src_outputs/
// dst_inputs parameters is defined by the implementation, which is not
// visible here. The element types of the std::vector parameters are missing
// in this copy of the file.
Status InputPruneN(Graph* graph, const string& target_op_type,
                   const string& target_n_attr, const int& target_n_input,
                   const std::vector& op_types,
                   const std::vector& src_outputs,
                   const std::vector& dst_inputs);
Support of MonitoredSession and Estimator 21 | - Declarative API for Model Definition 22 | 23 | - Compatibility 24 | - Support of NVIDIA TensorFlow and DeepRec 25 | 26 | - Interoperability 27 | - Inference Pipeline Needs No Change 28 | - Support of SavedModel 29 | - Support of Variable, XDL HashTable and PAI Embedding Variable 30 | 31 | ## HybridBackend v0.5 (2021-11) 32 | 33 | Objective: "Memory-efficient loading of categorical data" 34 | 35 | - Parquet Dataset 36 | - Reading batch of tensors from numeric fields in a zero-copy way 37 | - Reading batch of sparse tensors from numeric list fields in a zero-copy way 38 | - Support of string fields 39 | - Support of local filesystem, HDFS, S3 and OSS 40 | 41 | - Data Pipeline Functions 42 | - Resizing batch of tensors and ragged tensors 43 | - Converting ragged tensors to sparse tensors 44 | 45 | Objective: "Easy to use with existing AI workflows" 46 | 47 | - Compatibility 48 | - Support of TensorFlow 1.15 and TensorFlow 1.14 49 | - GitHub Actions for uploading wheels to PyPI 50 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/framework/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Cached result of `tf_version`; None until the first successful call.
_TENSORFLOW_VERSION = None


def tf_version():
  r'''Get tensorflow version.

  The version is read from `tf.__version__`, which is available in both
  TensorFlow 1.x and 2.x, and cached after the first successful call.

  Returns:
    A `distutils.version.LooseVersion` of the installed tensorflow.

  Raises:
    ImportError: If tensorflow cannot be imported.
  '''
  global _TENSORFLOW_VERSION
  if _TENSORFLOW_VERSION:
    return _TENSORFLOW_VERSION
  try:
    import tensorflow as tf  # pylint: disable=import-outside-toplevel
  except ImportError as imp:
    raise ImportError('Tensorflow version is not supported') from imp
  # `tf.VERSION` was removed in TensorFlow 2.x, `tf.__version__` was not.
  _TENSORFLOW_VERSION = distutils.version.LooseVersion(tf.__version__)
  return _TENSORFLOW_VERSION


def tf_version_check(ver):
  r'''Whether tensorflow version is greater than or equal to ver.

  Args:
    ver: Version string to compare against, e.g. '1.15'.

  Returns:
    True if the installed tensorflow version is at least `ver`.
  '''
  return tf_version() >= distutils.version.LooseVersion(ver)
// Optimization pass that applies Rewrite("Lookup", "HbLookup") to the graph.
// NOTE(review): presumably this replaces ops of type "Lookup" with "HbLookup";
// the Rewrite helper is declared in graph/common/rewriting.h and not visible
// here.
class OptimizeLookupReplacingPass : public OpOptimizationPass {
 public:
  // Runs the rewrite on `graph` and returns its error, or OK on success.
  // NOTE(review): `options` and `disabled` are not used in this body --
  // confirm the pass is meant to run even when `disabled` is true.
  Status Optimize(Graph* graph, const SessionOptions* options,
                  const bool disabled) override {
    TF_RETURN_IF_ERROR(Rewrite("Lookup", "HbLookup").In(graph));

    return Status::OK();
  }
};

// Registers the pass through the project's REGISTER_REPLACING_OPTIMIZATION
// macro.
REGISTER_REPLACING_OPTIMIZATION(OptimizeLookupReplacingPass);
# Helm template for a single-replica Kubeflow TFJob used as a development /
# CI container: the container does no work on its own and simply idles.
apiVersion: kubeflow.org/v1
kind: TFJob
metadata:
  name: {{ .Release.Name }}
spec:
  tfReplicaSpecs:
    Chief:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          # Share the host IPC and PID namespaces with the container.
          hostIPC: true
          hostPID: true
          containers:
          - name: tensorflow
            image: {{ .Values.image }}
            imagePullPolicy: Always
            resources:
              # Requests and limits use the same GPU count from the values.
              requests:
                nvidia.com/gpu: {{ .Values.gpus }}
              limits:
                nvidia.com/gpu: {{ .Values.gpus }}
            securityContext:
              capabilities:
                # One entry per capability listed in the chart values.
                add:
                {{- range .Values.caps }}
                - {{ . }}
                {{- end }}
            # Keep the container alive indefinitely so commands can be
            # executed inside it.
            args:
            - bash
            - -c
            - tail -f /dev/null
            workingDir: /workspace
            env:
            # NOTE(review): Kubernetes does not perform shell-style expansion
            # of $PYTHONPATH in env values, so the variable will contain the
            # literal text "$PYTHONPATH" followed by ":/workspace". Confirm
            # whether the image's own PYTHONPATH is meant to be preserved.
            - name: PYTHONPATH
              value: "$PYTHONPATH:/workspace"
            - name: MALLOC_CONF
              value: "background_thread:true,metadata_thp:auto"
            - name: ARROW_NUM_THREADS
              value: "8"
            - name: S3_ADDRESSING_STYLE
              value: "virtual"
            - name: HYBRIDBACKEND_WHEEL_BUILD
              value: "{{ .Values.build }}"
            # The base port plus one additional port per GPU
            # (port + 1 through port + gpus), named gpu-port-<number>.
            ports:
            - containerPort: {{ .Values.port }}
              name: tfjob-port
            {{- range untilStep (int (add .Values.port 1)) (int (add .Values.port (add .Values.gpus 1))) 1}}
            - containerPort: {{ . }}
              name: {{ $portName := (printf "gpu-port-%d" .) }}{{ $portName }}
            {{- end }}
def device_function(op):
  r'''Compute the device string on which an operator should be placed.

  The default placement is the first GPU (or the CPU when no GPU is
  available) of the current task; any device fields already set on the
  operator are merged on top of that default.

  Args:
    op: Operator to place.

  Returns:
    device_string: device placement.
  '''
  ctx = Context.get()
  requested = pydev.DeviceSpec.from_string(op.device or '')
  local_device = '/gpu:0' if ctx.has_gpu else '/cpu:0'
  placement = pydev.DeviceSpec.from_string(
    f'/job:{ctx.task_type}/task:{ctx.task_id}{local_device}')
  if not hasattr(placement, 'merge_from'):
    # Specs without an in-place merge return a new merged spec instead.
    return placement.make_merged_spec(requested).to_string()
  placement.merge_from(requested)
  return placement.to_string()
\;) 22 | 23 | COMMON_CU_OBJS := $(COMMON_CU_SOURCES:.cc=.o) 24 | ifeq ($(OS),Darwin) 25 | $(COMMON_CU_OBJS): %.o:%.cc 26 | mkdir -p $(dir $@) 27 | $(NVCC) $(NVCC_CFLAGS) \ 28 | -o $@ -c $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \ 29 | -Xcompiler -fPIC 30 | else 31 | $(COMMON_CU_OBJS): %.o:%.cc 32 | mkdir -p $(dir $@) 33 | @$(NVCC) -M $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \ 34 | | grep -v '/usr/' \ 35 | | sed 's|$(notdir $@)|$@|g' \ 36 | | sed 's|\./||g' \ 37 | > $<.d 38 | $(NVCC) $(NVCC_CFLAGS) \ 39 | -o $@ -c $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \ 40 | -Xcompiler -fPIC 41 | endif 42 | COMMON_ALL_OBJS := $(COMMON_OBJS) $(COMMON_CU_OBJS) 43 | else 44 | COMMON_ALL_OBJS := $(COMMON_OBJS) 45 | endif 46 | 47 | ifeq ($(OS),Darwin) 48 | $(COMMON_LIB): $(COMMON_ALL_OBJS) 49 | mkdir -p $(dir $@) 50 | $(CXX) $(CFLAGS) $(COMMON_CFLAGS) -std=c++11 \ 51 | -install_name @rpath/lib$(LIBNAME).so \ 52 | -framework CoreFoundation \ 53 | -o $@ $^ $(LDFLAGS) $(COMMON_LDFLAGS) 54 | else 55 | $(COMMON_LIB): $(COMMON_ALL_OBJS) 56 | mkdir -p $(dir $@) 57 | $(CXX) $(CFLAGS) $(COMMON_CFLAGS) -std=c++11 \ 58 | -o $@ $^ $(LDFLAGS) $(COMMON_LDFLAGS) 59 | endif 60 | -------------------------------------------------------------------------------- /hybridbackend/common/logging.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "hybridbackend/common/logging.h" 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "hybridbackend/common/env.h" 27 | 28 | namespace hybridbackend { 29 | 30 | int& MinLogLevel() { 31 | static int* min_log_level = new int(EnvVarGetInt("HB_MIN_LOG_LEVEL", 0)); 32 | return *min_log_level; 33 | } 34 | 35 | LogMessage::LogMessage(const char* fname, int line) 36 | : fname_(fname), line_(line) {} 37 | 38 | LogMessage::~LogMessage() { 39 | static size_t pid = static_cast(getpid()); 40 | struct timeval tv; 41 | struct timezone tz; 42 | gettimeofday(&tv, &tz); 43 | struct tm rslt; 44 | struct tm* p = gmtime_r(&tv.tv_sec, &rslt); 45 | fprintf(stderr, "[%04d-%02d-%02d %02d:%02d:%02d.%ld] [%ld#%ld] [%s:%d] %s\n", 46 | 1900 + p->tm_year, 1 + p->tm_mon, p->tm_mday, p->tm_hour, p->tm_min, 47 | p->tm_sec, tv.tv_usec, pid, syscall(SYS_gettid), fname_, line_, 48 | str().c_str()); 49 | } 50 | 51 | } // namespace hybridbackend 52 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/keras/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
try:
  from tensorflow.python.feature_column.dense_features import DenseFeatures
except ImportError:
  # Keep this module importable on TensorFlow builds without DenseFeatures;
  # dense_features() reports the missing dependency when it is called.
  DenseFeatures = None


def dense_features(features, feature_columns):
  r'''Function produces dense tensors based on given `feature_columns`.

  Args:
    features: A mapping from key to tensors. `FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor` depends
      on corresponding `FeatureColumn`.
    feature_columns: List of feature columns.

  Returns:
    List of `Tensor`s which represents input layer of a model, which matches
    order of columns in `feature_columns`.

  Raises:
    ImportError: If `DenseFeatures` could not be imported from TensorFlow.
  '''
  if DenseFeatures is None:
    raise ImportError(
      'tensorflow.python.feature_column.dense_features.DenseFeatures is '
      'not available in the installed TensorFlow')
  # The layer fills this dict with one output tensor per feature column.
  cols_to_output_tensors = {}
  DenseFeatures(feature_columns)(
    features, cols_to_output_tensors=cols_to_output_tensors)
  return [cols_to_output_tensors[f] for f in feature_columns]
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_ 16 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_ 17 | 18 | #if HYBRIDBACKEND_TENSORFLOW 19 | 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | namespace tensorflow { 26 | 27 | class OpKernelContext; 28 | 29 | namespace hybridbackend { 30 | namespace functor { 31 | 32 | template 33 | struct SliceSum { 34 | void operator()(const int32 num_rows, const int32 num_cols, const int32 col, 35 | const T* input, T* output_total, T* output, 36 | const Eigen::GpuDevice& d); 37 | }; 38 | 39 | template 40 | struct SliceSumN { 41 | void operator()(const int32 num_rows, const int32 num_cols, const int32 col, 42 | const int32 num_inputs, const T* inputs, T* output_totals, 43 | T** outputs, const Eigen::GpuDevice& d); 44 | }; 45 | 46 | } // namespace functor 47 | } // namespace hybridbackend 48 | } // namespace tensorflow 49 | 50 | #endif // HYBRIDBACKEND_TENSORFLOW 51 | #endif // HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_ 52 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/op_optimization.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | namespace tensorflow { 27 | namespace hybridbackend { 28 | 29 | class OpOptimizationPass : public GraphOptimizationPass { 30 | public: 31 | virtual Status Run(const GraphOptimizationPassOptions& options); 32 | 33 | protected: 34 | virtual Status Optimize(Graph* graph, const SessionOptions* options, 35 | const bool disabled) = 0; 36 | }; 37 | 38 | #define REGISTER_REPLACING_OPTIMIZATION(PASS) \ 39 | REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1, PASS) 40 | 41 | #define REGISTER_REDUCTION_OPTIMIZATION(PASS) \ 42 | REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 100, \ 43 | PASS) 44 | 45 | } // namespace hybridbackend 46 | } // namespace tensorflow 47 | 48 | #endif // HYBRIDBACKEND_TENSORFLOW 49 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_ 50 | -------------------------------------------------------------------------------- /hybridbackend/common/profiler.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
namespace hybridbackend {

// Returns a `new`-allocated range in the "Synch Ops" domain labelled with
// `message`, or nullptr when HYBRIDBACKEND_NVTX is not enabled. The code
// that deletes the returned object is not in this file.
ProfilerRange* ProfilerRange::forSynch(const std::string& message) {
#if HYBRIDBACKEND_NVTX
  return new ProfilerRange("Synch Ops", message.c_str());
#else
  return nullptr;
#endif
}

// Same as forSynch, but for the "Lookup Ops" domain.
ProfilerRange* ProfilerRange::forLookup(const std::string& message) {
#if HYBRIDBACKEND_NVTX
  return new ProfilerRange("Lookup Ops", message.c_str());
#else
  return nullptr;
#endif
}

// Creates an NVTX domain named `domain` and starts a range in it carrying
// `message` as an ASCII label. Does nothing when HYBRIDBACKEND_NVTX is not
// enabled.
// NOTE(review): a domain is created for every range and nothing in this file
// destroys it -- confirm whether this is intended.
ProfilerRange::ProfilerRange(const std::string& domain,
                             const std::string& message) {
#if HYBRIDBACKEND_NVTX
  domain_ = nvtxDomainCreateA(domain.c_str());
  nvtxEventAttributes_t nvtx_attr = {0};
  nvtx_attr.version = NVTX_VERSION;
  nvtx_attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  nvtx_attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
  nvtx_attr.message.ascii = message.c_str();
  range_ = nvtxDomainRangeStartEx(domain_, &nvtx_attr);
#endif
}

// Ends the range started by the constructor.
ProfilerRange::~ProfilerRange() {
#if HYBRIDBACKEND_NVTX
  nvtxDomainRangeEnd(domain_, range_);
#endif
}

}  // namespace hybridbackend
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_COMMON_ENV_H_ 17 | #define HYBRIDBACKEND_COMMON_ENV_H_ 18 | 19 | #include 20 | 21 | namespace hybridbackend { 22 | 23 | void EnvVarSet(const std::string& env_var, const std::string& env_val); 24 | 25 | void EnvVarSet(const std::string& env_var, const int env_val); 26 | 27 | void EnvVarSetIfNotExists(const std::string& env_var, 28 | const std::string& env_val); 29 | 30 | void EnvVarSetIfNotExists(const std::string& env_var, const int env_val); 31 | 32 | std::string EnvVarGet(const std::string& env_var, 33 | const std::string& default_val); 34 | 35 | int EnvVarGetInt(const std::string& env_var, const int default_val); 36 | 37 | bool EnvVarGetBool(const std::string& env_var, const bool default_val); 38 | 39 | std::string EnvHttpGet(const std::string& url, const std::string& default_val, 40 | const long timeout); 41 | 42 | int EnvHttpGetInt(const std::string& url, const int default_val, 43 | const long timeout); 44 | 45 | bool EnvCheckInstance(const long timeout); 46 | 47 | int EnvGetGpuInfo(int* count, int* major, int* minor); 48 | 49 | } // namespace hybridbackend 50 | 51 | #endif // HYBRIDBACKEND_COMMON_ENV_H_ 52 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/common/replacing.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 
2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | namespace tensorflow { 28 | namespace hybridbackend { 29 | 30 | class Replace { 31 | public: 32 | Replace(const string& op_type, const string& optimized_op_type); 33 | Replace& WithDevice(const string& device); 34 | Replace& WithTypeAttr(const string& attr_name, 35 | const std::vector& constraints); 36 | Replace& WithExtraIntAttr(const string& attr_name); 37 | Replace& Packed(); 38 | Status In(Graph* graph); 39 | Status In(Graph* graph, int64* poccurrence_count); 40 | 41 | private: 42 | string op_type_; 43 | string optimized_op_type_; 44 | string device_; 45 | bool packed_; 46 | std::map> type_attrs_; 47 | std::vector extra_int_attrs_; 48 | 49 | TF_DISALLOW_COPY_AND_ASSIGN(Replace); 50 | }; 51 | 52 | } // namespace hybridbackend 53 | } // namespace tensorflow 54 | 55 | #endif // HYBRIDBACKEND_TENSORFLOW 56 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_ 57 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/ops/__init__.py: -------------------------------------------------------------------------------- 1 | 
def disable_optimization():
  r'''Disable optimizations for operators on GPU.

  Sets the `HB_OP_OPTIMIZATION_DISABLED` environment variable to `'1'`.
  '''
  _os.environ['HB_OP_OPTIMIZATION_DISABLED'] = '1'


def enable_optimization(logging_level=None, relocate_ops=False):
  r'''Enable optimizations for operators on GPU.

  Sets the `HB_OP_OPTIMIZATION_DISABLED` environment variable to `'0'`.

  Args:
    logging_level: Level of details to optimize operators. When given, the
      verbosity entries of the optimization modules in `TF_CPP_VMODULE` are
      set to this level; unrelated entries are kept.
    relocate_ops: Enable relocation of ops. When true,
      `HB_OP_RELOCATION_ENABLED` is set to `'1'`; when false the variable is
      left untouched.
  '''
  _os.environ['HB_OP_OPTIMIZATION_DISABLED'] = '0'
  if logging_level is not None:
    managed = (
      'op_optimization', 'replacing', 'pruning', 'relocation', 'packing',
      'fusion')
    current = _os.environ.get('TF_CPP_VMODULE', '')
    # Drop entries written by a previous call (and empty items) so repeated
    # calls do not pile up duplicate or conflicting settings.
    entries = [
      item for item in current.split(',')
      if item and item.split('=', 1)[0].strip() not in managed]
    entries.extend(f'{name}={logging_level}' for name in managed)
    _os.environ['TF_CPP_VMODULE'] = ','.join(entries)
  if relocate_ops:
    _os.environ['HB_OP_RELOCATION_ENABLED'] = '1'
14 | ==============================================================================*/ 15 | #ifndef HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_ 16 | #define HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_ 17 | 18 | #if HYBRIDBACKEND_TENSORFLOW 19 | 20 | #include 21 | #include 22 | 23 | namespace tensorflow { 24 | 25 | class OpKernelContext; 26 | 27 | namespace hybridbackend { 28 | namespace functor { 29 | 30 | template 31 | struct PartitionByModulo { 32 | void operator()(const int32 num_partitions, const Tensor& input, 33 | Tensor* output, Tensor* sizes, Tensor* indices, 34 | OpKernelContext* ctx); 35 | }; 36 | 37 | template 38 | struct PartitionByModuloN { 39 | void operator()(const int32 num_partitions, const std::vector& inputs, 40 | std::vector& outputs, 41 | std::vector& outputs_sizes, 42 | std::vector& outputs_indices, OpKernelContext* ctx); 43 | }; 44 | 45 | } // namespace functor 46 | } // namespace hybridbackend 47 | } // namespace tensorflow 48 | 49 | #endif // HYBRIDBACKEND_TENSORFLOW 50 | #endif // HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_ 51 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@_ctxlib.contextmanager
def embedding_scope(**kwargs):
  r'''Scope for defining embedding weights.

  Opens a `scope` with the given keyword arguments, turning on `sharding`
  unless the caller passed a value for it, and yields what `scope` yields.
  '''
  if 'sharding' not in kwargs:
    kwargs['sharding'] = True
  with scope(**kwargs) as embedding_ctx:
    yield embedding_ctx
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | #ifndef HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_ 16 | #define HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_ 17 | 18 | #if HYBRIDBACKEND_TENSORFLOW 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #if GOOGLE_CUDA 26 | #include 27 | #include 28 | #endif 29 | 30 | namespace tensorflow { 31 | 32 | class OpKernelContext; 33 | 34 | namespace hybridbackend { 35 | namespace functor { 36 | 37 | #if GOOGLE_CUDA 38 | template 39 | struct LookupFunctor { 40 | public: 41 | typedef T Type; 42 | 43 | void operator()(int32* d_miss_count, int32* d_hit_and_miss_keys_indices, 44 | T* d_hit_cache_indices_and_miss_keys, 45 | const T keys_cache_slab_count, const T* d_keys_cache, 46 | const int32 key_count, const T* d_keys, 47 | const Eigen::GpuDevice& d); 48 | }; 49 | 50 | #endif // GOOGLE_CUDA 51 | 52 | } // namespace functor 53 | } // namespace hybridbackend 54 | } // namespace tensorflow 55 | 56 | #endif // HYBRIDBACKEND_TENSORFLOW 57 | #endif // HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_ 58 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/sync/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# Select the dataset implementation: the V2 variant is used when
# `DatasetV2` can be imported from TensorFlow, the V1 variant otherwise.
# Whichever classes are chosen are re-labelled so they appear to be defined
# in this module under version-neutral names.
# NOTE(review): an ImportError raised while importing `dataset_v2` itself is
# also caught below and silently falls back to the V1 implementation.
try:
  from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset  # pylint: disable=unused-import, ungrouped-imports, line-too-long # noqa: F401

  from hybridbackend.tensorflow.data.sync.dataset_v2 import \
    _SyncReplicasDatasetV2 as _SyncReplicasDataset
  _SyncReplicasDataset.__module__ = __name__
  _SyncReplicasDataset.__name__ = '_SyncReplicasDataset'

  from hybridbackend.tensorflow.data.sync.dataset_v2 import \
    SyncReplicasDatasetV2 as SyncReplicasDataset
  SyncReplicasDataset.__module__ = __name__
  SyncReplicasDataset.__name__ = 'SyncReplicasDataset'
except ImportError:
  # V1 fallback; the same renaming is applied.
  from hybridbackend.tensorflow.data.sync.dataset_v1 import \
    _SyncReplicasDatasetV1 as _SyncReplicasDataset
  _SyncReplicasDataset.__module__ = __name__
  _SyncReplicasDataset.__name__ = '_SyncReplicasDataset'

  from hybridbackend.tensorflow.data.sync.dataset_v1 import \
    SyncReplicasDatasetV1 as SyncReplicasDataset
  SyncReplicasDataset.__module__ = __name__
  SyncReplicasDataset.__name__ = 'SyncReplicasDataset'
namespace tensorflow {
namespace hybridbackend {

// Describes a relocation to apply for ops of a given type; the behaviour is
// implemented outside this header. The With*/Force setters return a
// reference to the object itself, and In() applies the operation to a graph.
// NOTE(review): presumably this moves the outputs of matching ops to the
// configured device -- confirm against the implementation.
class RelocateOutputs {
 public:
  // NOTE(review): single-argument constructor is not `explicit`, so a string
  // converts implicitly -- confirm this is intended.
  RelocateOutputs(const string& op_type);
  RelocateOutputs& WithDevice(const string& device);
  RelocateOutputs& Force();
  Status In(Graph* graph);
  // Overload taking an output pointer; presumably receives the number of
  // occurrences handled -- TODO confirm.
  Status In(Graph* graph, int64* poccurrence_count);

 private:
  string op_type_;
  string device_;
  bool force_;

  TF_DISALLOW_COPY_AND_ASSIGN(RelocateOutputs);
};

// Counterpart of RelocateOutputs configured with a device and an input
// index; the behaviour is implemented outside this header.
class Relocate {
 public:
  // NOTE(review): single-argument constructor is not `explicit` -- confirm
  // this is intended.
  Relocate(const string& op_type);
  Relocate& WithDevice(const string& device);
  Relocate& WithInput(const int32 input);
  Status In(Graph* graph);
  // Overload taking an output pointer; presumably receives the number of
  // occurrences handled -- TODO confirm.
  Status In(Graph* graph, int64* poccurrence_count);

 private:
  string op_type_;
  string device_;
  int32 input_;

  TF_DISALLOW_COPY_AND_ASSIGN(Relocate);
};

}  // namespace hybridbackend
}  // namespace tensorflow
HYBRIDBACKEND_TENSORFLOW 65 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_RELOCATION_H_ 66 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/common/helper.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | namespace tensorflow { 29 | namespace hybridbackend { 30 | 31 | #if (TF_MAJOR_VERSION * 1000L + TF_MINOR_VERSION) < 1014L 32 | template 33 | void DFSFromHelper(const Graph& g, gtl::ArraySlice start, 34 | const std::function& enter, 35 | const std::function& leave, 36 | const NodeComparator& stable_comparator, 37 | const EdgeFilter& edge_filter); 38 | 39 | void DFSFrom(const Graph& g, gtl::ArraySlice start, 40 | const std::function& enter, 41 | const std::function& leave, 42 | const NodeComparator& stable_comparator = {}, 43 | const EdgeFilter& edge_filter = {}); 44 | #endif 45 | 46 | string NodeJoin(const std::vector& nodes, const string& delim); 47 | 48 | } // namespace hybridbackend 49 | } // 
namespace tensorflow 50 | 51 | #endif // HYBRIDBACKEND_TENSORFLOW 52 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_ 53 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tabular/orc.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | #include "hybridbackend/tensorflow/data/tabular/table.h" 26 | 27 | namespace tensorflow { 28 | namespace hybridbackend { 29 | 30 | class OrcAccess : public TableAccess { 31 | public: 32 | OrcAccess(OpKernelContext* ctx, const TableFormat& format, 33 | const string& filename, const int64 batch_size, 34 | const std::vector& field_names, 35 | const DataTypeVector& field_dtypes, 36 | const std::vector& field_ragged_ranks, 37 | const std::vector& field_shapes, 38 | const bool drop_remainder, const bool skip_corrupted_data); 39 | 40 | virtual int64 Count() const override; 41 | 42 | virtual Status Open() override; 43 | 44 | virtual Status Open(const int64 start, const int64 end) override; 45 | 46 | virtual Status 
Read(std::vector* output_tensors) override; 47 | 48 | virtual ~OrcAccess(); 49 | 50 | private: 51 | class Impl; 52 | std::unique_ptr pimpl_; 53 | }; 54 | 55 | } // namespace hybridbackend 56 | } // namespace tensorflow 57 | 58 | #endif // HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_ 59 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/optimize_memory.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #if HYBRIDBACKEND_TENSORFLOW 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include 25 | #include 26 | 27 | #include "hybridbackend/common/env.h" 28 | #include "hybridbackend/tensorflow/graph/common/packing.h" 29 | #include "hybridbackend/tensorflow/graph/common/rewriting.h" 30 | #include "hybridbackend/tensorflow/graph/op_optimization.h" 31 | 32 | namespace tensorflow { 33 | namespace hybridbackend { 34 | 35 | class OptimizeMemoryReplacingPass : public OpOptimizationPass { 36 | public: 37 | Status Optimize(Graph* graph, const SessionOptions* options, 38 | const bool disabled) override { 39 | if (TF_PREDICT_FALSE(disabled)) { 40 | return Status::OK(); 41 | } 42 | 43 | ::hybridbackend::EnvVarSetIfNotExists("HB_MEMORY_DECAY_MILLIS", 60000); 44 | const int kMemoryDecayMillis = 45 | ::hybridbackend::EnvVarGetInt("HB_MEMORY_DECAY_MILLIS", 0); 46 | ::hybridbackend::EnvVarSetIfNotExists( 47 | "MALLOC_CONF", "background_thread:true,metadata_thp:auto"); 48 | VLOG(1) << "Memory decay set to " << kMemoryDecayMillis << "ms"; 49 | 50 | return Status::OK(); 51 | } 52 | }; 53 | 54 | REGISTER_REPLACING_OPTIMIZATION(OptimizeMemoryReplacingPass); 55 | 56 | } // namespace hybridbackend 57 | } // namespace tensorflow 58 | 59 | #endif // HYBRIDBACKEND_TENSORFLOW 60 | -------------------------------------------------------------------------------- /.github/workflows/cpu.yaml: -------------------------------------------------------------------------------- 1 | name: release deploy on cpu 2 | 3 | on: workflow_dispatch 4 | 5 | env: 6 | IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.6-manylinux_2_24 7 | JOBNAME: hbci-${{ github.run_id }} 8 | PODNAME: hbci-${{ github.run_id }}-chief-0 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | environment: tf1.15-py3.6-manylinux_2_24 14 | steps: 15 | - name: Checkout Code 16 | 
uses: actions/checkout@v3 17 | with: 18 | submodules: 'true' 19 | - name: Setup Environment 20 | uses: aliyun/ack-set-context@v1 21 | with: 22 | access-key-id: "${{ secrets.ACCESS_KEY_ID }}" 23 | access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}" 24 | cluster-id: "${{ secrets.ACK_CLUSTER_ID }}" 25 | - name: Upload 26 | run: |- 27 | helm install ${JOBNAME} .github/helm/ \ 28 | --set image=${IMAGE} \ 29 | --set gpus=0 && \ 30 | .github/helm/upload ${PODNAME} 31 | - name: Build & Check 32 | run: |- 33 | kubectl exec -it ${PODNAME} -- \ 34 | build/install HB_TEST_LOGDIR=build/reports 35 | - name: Download 36 | run: |- 37 | kubectl exec -it ${PODNAME} -- \ 38 | sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \ 39 | kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \ 40 | mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \ 41 | kubectl exec -it ${PODNAME} -- \ 42 | sh -c 'tar -czf reports.tgz -C build/reports/ .' && \ 43 | kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \ 44 | mkdir -p reports && tar -xzf ./reports.tgz -C reports/ 45 | - name: Report 46 | uses: EnricoMi/publish-unit-test-result-action@v2 47 | with: 48 | check_name: Test Results 49 | files: "reports/**/*.xml" 50 | - name: Publish 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | with: 53 | skip_existing: true 54 | user: __token__ 55 | password: ${{ secrets.PYPI_API_TOKEN }} 56 | - name: Cleanup Environment 57 | if: always() 58 | run: |- 59 | helm uninstall ${JOBNAME} 60 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tabular/parquet.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | #include "hybridbackend/tensorflow/data/tabular/table.h" 26 | 27 | namespace tensorflow { 28 | namespace hybridbackend { 29 | 30 | class ParquetAccess : public TableAccess { 31 | public: 32 | ParquetAccess(OpKernelContext* ctx, const TableFormat& format, 33 | const string& filename, const int64 batch_size, 34 | const std::vector& field_names, 35 | const DataTypeVector& field_dtypes, 36 | const std::vector& field_ragged_ranks, 37 | const std::vector& field_shapes, 38 | const bool drop_remainder, const bool skip_corrupted_data); 39 | 40 | virtual int64 Count() const override; 41 | 42 | virtual Status Open() override; 43 | 44 | virtual Status Open(const int64 start, const int64 end) override; 45 | 46 | virtual Status Read(std::vector* output_tensors) override; 47 | 48 | virtual ~ParquetAccess(); 49 | 50 | private: 51 | class Impl; 52 | std::unique_ptr pimpl_; 53 | }; 54 | 55 | } // namespace hybridbackend 56 | } // namespace tensorflow 57 | 58 | #endif // HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_ 59 | -------------------------------------------------------------------------------- /.github/workflows/gpu.yaml: -------------------------------------------------------------------------------- 1 | name: release deploy on gpu 2 | 3 | on: workflow_dispatch 4 | 5 
| env: 6 | IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04 7 | JOBNAME: hbci-${{ github.run_id }} 8 | PODNAME: hbci-${{ github.run_id }}-chief-0 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | environment: tf1.15-py3.8-cu121-ubuntu20.04 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v3 17 | with: 18 | submodules: 'true' 19 | - name: Setup Environment 20 | uses: aliyun/ack-set-context@v1 21 | with: 22 | access-key-id: "${{ secrets.ACCESS_KEY_ID }}" 23 | access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}" 24 | cluster-id: "${{ secrets.ACK_CLUSTER_ID }}" 25 | - name: Upload 26 | run: |- 27 | helm install ${JOBNAME} .github/helm/ \ 28 | --set image=${IMAGE} \ 29 | --set gpus=2 && \ 30 | .github/helm/upload ${PODNAME} 31 | - name: Build & Check 32 | run: |- 33 | kubectl exec -it ${PODNAME} -- \ 34 | build/install HB_TEST_LOGDIR=build/reports 35 | - name: Download 36 | run: |- 37 | kubectl exec -it ${PODNAME} -- \ 38 | sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \ 39 | kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \ 40 | mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \ 41 | kubectl exec -it ${PODNAME} -- \ 42 | sh -c 'tar -czf reports.tgz -C build/reports/ .' 
&& \ 43 | kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \ 44 | mkdir -p reports && tar -xzf ./reports.tgz -C reports/ 45 | - name: Report 46 | uses: EnricoMi/publish-unit-test-result-action@v2 47 | with: 48 | check_name: Test Results 49 | files: "reports/**/*.xml" 50 | - name: Publish 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | with: 53 | skip_existing: true 54 | user: __token__ 55 | password: ${{ secrets.PYPI_API_TOKEN }} 56 | - name: Cleanup Environment 57 | if: always() 58 | run: |- 59 | helm uninstall ${JOBNAME} 60 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | # isort: skip_file 16 | 17 | r'''Input pipelines. 
18 | ''' 19 | 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | from hybridbackend.tensorflow.data.dataframe import DataFrame 25 | from hybridbackend.tensorflow.data.dataframe import parse 26 | from hybridbackend.tensorflow.data.dataframe import populate_defaults 27 | from hybridbackend.tensorflow.data.dataframe import unbatch_and_to_sparse 28 | from hybridbackend.tensorflow.data.deduplicate.dataset import deduplicate 29 | from hybridbackend.tensorflow.data.prefetch.iterator import Iterator 30 | from hybridbackend.tensorflow.data.rebatch.dataset import RebatchDataset 31 | from hybridbackend.tensorflow.data.rebatch.dataset import rebatch 32 | from hybridbackend.tensorflow.data.sync.dataset import SyncReplicasDataset 33 | from hybridbackend.tensorflow.data.tabular.dataset import Dataset 34 | from hybridbackend.tensorflow.data.tabular.dataset import ParquetDataset 35 | from hybridbackend.tensorflow.data.tabular.dataset import read_parquet 36 | 37 | # HybridBackend operators must be loaded before TensorFlow operators to 38 | # make AWS SDK implementation correct. 39 | from hybridbackend.tensorflow.data.iterators import make_initializable_iterator 40 | from hybridbackend.tensorflow.data.iterators import make_one_shot_iterator 41 | from hybridbackend.tensorflow.framework.context import Context as _ctx 42 | 43 | _ = ( 44 | _ctx.get().options 45 | .register('data_batch_count', 1) 46 | .register('data_sync_drop_remainder', True)) 47 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/distribute/partition/dual_modulo_functors.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | #ifndef HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_ 16 | #define HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_ 17 | 18 | #if HYBRIDBACKEND_TENSORFLOW 19 | 20 | #include 21 | #include 22 | 23 | namespace tensorflow { 24 | 25 | class OpKernelContext; 26 | 27 | namespace hybridbackend { 28 | namespace functor { 29 | 30 | struct ComputeShardAtStageOne; 31 | struct ComputeShardAtStageTwo; 32 | struct ComputeShardOnGpuAtStageOne; 33 | struct ComputeShardOnGpuAtStageTwo; 34 | 35 | template 36 | struct PartitionByDualModulo { 37 | void operator()(const int32 num_partitions, const int32 modulus, 38 | const Tensor& input, Tensor* output, Tensor* sizes, 39 | Tensor* indices, OpKernelContext* ctx); 40 | }; 41 | 42 | template 43 | struct PartitionByDualModuloN { 44 | void operator()(const int32 num_partitions, const int32 modulus, 45 | const std::vector& inputs, 46 | std::vector& outputs, 47 | std::vector& outputs_sizes, 48 | std::vector& outputs_indices, OpKernelContext* ctx); 49 | }; 50 | 51 | } // namespace functor 52 | } // namespace hybridbackend 53 | } // namespace tensorflow 54 | 55 | #endif // HYBRIDBACKEND_TENSORFLOW 56 | #endif // HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_ 57 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/common/rewriting.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | namespace tensorflow { 27 | namespace hybridbackend { 28 | 29 | class Rewrite { 30 | public: 31 | Rewrite(const string& op_like_name, const string& op_name); 32 | Rewrite& WithDevice(const string& device); 33 | Rewrite& WithTypeAttr(const string& attr_name, const DataType& default_attr); 34 | Rewrite& WithShapeAttr(const string& attr_name, 35 | const TensorShape& default_attr); 36 | Rewrite& WithIntAttr(const string& attr_name, const int32& default_attr); 37 | Rewrite& WithStrAttr(const string& attr_name, const string& default_attr); 38 | Rewrite& WithTypeListAttr(const string& attr_name); 39 | Status In(Graph* graph); 40 | Status In(Graph* graph, int64* poccurrence_count); 41 | 42 | private: 43 | string op_like_name_; 44 | string op_name_; 45 | string device_; 46 | int32 num_inputs_; 47 | std::map type_attrs_; 48 | std::map shape_attrs_; 49 | std::map int_attrs_; 50 | std::map str_attrs_; 51 | std::vector type_list_attrs_; 52 | 53 | 
TF_DISALLOW_COPY_AND_ASSIGN(Rewrite); 54 | }; 55 | 56 | } // namespace hybridbackend 57 | } // namespace tensorflow 58 | 59 | #endif // HYBRIDBACKEND_TENSORFLOW 60 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_ 61 | -------------------------------------------------------------------------------- /.github/workflows/cpu-nightly.yaml: -------------------------------------------------------------------------------- 1 | name: nightly deploy on cpu 2 | 3 | on: workflow_dispatch 4 | 5 | env: 6 | IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.6-manylinux_2_24 7 | JOBNAME: hbci-${{ github.run_id }} 8 | PODNAME: hbci-${{ github.run_id }}-chief-0 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | environment: tf1.15-py3.6-manylinux_2_24 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v3 17 | with: 18 | submodules: 'true' 19 | - name: Setup Environment 20 | uses: aliyun/ack-set-context@v1 21 | with: 22 | access-key-id: "${{ secrets.ACCESS_KEY_ID }}" 23 | access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}" 24 | cluster-id: "${{ secrets.ACK_CLUSTER_ID }}" 25 | - name: Upload 26 | run: |- 27 | helm install ${JOBNAME} .github/helm/ \ 28 | --set image=${IMAGE} \ 29 | --set build=.dev${{ github.run_id }} \ 30 | --set gpus=0 && \ 31 | .github/helm/upload ${PODNAME} 32 | - name: Build & Check 33 | run: |- 34 | kubectl exec -it ${PODNAME} -- \ 35 | build/install HB_TEST_LOGDIR=build/reports 36 | - name: Download 37 | run: |- 38 | kubectl exec -it ${PODNAME} -- \ 39 | sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \ 40 | kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \ 41 | mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \ 42 | kubectl exec -it ${PODNAME} -- \ 43 | sh -c 'tar -czf reports.tgz -C build/reports/ .' 
&& \ 44 | kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \ 45 | mkdir -p reports && tar -xzf ./reports.tgz -C reports/ 46 | - name: Report 47 | uses: EnricoMi/publish-unit-test-result-action@v2 48 | with: 49 | check_name: Test Results 50 | files: "reports/**/*.xml" 51 | - name: Publish 52 | uses: pypa/gh-action-pypi-publish@release/v1 53 | with: 54 | skip_existing: true 55 | user: __token__ 56 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 57 | repository_url: https://test.pypi.org/legacy/ 58 | - name: Cleanup Environment 59 | if: always() 60 | run: |- 61 | helm uninstall ${JOBNAME} 62 | -------------------------------------------------------------------------------- /.github/workflows/gpu-nightly.yaml: -------------------------------------------------------------------------------- 1 | name: nightly deploy on gpu 2 | 3 | on: workflow_dispatch 4 | 5 | env: 6 | IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04 7 | JOBNAME: hbci-${{ github.run_id }} 8 | PODNAME: hbci-${{ github.run_id }}-chief-0 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | environment: tf1.15-py3.8-cu121-ubuntu20.04 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v3 17 | with: 18 | submodules: 'true' 19 | - name: Setup Environment 20 | uses: aliyun/ack-set-context@v1 21 | with: 22 | access-key-id: "${{ secrets.ACCESS_KEY_ID }}" 23 | access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}" 24 | cluster-id: "${{ secrets.ACK_CLUSTER_ID }}" 25 | - name: Upload 26 | run: |- 27 | helm install ${JOBNAME} .github/helm/ \ 28 | --set image=${IMAGE} \ 29 | --set build=.dev${{ github.run_id }} \ 30 | --set gpus=2 && \ 31 | .github/helm/upload ${PODNAME} 32 | - name: Build & Check 33 | run: |- 34 | kubectl exec -it ${PODNAME} -- \ 35 | build/install HB_TEST_LOGDIR=build/reports 36 | - name: Download 37 | run: |- 38 | kubectl exec -it ${PODNAME} -- \ 39 | sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' 
&& \ 40 | kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \ 41 | mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \ 42 | kubectl exec -it ${PODNAME} -- \ 43 | sh -c 'tar -czf reports.tgz -C build/reports/ .' && \ 44 | kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \ 45 | mkdir -p reports && tar -xzf ./reports.tgz -C reports/ 46 | - name: Report 47 | uses: EnricoMi/publish-unit-test-result-action@v2 48 | with: 49 | check_name: Test Results 50 | files: "reports/**/*.xml" 51 | - name: Publish 52 | uses: pypa/gh-action-pypi-publish@release/v1 53 | with: 54 | skip_existing: true 55 | user: __token__ 56 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 57 | repository_url: https://test.pypi.org/legacy/ 58 | - name: Cleanup Environment 59 | if: always() 60 | run: |- 61 | helm uninstall ${JOBNAME} 62 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/common/fusion_helper.cu.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace tensorflow { 27 | 28 | class OpKernelContext; 29 | 30 | namespace hybridbackend { 31 | 32 | template 33 | __global__ void SetToNValue(const Cuda2DLaunchConfig config, const N* counts, 34 | T** ptr, T value) { 35 | CUDA_AXIS_KERNEL_LOOP(g_idx, config.virtual_thread_count.y, Y) { 36 | CUDA_AXIS_KERNEL_LOOP(g_offset, config.virtual_thread_count.x, X) { 37 | if (g_offset < counts[g_idx]) { 38 | ptr[g_idx][g_offset] = value; 39 | } 40 | } 41 | } 42 | } 43 | 44 | namespace functor { 45 | 46 | #if GOOGLE_CUDA 47 | 48 | template 49 | struct CopyPtrsNFunctor { 50 | void operator()(OpKernelContext* ctx, int8* head_host, int8* head_device, 51 | std::vector* inputs, int num_columns); 52 | }; 53 | 54 | template 55 | struct CopySizesNFunctor { 56 | void operator()(OpKernelContext* ctx, T* input_host, T* input_device, 57 | int num_columns); 58 | }; 59 | 60 | #endif // GOOGLE_CUDA 61 | 62 | } // namespace functor 63 | } // namespace hybridbackend 64 | } // namespace tensorflow 65 | 66 | #endif // HYBRIDBACKEND_TENSORFLOW 67 | 68 | #endif // HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_ 69 | -------------------------------------------------------------------------------- /docs/tutorial/ranking/taobao/data/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================= 17 | 18 | r'''Calculate statistics of Taobao Click Logs Dataset. 19 | 20 | See https://tianchi.aliyun.com/dataset/dataDetail?dataId=56 for more 21 | information. 22 | ''' 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | import argparse 29 | import logging 30 | 31 | import numpy as np 32 | import pandas as pd 33 | import tqdm 34 | 35 | 36 | def main(args): 37 | users = [] 38 | ads = [] 39 | categories = [] 40 | brands = [] 41 | for day in tqdm.tqdm(range(args.ndays)): 42 | fname = args.fname_template.format(day) 43 | click_log = pd.read_parquet(fname) 44 | users += pd.unique(click_log['user']).tolist() 45 | ads += pd.unique(click_log['ad']).tolist() 46 | categories += pd.unique(click_log['item_category']).tolist() 47 | brands += pd.unique(click_log['item_brand']).tolist() 48 | del click_log 49 | users = np.unique(users) 50 | logging.info('#users = %d', len(users)) 51 | del users 52 | ads = np.unique(ads) 53 | logging.info('#ads = %d', len(ads)) 54 | del ads 55 | categories = np.unique(categories) 56 | logging.info('#categories = %d', len(categories)) 57 | del categories 58 | brands = np.unique(brands) 59 | logging.info('#brands = %d', len(brands)) 60 | del brands 61 | 62 | 63 | if __name__ == '__main__': 64 | logging.basicConfig(level=logging.INFO) 65 | 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--ndays', type=int, default=8) 68 | 
parser.add_argument('--fname-template', default='./day_{}.parquet') 69 | main(parser.parse_args()) 70 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/common/cast.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_ 16 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_ 17 | 18 | #if HYBRIDBACKEND_TENSORFLOW 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #if GOOGLE_CUDA 26 | #include 27 | #include 28 | #include 29 | #include 30 | #endif 31 | 32 | namespace tensorflow { 33 | 34 | class OpKernelContext; 35 | 36 | namespace hybridbackend { 37 | namespace functor { 38 | 39 | #if GOOGLE_CUDA 40 | template 41 | struct Cast { 42 | void operator()(const Tensor& in, Tensor* out, OpKernelContext* ctx, 43 | cudaStream_t* stream); 44 | }; 45 | 46 | template 47 | struct CastN { 48 | void operator()(const std::vector& in, std::vector* out, 49 | OpKernelContext* ctx, cudaStream_t* stream); 50 | void operator()(const std::vector& in, std::vector* out, 51 | OpKernelContext* ctx, cudaStream_t* stream); 52 | void operator()(const std::vector& in, std::vector* out, 53 | OpKernelContext* ctx, cudaStream_t* 
stream); 54 | }; 55 | 56 | #endif // GOOGLE_CUDA 57 | 58 | } // namespace functor 59 | } // namespace hybridbackend 60 | } // namespace tensorflow 61 | 62 | #endif // HYBRIDBACKEND_TENSORFLOW 63 | #endif // HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_ 64 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/embedding/deeprecev.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''DeepRec EV as embedding tables. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.ops import variable_scope as vs 24 | 25 | from hybridbackend.tensorflow.embedding.sharding import \ 26 | ShardedEmbeddingWeightsRewriting 27 | 28 | 29 | class ShardedEmbeddingWeightsRewritingForDeepRecEV( 30 | ShardedEmbeddingWeightsRewriting): # pylint: disable=useless-object-inheritance 31 | r'''Embedding lookup decorator for DeepRec EV. 32 | ''' 33 | def __init__(self): 34 | super().__init__() 35 | self._prev_get_embedding_variable = None 36 | 37 | @property 38 | def isdynamic(self): 39 | r'''Whether embedding weights is dynamic. 
40 | ''' 41 | return True 42 | 43 | def begin(self): 44 | r'''Rewrites API. 45 | ''' 46 | try: 47 | self._prev_get_embedding_variable = ( 48 | vs.VariableScope.get_embedding_variable) # pylint: disable=protected-access 49 | vs.VariableScope.get_embedding_variable = ( # pylint: disable=protected-access 50 | self.wraps_build_embedding_weights(self._prev_get_embedding_variable)) 51 | except: # pylint: disable=bare-except 52 | pass 53 | 54 | def end(self): 55 | r'''Revert API rewriting. 56 | ''' 57 | try: 58 | vs.VariableScope.get_embedding_variable = ( # pylint: disable=protected-access 59 | self._prev_get_embedding_variable) 60 | except: # pylint: disable=bare-except 61 | pass 62 | 63 | 64 | ShardedEmbeddingWeightsRewriting.register( 65 | ShardedEmbeddingWeightsRewritingForDeepRecEV) 66 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/rebatch/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Dataset that resizes batches of DataFrame values. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import inspect 24 | 25 | # pylint: disable=ungrouped-imports 26 | from hybridbackend.tensorflow.data.dataframe import input_fields 27 | 28 | try: 29 | from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset # pylint: disable=unused-import, line-too-long # noqa: F401 30 | 31 | from hybridbackend.tensorflow.data.rebatch.dataset_v2 import \ 32 | RebatchDatasetV2 as RebatchDataset 33 | if inspect.isabstract(RebatchDataset): 34 | raise ImportError # the V2 class is still abstract: reuse the V1 fallback below 35 | RebatchDataset.__module__ = __name__ 36 | RebatchDataset.__name__ = 'RebatchDataset' 37 | except ImportError: 38 | from hybridbackend.tensorflow.data.rebatch.dataset_v1 import \ 39 | RebatchDatasetV1 as RebatchDataset 40 | RebatchDataset.__module__ = __name__ 41 | RebatchDataset.__name__ = 'RebatchDataset' 42 | assert not inspect.isabstract(RebatchDataset) 43 | # pylint: enable=ungrouped-imports 44 | 45 | 46 | def rebatch( 47 | batch_size, 48 | drop_remainder=False, 49 | fields=None): 50 | r'''Create a function that wraps a dataset in a `RebatchDataset`. 51 | 52 | Args: 53 | batch_size: Maximum number of samples in an output batch. 54 | drop_remainder: (Optional.) If True, smaller final batch is dropped. 55 | `False` by default. 56 | fields: (Optional.) List of DataFrame fields. Fetched from the input dataset 57 | by default. 58 | ''' 59 | def _apply_fn(dataset): 60 | return RebatchDataset( 61 | dataset, input_fields(dataset, fields), batch_size, 62 | drop_remainder=drop_remainder) 63 | return _apply_fn 64 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/sync/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Helpers that normalize and denormalize dataset elements for SyncReplicasDataset. 17 | 18 | This module is compatible with Tensorflow 1.12. 19 | ''' 20 | 21 | from tensorflow.python.framework import sparse_tensor 22 | from tensorflow.python.framework import tensor_spec 23 | from tensorflow.python.util import nest 24 | 25 | from hybridbackend.tensorflow.framework.ops import TensorKinds 26 | 27 | 28 | def normalize(input_dataset): 29 | r'''Flatten and normalize tensors within `input_dataset`; returns the mapped dataset, the flattened kinds and the per-spec kinds, and raises ValueError for specs other than TensorSpec or SparseTensorSpec. 30 | ''' 31 | flattened_specs = nest.flatten(input_dataset.element_spec) 32 | flattened_kinds = [] 33 | for spec in flattened_specs: 34 | if isinstance(spec, tensor_spec.TensorSpec): 35 | flattened_kinds.append(TensorKinds.VALUES) 36 | elif isinstance(spec, sparse_tensor.SparseTensorSpec): 37 | flattened_kinds.append( 38 | sparse_tensor.SparseTensorValue( 39 | TensorKinds.INDICES, TensorKinds.VALUES, TensorKinds.DENSE_SHAPE)) 40 | else: 41 | raise ValueError( 42 | 'SyncReplicasDataset cannot support input datasets with outputs ' 43 | 'other than tensors or sparse tensors') 44 | return input_dataset.map(TensorKinds.normalize),\ 45 | nest.flatten(flattened_kinds), flattened_kinds 46 | 47 | 48 | def denormalize(input_dataset, element_spec, kinds, hook=None): 49 | r'''Denormalize all tensors returned by `input_dataset`; with a `hook`, `hook.register` is mapped over the dataset first, without one an extra VALUES kind is prepended to `kinds`.
50 | ''' 51 | if hook is None: 52 | return input_dataset.map( 53 | lambda *args: TensorKinds.denormalize( 54 | element_spec, [TensorKinds.VALUES] + kinds, args)) 55 | input_dataset = input_dataset.map(hook.register) # the hook sees every element before denormalization 56 | return input_dataset.map( 57 | lambda *args: TensorKinds.denormalize( 58 | element_spec, kinds, args)) 59 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Configuration file for the Sphinx documentation builder. 17 | 18 | This file only contains a selection of the most common options. For a full 19 | list see the documentation: 20 | https://www.sphinx-doc.org/en/master/usage/configuration.html 21 | ''' 22 | 23 | # -- Project information ----------------------------------------------------- 24 | project = 'HybridBackend' 25 | author = 'Alibaba Group Holding Limited' 26 | copyright = '2021 Alibaba Group Holding Limited' # pylint: disable=redefined-builtin 27 | release = 'latest' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings.
They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'myst_parser', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.autosectionlabel', 39 | 'sphinx.ext.autosummary', 40 | 'sphinx.ext.extlinks', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.todo', 43 | 'sphinx.ext.ifconfig' 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | # This pattern also affects html_static_path and html_extra_path. 52 | exclude_patterns = [] 53 | 54 | # -- Options for HTML output ------------------------------------------------- 55 | 56 | # The theme to use for HTML and HTML Help pages. See the documentation for 57 | # a list of builtin themes. 58 | # 59 | html_theme = 'sphinx_rtd_theme' 60 | html_static_path = ['images'] 61 | 62 | suppress_warnings = [ 63 | 'autosectionlabel.*' 64 | ] 65 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tabular/table.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | 28 | #include "hybridbackend/tensorflow/data/tabular/orc.h" 29 | #include "hybridbackend/tensorflow/data/tabular/parquet.h" 30 | 31 | namespace tensorflow { 32 | namespace hybridbackend { 33 | 34 | TableAccess* TableAccess::Create( 35 | OpKernelContext* ctx, const TableFormat& format, const string& filename, 36 | const int64 batch_size, const std::vector& field_names, 37 | const DataTypeVector& field_dtypes, 38 | const std::vector& field_ragged_ranks, 39 | const std::vector& field_shapes, 40 | const bool drop_remainder, const bool skip_corrupted_data) { 41 | switch (format) { 42 | case kParquetFormat: 43 | return new ParquetAccess(ctx, format, filename, batch_size, field_names, 44 | field_dtypes, field_ragged_ranks, field_shapes, 45 | drop_remainder, skip_corrupted_data); 46 | break; 47 | case kOrcFormat: 48 | return new OrcAccess(ctx, format, filename, batch_size, field_names, 49 | field_dtypes, field_ragged_ranks, field_shapes, 50 | drop_remainder, skip_corrupted_data); 51 | break; 52 | default: 53 | LOG(ERROR) << "File format " << format << " is not supported"; 54 | return nullptr; 55 | } 56 | return nullptr; 57 | } 58 | 59 | } // namespace hybridbackend 60 | } // namespace tensorflow 61 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/framework/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''ConfigProto related functions. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.core.protobuf import config_pb2 24 | from tensorflow.python.distribute import multi_worker_util 25 | 26 | from hybridbackend.tensorflow.framework.context import Context 27 | 28 | 29 | def wraps_session_config(session_config, *args, **kwargs): 30 | r'''Wraps ConfigProto for distributed training. 
31 | ''' 32 | if not session_config: 33 | kwargs.setdefault('allow_soft_placement', True) 34 | session_config = config_pb2.ConfigProto(*args, **kwargs) 35 | session_config.gpu_options.allow_growth = True 36 | session_config.gpu_options.force_gpu_compatible = True 37 | if not session_config.device_filters: 38 | cluster_spec = Context.get().cluster_spec 39 | task_type = Context.get().task_type 40 | task_id = Context.get().task_id 41 | if cluster_spec is None: 42 | session_config.isolate_session_state = True 43 | return session_config 44 | session_config.isolate_session_state = False 45 | del session_config.device_filters[:] 46 | if task_type in ('chief', 'worker'): 47 | session_config.device_filters.extend([ 48 | '/job:ps', '/job:chief', f'/job:{task_type}/task:{task_id}']) 49 | session_config.experimental.collective_group_leader = ( 50 | multi_worker_util.collective_leader(cluster_spec, task_type, task_id)) 51 | elif task_type == 'evaluator': 52 | session_config.device_filters.append(f'/job:{task_type}/task:{task_id}') 53 | return session_config 54 | 55 | 56 | def get_session_config(*args, **kwargs): 57 | r'''Creates ConfigProto for distributed training. 58 | ''' 59 | return wraps_session_config(None, *args, **kwargs) 60 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/common/packing.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_ 17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_ 18 | 19 | #if HYBRIDBACKEND_TENSORFLOW 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | namespace tensorflow { 28 | namespace hybridbackend { 29 | 30 | class Pack { 31 | public: 32 | Pack(const string& op_type, const string& optimized_op_type); 33 | Pack& WithDevice(const string& device); 34 | Pack& WithTypeAttr(const string& attr_name, 35 | const std::vector& constraints); 36 | Pack& WithShapeAttr(const string& attr_name); 37 | Pack& WithIntAttr(const string& attr_name); 38 | Pack& WithStrAttr(const string& attr_name); 39 | Pack& WithAggregatedShapeAttr(const string& attr_name); 40 | Pack& WithAggregatedIntAttr(const string& attr_name); 41 | Pack& WithAggregatedStrAttr(const string& attr_name); 42 | Pack& WithHandle(const int32 input); 43 | Pack& WithBuckets(const int32 num_buckets); 44 | Status In(Graph* graph); 45 | Status In(Graph* graph, int64* poccurrence_count); 46 | 47 | private: 48 | string op_type_; 49 | string optimized_op_type_; 50 | string device_; 51 | std::map> type_attrs_; 52 | std::vector shape_attrs_; 53 | std::vector int_attrs_; 54 | std::vector str_attrs_; 55 | std::vector aggregated_shape_attrs_; 56 | std::vector aggregated_int_attrs_; 57 | std::vector aggregated_str_attrs_; 58 | std::vector handles_; 59 | int num_buckets_; 60 | 61 | TF_DISALLOW_COPY_AND_ASSIGN(Pack); 62 | }; 63 | 64 | } // namespace hybridbackend 65 | } // namespace tensorflow 66 | 67 | #endif // HYBRIDBACKEND_TENSORFLOW 68 | #endif // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_ 69 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/distribute/nccl/nccl_get_id.cc: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #if HYBRIDBACKEND_TENSORFLOW 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "hybridbackend/tensorflow/distribute/nccl/collective.h" 25 | 26 | namespace tensorflow { 27 | namespace hybridbackend { 28 | 29 | #if HYBRIDBACKEND_NCCL 30 | 31 | namespace { 32 | const int64 kNcclIdElements = NCCL_UNIQUE_ID_BYTES / sizeof(int64); 33 | } // anonymous namespace 34 | 35 | REGISTER_OP("HbGetNcclId") 36 | .Output("id: int64") 37 | .SetShapeFn([](shape_inference::InferenceContext* c) { 38 | c->set_output(0, c->Vector(kNcclIdElements)); 39 | return Status::OK(); 40 | }) 41 | .SetIsStateful() 42 | .Doc(R"doc( 43 | Get ID of the NCCL communciator. 44 | 45 | id: Unique ID of the NCCL communicator. 
46 | )doc"); 47 | 48 | #if GOOGLE_CUDA 49 | class GetNcclIdOp : public OpKernel { // Kernel that writes a freshly generated NCCL unique id into output 0. 50 | public: 51 | GetNcclIdOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} 52 | 53 | void Compute(OpKernelContext* ctx) override { 54 | static_assert(NCCL_UNIQUE_ID_BYTES % sizeof(int64) == 0, "Unexpected"); 55 | Tensor* id; 56 | OP_REQUIRES_OK( 57 | ctx, ctx->allocate_output(0, TensorShape({kNcclIdElements}), &id)); 58 | ncclUniqueId nccl_id; 59 | ncclGetUniqueId(&nccl_id); // NOTE(review): the return value is ignored; on failure an unset id would be copied out. 60 | std::memcpy(reinterpret_cast(id->flat().data()), 61 | nccl_id.internal, NCCL_UNIQUE_ID_BYTES); 62 | } 63 | }; 64 | 65 | REGISTER_KERNEL_BUILDER(Name("HbGetNcclId").Device(DEVICE_GPU).HostMemory("id"), 66 | GetNcclIdOp); 67 | REGISTER_KERNEL_BUILDER(Name("HbGetNcclId").Device(DEVICE_CPU), GetNcclIdOp); 68 | #endif // GOOGLE_CUDA 69 | 70 | #endif // HYBRIDBACKEND_NCCL 71 | 72 | } // namespace hybridbackend 73 | } // namespace tensorflow 74 | 75 | #endif // HYBRIDBACKEND_TENSORFLOW 76 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/distribute/tests/broadcast_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Tests for broadcast collective communication.
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import os 24 | import unittest 25 | 26 | import numpy as np 27 | 28 | import hybridbackend.common.test as hbtest 29 | 30 | # pylint: disable=missing-docstring,import-outside-toplevel 31 | 32 | 33 | def _test_broadcast(rank, a, b): 34 | r'''Broadcasts from rank 0 and returns the value received on `rank`; rank 0 feeds `a`, every other rank feeds `b`. 35 | ''' 36 | import tensorflow as tf 37 | 38 | import hybridbackend.tensorflow as hb 39 | 40 | with tf.Graph().as_default(): 41 | with hb.scope(): 42 | data = tf.constant(a) if rank == 0 else tf.constant(b) 43 | recv = hb.distribute.broadcast(data, root_rank=0) 44 | with tf.train.MonitoredTrainingSession('') as sess: 45 | return sess.run(recv) 46 | 47 | 48 | @unittest.skipUnless( 49 | os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON', 'GPU required') 50 | @unittest.skipUnless( 51 | os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required') 52 | class BroadcastTest(unittest.TestCase): 53 | def setUp(self): # pylint: disable=invalid-name 54 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' 55 | os.environ['NCCL_DEBUG'] = 'INFO' 56 | os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL' 57 | os.environ['TF_CPP_VMODULE'] = ( 58 | 'nccl_comm=1,' 59 | 'nccl_create=1,' 60 | 'nccl_broadcast=1') 61 | 62 | def tearDown(self): # pylint: disable=invalid-name 63 | del os.environ['TF_CPP_VMODULE'] 64 | del os.environ['CUDA_VISIBLE_DEVICES'] # NOTE(review): NCCL_DEBUG and NCCL_DEBUG_SUBSYS set in setUp are left in place 65 | 66 | def test_broadcast(self): 67 | a = 13 68 | b = 22 69 | results = hbtest.Spawn(2)(lambda rank: _test_broadcast(rank, a, b)) 70 | np.testing.assert_allclose(results[0], results[1], rtol=1e-6) 71 | np.testing.assert_allclose(results[0], a, rtol=1e-6) # both results must equal the root's value 72 | 73 | 74 | if __name__ == '__main__': 75 | hbtest.main(f'{__file__}.xml') 76 | -------------------------------------------------------------------------------- /hybridbackend/common/atomic.cu.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba
Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef HYBRIDBACKEND_COMMON_ATOMIC_CU_H_ 17 | #define HYBRIDBACKEND_COMMON_ATOMIC_CU_H_ 18 | 19 | __forceinline__ __device__ long atomicAdd(long* address, long val) { 20 | return (long)atomicAdd((unsigned long long*)address, (unsigned long long)val); 21 | } 22 | 23 | __forceinline__ __device__ long long atomicAdd(long long* address, 24 | long long val) { 25 | return (long long)atomicAdd((unsigned long long*)address, 26 | (unsigned long long)val); 27 | } 28 | 29 | __forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, 30 | unsigned long val) { 31 | return (unsigned long)atomicAdd((unsigned long long*)address, 32 | (unsigned long long)val); 33 | } 34 | 35 | __forceinline__ __device__ long atomicCAS(long* address, long compare, 36 | long val) { 37 | return (long)atomicCAS((unsigned long long*)address, 38 | (unsigned long long)compare, (unsigned long long)val); 39 | } 40 | 41 | __forceinline__ __device__ long long atomicCAS(long long* address, 42 | long long compare, 43 | long long val) { 44 | return (long long)atomicCAS((unsigned long long*)address, 45 | (unsigned long long)compare, 46 | (unsigned long long)val); 47 | } 48 | 49 | __forceinline__ __device__ unsigned long atomicCAS(unsigned long* address, 50 | unsigned long compare, 51 | unsigned long 
val) { 52 | return (unsigned long)atomicCAS((unsigned long long*)address, 53 | (unsigned long long)compare, 54 | (unsigned long long)val); 55 | } 56 | 57 | #endif // HYBRIDBACKEND_COMMON_ATOMIC_CU_H_ 58 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/deduplicate/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Dataset that compresses DataFrame values. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.util import nest 24 | 25 | from hybridbackend.tensorflow.data.dataframe import input_fields 26 | 27 | 28 | def deduplicate( 29 | key_idx_field_names, 30 | value_field_names, 31 | fields=None): 32 | r'''Deduplicate fields specified in `value_field_names` 33 | by using specified fields in `key_idx_field_names`. 34 | 35 | Args: 36 | key_idx_field_names: A list of strings naming the fields utilized to 37 | recover the key fields. 38 | value_field_names: A list of lists of strings, one list per key idx field, naming the fields to be 39 | deduplicated by key fields. 40 | fields: (Optional) fields of dataset.
41 | ''' 42 | def _apply_fn(dataset): 43 | all_fields = input_fields(dataset, fields=fields) 44 | all_field_names = nest.flatten({f.name: f.name for f in all_fields}) 45 | map_name_to_fields = {f.name: f for f in all_fields} 46 | 47 | for key_idx_field_name in key_idx_field_names: 48 | if key_idx_field_name not in all_field_names: 49 | raise ValueError( 50 | f'Key idx Field {key_idx_field_name} must be within the Fields') 51 | 52 | if len(value_field_names) != len(key_idx_field_names): 53 | raise ValueError( 54 | 'Value field names must have the same length as key idx field names') 55 | 56 | key_idx_field_to_value_fields = {} 57 | for i, name in enumerate(key_idx_field_names): 58 | key_idx_field_to_value_fields[name] = value_field_names[i] # NOTE(review): a repeated key idx name overwrites the earlier entry 59 | 60 | for k, v_list in key_idx_field_to_value_fields.items(): 61 | for v in v_list: 62 | if v not in all_field_names: 63 | raise ValueError( 64 | f'Value Field {v} must be within the Fields') 65 | map_name_to_fields[v].set_restore_idx_field(map_name_to_fields[k]) # link the value field to its key idx field 66 | return dataset # the same dataset object is returned; only the field objects were updated 67 | return _apply_fn 68 | -------------------------------------------------------------------------------- /hybridbackend/common/murmur3.cu.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | 15 | MurmurHash3 was written by Austin Appleby, and is placed in the public 16 | domain. The author hereby disclaims copyright to this source code. 17 | Note - The x86 and x64 versions do _not_ produce the same results, as the 18 | algorithms are optimized for their respective platforms. You can still 19 | compile and run any of them on any platform, but your performance with the 20 | non-native version will be less than optimal. 21 | 22 | See https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp. 23 | ==============================================================================*/ 24 | 25 | #ifndef HYBRIDBACKEND_COMMON_MURMUR3_CU_H_ 26 | #define HYBRIDBACKEND_COMMON_MURMUR3_CU_H_ 27 | 28 | inline __host__ __device__ uint32_t _rotl32(uint32_t x, int8_t r) { 29 | return (x << r) | (x >> (32 - r)); 30 | } 31 | 32 | template 33 | inline __host__ __device__ uint32_t murmur3_hash32(const T& input) { 34 | constexpr int len = sizeof(T); 35 | const uint8_t* const data = (const uint8_t*)&input; 36 | constexpr int nblocks = len / 4; 37 | uint32_t h1 = seed; 38 | constexpr uint32_t c1 = 0xcc9e2d51; 39 | constexpr uint32_t c2 = 0x1b873593; 40 | 41 | // body 42 | const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); 43 | for (int i = -nblocks; i; i++) { 44 | uint32_t k1 = blocks[i]; 45 | k1 *= c1; 46 | k1 = _rotl32(k1, 15); 47 | k1 *= c2; 48 | h1 ^= k1; 49 | h1 = _rotl32(h1, 13); 50 | h1 = h1 * 5 + 0xe6546b64; 51 | } 52 | 53 | // tail 54 | const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); 55 | uint32_t k1 = 0; 56 | switch (len & 3) { 57 | case 3: 58 | k1 ^= tail[2] << 16; 59 | case 2: 60 | k1 ^= tail[1] << 8; 61 | case 1: 62 | k1 ^= tail[0]; 63 | k1 *= c1; 64 | k1 = _rotl32(k1, 15); 65 | k1 *= c2; 66 | h1 ^= k1; 67 | } 68 | 69 | // finalization 70 | h1 ^= len; 71 | h1 ^= h1 >> 16; 72 | h1 *= 0x85ebca6b; 73 | h1 ^= h1 >> 13; 74 | h1 *= 0xc2b2ae35; 75 | h1 ^= h1 >> 16; 76 | return h1; 77 | } 78 | 79 | #endif // 
HYBRIDBACKEND_COMMON_MURMUR3_CU_H_ -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tests/rebatch_dataset_seq_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Parquet batch dataset rebatching test. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import os 24 | import random 25 | from six.moves import xrange # pylint: disable=redefined-builtin 26 | import tempfile 27 | import unittest 28 | 29 | import numpy as np 30 | import pyarrow as pa 31 | import pyarrow.parquet as pq 32 | import tensorflow as tf 33 | 34 | import hybridbackend.common.test as hbtest 35 | import hybridbackend.tensorflow as hb 36 | 37 | 38 | # pylint: disable=missing-docstring 39 | class ParquetDatasetSequenceRebatchTest(unittest.TestCase): 40 | def setUp(self): # pylint: disable=invalid-name 41 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 42 | self._workspace = tempfile.mkdtemp() 43 | self._filename = os.path.join(self._workspace, 'seqtest.parquet') 44 | self._nrows = 1000 45 | self._ncols = 10 46 | self._data = { 47 | 'clicks': [ 48 | [random.randint(0, 100) for col in range(self._ncols)] 49 | for row in range(self._nrows)]} 50 | pq.write_table(pa.Table.from_pydict(self._data), self._filename) 51 | 52 | def tearDown(self): # pylint: disable=invalid-name 53 | os.remove(self._filename) # NOTE(review): the mkdtemp() directory in self._workspace is not removed 54 | del os.environ['CUDA_VISIBLE_DEVICES'] 55 | 56 | def test_ragged(self): 57 | batch_size = 8 58 | with tf.Graph().as_default() as graph: 59 | ds = hb.data.ParquetDataset(self._filename, batch_size=batch_size) 60 | ds = ds.apply(hb.data.rebatch(batch_size)) 61 | batch = tf.data.make_one_shot_iterator(ds).get_next() 62 | 63 | clicks = self._data['clicks'] 64 | with tf.Session(graph=graph) as sess: 65 | for i in xrange(3): # only the first three batches are compared 66 | actual = sess.run(batch['clicks']) 67 | start_row = i * batch_size 68 | end_row = (i + 1) * batch_size 69 | expected = clicks[start_row:end_row] 70 | expected_values = [v for sublist in expected for v in sublist] 71 | np.testing.assert_equal(actual.values, expected_values) 72 | 73 | 74 | if __name__ == '__main__': 75 | hbtest.main(f'{__file__}.xml') 76 |
-------------------------------------------------------------------------------- /hybridbackend/tensorflow/graph/common/linearization.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #if HYBRIDBACKEND_TENSORFLOW 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #include "hybridbackend/common/env.h" 33 | #include "hybridbackend/tensorflow/graph/common/helper.h" 34 | #include "hybridbackend/tensorflow/graph/common/linearization.h" 35 | 36 | namespace tensorflow { 37 | namespace hybridbackend { 38 | LinearizeOutputs::LinearizeOutputs(const string& op_type, 39 | const int32& op_output) 40 | : op_type_(op_type), op_output_(op_output) {} 41 | 42 | Status LinearizeOutputs::In(Graph* graph) { // Chains the consumers of output op_output_ of each op_type_ node with control edges. 43 | std::unordered_map candidates; // NOTE(review): not referenced again in this function 44 | std::vector dependencies; // NOTE(review): not referenced again in this function 45 | 46 | std::vector sorted; 47 | GetReversePostOrder(*graph, &sorted, NodeComparatorName{}); 48 | 49 | for (Node* node : graph->op_nodes()) { 50 | if (!node->IsOp()) { 51 | continue; 52 | } 53 | 54 | if (node->type_string() != op_type_) { 55 | continue; 56 | } 57 | 58 | std::vector linear_ops; 59 | for (Node* n : sorted) { // consumers are collected in the order produced by GetReversePostOrder
60 | for (const auto& edge : n->in_edges()) { 61 | if (edge && !edge->IsControlEdge() && edge->src() == node && 62 | edge->src_output() == op_output_) { 63 | linear_ops.push_back(edge->dst()); 64 | break; // record n at most once even if several of its inputs match 65 | } 66 | } 67 | } 68 | 69 | if (linear_ops.size() < 2) { 70 | continue; // nothing to order with fewer than two consumers 71 | } 72 | 73 | for (size_t idx = 1; idx < linear_ops.size(); ++idx) { 74 | graph->AddControlEdge(linear_ops[idx - 1], linear_ops[idx]); // control edge from the previous consumer to this one 75 | } 76 | 77 | VLOG(1) << "Linearized " << linear_ops.size() << " outputs of " 78 | << node->name() << " in graph " << static_cast(graph); 79 | } 80 | 81 | return Status::OK(); 82 | } 83 | 84 | } // namespace hybridbackend 85 | } // namespace tensorflow 86 | 87 | #endif // HYBRIDBACKEND_TENSORFLOW 88 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/common/stream.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
// Holder for a StreamExecutor stream and the matching raw CUDA stream handle.
//
// NOTE(review): only declarations are visible in this header, so the notes on
// individual members below are limited to what the signatures show. Template
// arguments (on std::function and std::unique_ptr) appear to have been lost in
// this copy of the file -- confirm against the original header.
class Stream {
 public:
  // Both stream handles start out null.
  Stream() : se_stream_(nullptr), stream_(nullptr) {}
  virtual ~Stream() {}
  // Returns the stored raw CUDA stream pointer (null as set by the
  // constructor unless something else has assigned it).
  cudaStream_t* get() const { return stream_; }

  static se::Event* TensorStreamCreateEvent(OpKernelContext* ctx);

  // Two initialisation overloads; the second additionally takes a name and a
  // thread count -- presumably for the `threads_` member, TODO confirm.
  void Initialize(OpKernelContext* ctx);
  void Initialize(OpKernelContext* ctx, const string& name,
                  const int64 num_threads);
  void Launch(OpKernelContext* ctx, std::function fn);
  void LaunchUntilComputeDone(OpKernelContext* ctx, std::function fn);

  void BlockComputeUntilDone(OpKernelContext* ctx);
  void BlockComputeUntilDone(OpKernelContext* ctx, std::function fn);
  void BlockHostUntilDone();

  // The Then* members return a reference to Stream, so calls can be chained.
  Stream& ThenWaitUntilComputeDone(OpKernelContext* ctx);
  Stream& ThenExecute(OpKernelContext* ctx, std::function fn);
  // Memcpy overloads: device-to-host, host-to-device and device-to-device,
  // judging by which side is an se::DeviceMemoryBase.
  Stream& ThenMemcpy(void* dst, const se::DeviceMemoryBase& src, uint64 size);
  Stream& ThenMemcpy(se::DeviceMemoryBase* dst, const void* src, uint64 size);
  Stream& ThenMemcpy(se::DeviceMemoryBase* dst, const se::DeviceMemoryBase& src,
                     uint64 size);

 private:
  std::unique_ptr threads_;
  se::Stream* se_stream_;
  cudaStream_t* stream_;
  std::mutex mu_;
};
// Fills the host buffer `head_host` with one data pointer per input column,
// then copies that pointer table (num_columns * sizeof(T*) bytes) into
// `head_device` on the op's device stream and waits on the host until the
// stream has finished.
//
// NOTE(review): the template parameter lists and the explicit template
// arguments of the casts appear to have been lost in this copy of the file.
template
void CopyPtrsNFunctor::operator()(OpKernelContext* ctx, int8* head_host,
                                  int8* head_device,
                                  std::vector* inputs,
                                  int num_columns) {
  // View the byte buffer as an array of T* slots.
  T** head_host_ptr = reinterpret_cast(head_host);
  for (int i = 0; i < num_columns; ++i) {
    head_host_ptr[i] =
        const_cast((*inputs)[i]->flat_outer_dims().data());
  }
  auto* stream = ctx->op_device_context()->stream();
  se::DeviceMemoryBase dst_ptr(head_device, num_columns * sizeof(T*));
  stream->ThenMemcpy(&dst_ptr, head_host, num_columns * sizeof(T*));
  // Blocks the calling thread until the stream has drained, so the copy is
  // complete when this returns.
  stream->BlockHostUntilDone();
}

// Copies `num_columns` values of type T from `input_host` to `input_device`
// on the op's device stream and waits on the host until the stream is done.
template
void CopySizesNFunctor::operator()(OpKernelContext* ctx, T* input_host,
                                   T* input_device, int num_columns) {
  auto* stream = ctx->op_device_context()->stream();
  se::DeviceMemoryBase dst_ptr(input_device, num_columns * sizeof(T));
  stream->ThenMemcpy(&dst_ptr, input_host, num_columns * sizeof(T));
  stream->BlockHostUntilDone();
}

// Explicit instantiations of both functors for uint32, uint64 and every type
// covered by TF_CALL_REAL_NUMBER_TYPES.
#define DEFINE_COPY_PTRS(T) template struct CopyPtrsNFunctor;
#define DEFINE_COPY_SIZES(T) template struct CopySizesNFunctor;

#define TF_CALL_HELPER_TYPES(m) \
  TF_CALL_uint32(m) TF_CALL_uint64(m) TF_CALL_REAL_NUMBER_TYPES(m)

TF_CALL_HELPER_TYPES(DEFINE_COPY_PTRS);
TF_CALL_HELPER_TYPES(DEFINE_COPY_SIZES);
// Helpers for opening Arrow files and for inspecting Parquet / ORC data.
//
// NOTE(review): only declarations are visible here; the per-function notes
// describe the signatures, not verified behaviour. The element types of the
// std::vector out-parameters appear to have been lost in this copy.
namespace hybridbackend {
// Takes a file name and two out-parameters: a filesystem handle and a
// random-access file handle.
::arrow::Status OpenArrowFile(
    std::shared_ptr<::arrow::fs::FileSystem>* fs,
    std::shared_ptr<::arrow::io::RandomAccessFile>* file,
    const std::string& filename);

// Counterpart of OpenArrowFile; takes both handles by non-const reference and
// reports no status.
void CloseArrowFile(std::shared_ptr<::arrow::fs::FileSystem>& fs,
                    std::shared_ptr<::arrow::io::RandomAccessFile>& file,
                    const std::string& filename);

// Produces a Parquet reader (out-parameter) for an already opened file.
::arrow::Status OpenParquetReader(
    std::unique_ptr<::parquet::arrow::FileReader>* reader,
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    const bool initialized_from_env);

// Out-parameters: field names, dtypes and ragged ranks for the named file --
// presumably one entry per column, TODO confirm.
::arrow::Status GetParquetDataFrameFields(
    std::vector* field_names,
    std::vector* field_dtypes,
    std::vector* field_ragged_ranks, const std::string& filename);

::arrow::Status GetParquetRowGroupCount(int* row_group_count,
                                        const std::string& filename);

// ORC counterpart of OpenParquetReader.
::arrow::Status OpenOrcReader(
    std::unique_ptr<::arrow::adapters::orc::ORCFileReader>* reader,
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    const bool initialized_from_env);

// ORC counterpart of GetParquetDataFrameFields.
::arrow::Status GetOrcDataFrameFields(std::vector* field_names,
                                      std::vector* field_dtypes,
                                      std::vector* field_ragged_ranks,
                                      const std::string& filename);

::arrow::Status GetOrcRowCount(int* row_count, const std::string& filename);

}  // namespace hybridbackend
def gauc(labels,
         predictions,
         indicators=None,
         metrics_collections=None,
         updates_collections=None,
         name=None):
  r'''Computes the approximate gAUC.

  The per-group values and counts produced by the `hb_gauc_calc` op are handed
  to `mean`, whose result is returned unchanged.

  Args:
    labels: A `Tensor` whose shape matches `predictions`.
    predictions: A `Tensor` of predictions, passed to the op as is.
    indicators: A `Tensor` whose shape matches `predictions` -- presumably the
      group id of each example, TODO confirm. When `None`, the running index
      `0..n-1` over the flattened `labels` is used instead.
    metrics_collections: Forwarded to `mean`.
    updates_collections: Forwarded to `mean`.
    name: An optional variable_scope name; also forwarded to `mean`.

  Returns:
    Whatever `mean(aucs, counts, ...)` returns -- presumably a
    `(gauc, update_op)` tuple, TODO confirm against `mean`.
  '''
  if indicators is None:
    # No grouping supplied: give every flattened label its own int32 index.
    indicators = math_ops.range(
      0, array_ops.shape(array_ops.reshape(labels, [-1]))[0],
      dtype=dtypes.int32)
  # NOTE(review): indentation was lost in this copy of the file; the `return`
  # is assumed to sit inside the variable scope -- confirm against upstream.
  with vs.variable_scope(name, 'gauc', (labels, predictions, indicators)):
    aucs, counts = _ops.hb_gauc_calc(labels, predictions, indicators)
    return mean(aucs, counts, metrics_collections, updates_collections, name)
# Options read by the training wrappers. Registration happens at import time;
# the chained result is bound to `_` only to discard it.
_ = (
  _ctx.get().options
  .register('grad_lazy_sync', False, env='HB_GRAD_LAZY_SYNC')
  .register('sharding', False)
  .register(
    'use_hierarchical_embedding_lookup', True,
    env='HB_USE_HIERARCHICAL_EMBEDDING_LOOKUP')
  .register('batch_size', -1)
  .register('model_dir', None)
  .register('keep_checkpoint_max', None)
  .register('keep_checkpoint_every_n_hours', None)
  .register('mode', _mode_keys.TRAIN))


def _export_wrapped_optimizers(namespace):
  r'''Publishes a wrapped version of every stock optimizer class.

  Every class in `tensorflow.python.training.training` that derives from
  `Optimizer` (except `Optimizer` itself and `SyncReplicasOptimizer`) is
  passed through `wraps_optimizer` and stored in `namespace` under its
  original class name.

  Doing this inside a function keeps the loop variable out of the module's
  public namespace; a module-level `for` would leave it behind as an
  attribute.

  Args:
    namespace: The dict to populate, normally this module's `globals()`.
  '''
  excluded = (_training.Optimizer, _training.SyncReplicasOptimizer)
  # Iterate over a snapshot so later changes to the source module cannot
  # disturb the loop.
  for symbol in list(_training.__dict__.values()):
    if not isinstance(symbol, type):
      continue
    if not issubclass(symbol, _training.Optimizer) or symbol in excluded:
      continue
    namespace[symbol.__name__] = _wraps(symbol)


_export_wrapped_optimizers(globals())
def _test_get_embedding_variable(_):
  r'''Builds and runs one FTRL step over a sharded embedding variable.

  Runs in a spawned worker, hence the imports inside the function.

  Returns:
    A `(embeddings, loss)` tuple with the values fetched in that step.
  '''
  import tensorflow as tf

  import hybridbackend.tensorflow as hb

  # NOTE(review): indentation was lost in the copy this was restored from.
  # The nesting below (only the variable under `tf.device`) is a best-effort
  # reconstruction -- confirm against upstream before relying on it.
  with tf.Graph().as_default():
    with hb.scope():
      with hb.embedding_scope():
        with tf.device('/cpu:0'):
          var = tf.get_embedding_variable(
            'var_1',
            embedding_dim=3,
            initializer=tf.ones_initializer(tf.float32),
            partitioner=tf.fixed_size_partitioner(num_shards=4))
        emb = tf.nn.embedding_lookup(
          var, tf.cast([0, 1, 2, 5, 6, -7], tf.int64))
        fun = tf.multiply(emb, 2.0, name='multiply')
        loss = tf.reduce_sum(fun, name='reduce_sum')
        opt = tf.train.FtrlOptimizer(
          0.1,
          l1_regularization_strength=2.0,
          l2_regularization_strength=0.00001)
        g_v = opt.compute_gradients(loss)
        train_op = opt.apply_gradients(g_v)
        with tf.train.MonitoredTrainingSession('') as sess:
          emb_result, loss_result, _ = sess.run([emb, loss, train_op])
          return (emb_result, loss_result)


@unittest.skipUnless(
  (os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON'
   and os.getenv('HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO') == '99881015'),
  'DeepRec on GPU required')
@unittest.skipUnless(
  os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required')
class DeepRecEVTest(unittest.TestCase):
  '''Tests for embedding column.
  '''
  def setUp(self):  # pylint: disable=invalid-name
    r'''Exposes two GPUs, remembering the previous device mask.
    '''
    self._prev_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

  def tearDown(self):  # pylint: disable=invalid-name
    r'''Restores the device mask so later tests in this process see it.
    '''
    previous = getattr(self, '_prev_visible_devices', None)
    if previous is None:
      os.environ.pop('CUDA_VISIBLE_DEVICES', None)
    else:
      os.environ['CUDA_VISIBLE_DEVICES'] = previous

  def test_get_embedding_variable(self):
    r'''Runs the helper in a single spawned worker and prints the result.
    '''
    results = hbtest.Spawn()(_test_get_embedding_variable)
    print(results)

  def test_get_embedding_variable_2g(self):
    r'''Runs the helper in two spawned workers and prints the results.
    '''
    results = hbtest.Spawn(2)(_test_get_embedding_variable)
    print(results)
# pylint: disable=missing-docstring
def benchmark(params):
  r'''Measures how fast batches of CSV lines can be read and decoded.

  Args:
    params: Parsed command line arguments with `filenames`, `batch_size` and
      `fields`. When `filenames` is empty, a mock CSV file of
      `batch_size * 100` rows with one integer column per field is generated
      in a fresh temporary directory and `params.filenames` is updated to
      point at it.
  '''
  if not params.filenames:
    tf.logging.info('Started generating mock file ...')
    workspace = tempfile.mkdtemp()
    params.filenames = [os.path.join(workspace, 'benchmark.csv')]
    df = pd.DataFrame(
      np.random.randint(
        0, 100,
        size=(params.batch_size * 100, len(params.fields)),
        dtype=np.int64),
      columns=params.fields)
    df.to_csv(params.filenames[0], header=False, index=False)
    tf.logging.info(f'Mock file {params.filenames[0]} generated.')
  with tf.Graph().as_default():
    step = tf.train.get_or_create_global_step()
    ds = tf.data.TextLineDataset(params.filenames)
    ds = ds.batch(params.batch_size, drop_remainder=True)
    ds = ds.map(
      lambda line: tf.io.decode_csv(
        line, [[1 << 32] for _ in params.fields]))
    batch = tf.data.make_one_shot_iterator(ds).get_next()
    # A list returned from `map` comes back as a tuple, and `tuple + list`
    # raises TypeError, so normalise to a list before appending the step op.
    train_op = tf.group(list(batch) + [step.assign_add(1)])
    with tf.train.MonitoredTrainingSession('') as sess:
      count = 0
      prev_ts = time.time()
      try:
        while not sess.should_stop():
          sess.run(train_op)
          count += 1
      except tf.errors.OutOfRangeError:
        # Reaching the end of the input is the normal way to finish.
        pass
      duration = time.time() - prev_ts
      if count <= 0:
        print('Reading CSV files stopped unexpectedly')
        return
      print(
        'Reading CSV files elapsed in '
        f'{params.batch_size * count / duration:.2f} samples/sec ('
        f'{1000. * duration / count:.2f} msec/step)')
// Functor that copies a list of op inputs to a list of op outputs; only
// compiled when GOOGLE_CUDA is set.
//
// The state is split into an "unpinned" and a "pinned" group of inputs, each
// with its own counters and buffers. Going by the struct name this is a
// host-to-device transfer of N tensors -- TODO confirm against the
// implementation, which is not visible from this header.
//
// NOTE(review): the template parameter list and the element types of the
// std::vector members appear to have been lost in this copy of the file.
template
struct TransferH2DNFunctor {
 public:
  TransferH2DNFunctor(const OpInputList& inputs, OpOutputList& outputs,
                      OpKernelContext* ctx);
  virtual ~TransferH2DNFunctor();

  // Plain accessors for the per-group input counts and byte totals.
  int64 num_pinned_inputs() const { return num_pinned_inputs_; }
  int64 num_unpinned_inputs() const { return num_unpinned_inputs_; }

  int64 pinned_input_bytes() const { return pinned_input_bytes_; }
  int64 unpinned_input_bytes() const { return unpinned_input_bytes_; }

  // Performs the transfer using the given CUDA stream.
  Status Copy(cudaStream_t* stream);

 private:
  // Book-keeping for the unpinned group.
  int64 num_unpinned_inputs_;
  int64 unpinned_input_bytes_;
  std::vector unpinned_outputs_;
  std::vector unpinned_inputs_;
  std::vector unpinned_bytes_;
  Tensor* h_unpinned_fusion_buffer_tensor_;
  std::vector unpinned_fusion_outputs_;
  std::vector unpinned_fusion_inputs_;
  std::vector unpinned_fusion_bytes_;

  // Book-keeping for the pinned group. The h_ / d_ prefixes presumably mean
  // host and device -- TODO confirm.
  int64 num_pinned_inputs_;
  int64 pinned_input_bytes_;
  int64 pinned_buffer_bytes_;
  Tensor* h_pinned_buffer_tensor_;
  Tensor* d_pinned_buffer_tensor_;
  int8* h_pinned_buffer_;
  int8* d_pinned_buffer_;
  int8* d_pinned_input_raw_ptrs_;
  int8* d_pinned_output_raw_ptrs_;
  int64* d_pinned_input_sizes_;
  int64* d_pinned_output_sizes_;
  int64 max_pinned_output_size_;
  int pinned_copy_block_size_;
};
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import os 24 | from six.moves import xrange # pylint: disable=redefined-builtin 25 | import tempfile 26 | import unittest 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import tensorflow as tf 31 | 32 | import hybridbackend.common.test as hbtest 33 | import hybridbackend.tensorflow as hb 34 | 35 | 36 | # pylint: disable=missing-docstring 37 | class ParquetDatasetReshapeTest(unittest.TestCase): 38 | def setUp(self): # pylint: disable=invalid-name 39 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 40 | self._workspace = tempfile.mkdtemp() 41 | self._filename = os.path.join(self._workspace, 'reshape_test.parquet') 42 | num_cols = 3 43 | self._df = pd.DataFrame( 44 | np.array([ 45 | [ 46 | np.random.randint( 47 | 0, 100, 48 | size=(4,) if icol == 0 else (np.random.randint(1, 5),), 49 | dtype=np.int64) 50 | for icol in xrange(num_cols)] 51 | for _ in xrange(100)], dtype=object), 52 | columns=[f'col{c}' for c in xrange(num_cols)]) 53 | self._df.to_parquet(self._filename) 54 | 55 | def tearDown(self): # pylint: disable=invalid-name 56 | os.remove(self._filename) 57 | del os.environ['CUDA_VISIBLE_DEVICES'] 58 | 59 | def test_reshape(self): 60 | batch_size = 32 61 | with tf.Graph().as_default() as graph: 62 | ds = hb.data.Dataset.from_parquet( 63 | [self._filename], 64 | fields=[ 65 | hb.data.DataFrame.Field('col2'), 66 | hb.data.DataFrame.Field('col0', shape=[4])]) 67 | ds = ds.batch(batch_size) 68 | ds = ds.prefetch(4) 69 | batch = tf.data.make_one_shot_iterator(ds).get_next() 70 | 71 | c = self._df['col0'] 72 | with tf.Session(graph=graph) as sess: 73 | for i in xrange(3): 74 | result = sess.run(batch) 75 | start_row = i * batch_size 76 | end_row = (i + 1) * batch_size 77 | expected_items = c[start_row:end_row].to_numpy().tolist() 78 | expected_values = [] 79 | expected_splits = [0] 80 | for item in expected_items: 81 | 
expected_values.extend(item) 82 | expected_splits.append(expected_splits[-1] + len(item)) 83 | expected = np.array(expected_values) 84 | expected = np.reshape(expected, (batch_size, 4)) 85 | actual = result['col0'] 86 | np.testing.assert_allclose(actual, expected) 87 | 88 | 89 | if __name__ == '__main__': 90 | hbtest.main(f'{__file__}.xml') 91 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/training/server.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Servers using hybrid parallelism. 17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.training import monitored_session as _monitored_session 24 | from tensorflow.python.training import server_lib 25 | 26 | from hybridbackend.tensorflow.framework.config import wraps_session_config 27 | from hybridbackend.tensorflow.framework.context import Context 28 | from hybridbackend.tensorflow.framework.rewriting import scope 29 | 30 | 31 | class HybridBackendServerBase(object): # pylint: disable=useless-object-inheritance 32 | r'''Base class of server wrapper. 
33 | ''' 34 | 35 | 36 | def wraps_server(cls): 37 | r'''Decorator to create hybridbackend server class. 38 | ''' 39 | if issubclass(cls, HybridBackendServerBase): 40 | return cls 41 | 42 | class HybridBackendServer(cls, HybridBackendServerBase): 43 | r'''An in-process TensorFlow server, for use in distributed training. 44 | ''' 45 | _default = None 46 | 47 | @classmethod 48 | def get(class_): 49 | if class_._default is None: 50 | class_._default = class_(None) 51 | return class_._default 52 | 53 | def __init__(self, server_or_cluster_def, **kwargs): 54 | r'''Creates a new server with the given definition. 55 | ''' 56 | if server_or_cluster_def is None: 57 | server_or_cluster_def = Context.get().cluster_spec 58 | kwargs['job_name'] = Context.get().task_type 59 | kwargs['task_index'] = Context.get().task_id 60 | if server_or_cluster_def is None: 61 | self._is_local = True 62 | return 63 | self._is_local = False 64 | kwargs['config'] = wraps_session_config(kwargs.pop('config', None)) 65 | super().__init__(server_or_cluster_def, **kwargs) 66 | 67 | @property 68 | def target(self): 69 | r'''Returns the target for asession to connect to this server. 70 | ''' 71 | if self._is_local: 72 | return '' 73 | return super().target 74 | 75 | def monitored_session(self, **kwargs): 76 | r'''Creates a `MonitoredSession` for training. 77 | ''' 78 | with scope(): 79 | return _monitored_session.MonitoredTrainingSession( 80 | master=self.target, **kwargs) 81 | 82 | return HybridBackendServer 83 | 84 | 85 | Server = wraps_server(server_lib.Server) 86 | 87 | 88 | def monitored_session(**kwargs): 89 | r'''Creates a `MonitoredSession` for training with default server. 90 | ''' 91 | return Server.get().monitored_session(**kwargs) 92 | 93 | 94 | def target(): 95 | r'''HybridBackend server target. 
/* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if HYBRIDBACKEND_TENSORFLOW

// NOTE(review): the header name of this include was missing in the source
// (a bare `#include`, which does not compile). <vector> is assumed; confirm
// against the upstream file.
#include <vector>

#include "hybridbackend/common/env.h"
#include "hybridbackend/tensorflow/graph/common/packing.h"
#include "hybridbackend/tensorflow/graph/common/relocation.h"
#include "hybridbackend/tensorflow/graph/common/replacing.h"
#include "hybridbackend/tensorflow/graph/common/rewriting.h"
#include "hybridbackend/tensorflow/graph/op_optimization.h"

namespace tensorflow {
namespace hybridbackend {

namespace {
// Returns the value of HB_OP_FLOORMOD_SHUFFLE_OPTIMIZATION_DISABLED
// (default false). The function-local static means the environment is read
// only on the first call.
inline bool FloormodShuffleOptimizationDisabled() {
  static const bool kFloormodShuffleOptimizationDisabled =
      ::hybridbackend::EnvVarGetBool(
          "HB_OP_FLOORMOD_SHUFFLE_OPTIMIZATION_DISABLED", false);
  return kFloormodShuffleOptimizationDisabled;
}

// Returns the value of HB_OP_FLOORMOD_SHUFFLE_PACKING_DISABLED (default
// false), read once on the first call.
inline bool FloormodShufflePackingDisabled() {
  static const bool kFloormodShufflePackingDisabled =
      ::hybridbackend::EnvVarGetBool("HB_OP_FLOORMOD_SHUFFLE_PACKING_DISABLED",
                                     false);
  return kFloormodShufflePackingDisabled;
}

}  // namespace

// Replacing pass: applies the "FloormodShuffle" -> "HbFloormodShuffle"
// rewrite with an int attribute "num_partitions" (argument 1).
// NOTE(review): exact semantics of Rewrite/WithIntAttr live in rewriting.h.
class OptimizeFloormodShuffleReplacingPass : public OpOptimizationPass {
 public:
  // Does nothing when `disabled` is set or the optimization is switched off
  // through the environment; otherwise returns the status of the rewrite.
  Status Optimize(Graph* graph, const SessionOptions* options,
                  const bool disabled) override {
    if (TF_PREDICT_FALSE(disabled || FloormodShuffleOptimizationDisabled())) {
      return Status::OK();
    }

    TF_RETURN_IF_ERROR(Rewrite("FloormodShuffle", "HbFloormodShuffle")
                           .WithIntAttr("num_partitions", 1)
                           .In(graph));

    return Status::OK();
  }
};

REGISTER_REPLACING_OPTIMIZATION(OptimizeFloormodShuffleReplacingPass);

// Reduction pass: applies the "HbFloormodShuffle" -> "HbFloormodShuffleN"
// packing for the listed integer types.
// NOTE(review): exact semantics of Pack/WithTypeAttr live in packing.h.
class OptimizeFloormodShuffleReductionPass : public OpOptimizationPass {
 public:
  // Does nothing when `disabled` is set, or when either the optimization or
  // the packing step is switched off through the environment.
  Status Optimize(Graph* graph, const SessionOptions* options,
                  const bool disabled) override {
    if (TF_PREDICT_FALSE(disabled || FloormodShuffleOptimizationDisabled())) {
      return Status::OK();
    }

    // Single exit below replaces the two identical `return Status::OK()`
    // statements of the original.
    if (TF_PREDICT_TRUE(!FloormodShufflePackingDisabled())) {
      TF_RETURN_IF_ERROR(
          Pack("HbFloormodShuffle", "HbFloormodShuffleN")
              .WithTypeAttr("T", {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64})
              .WithIntAttr("num_partitions")
              .In(graph));
    }

    return Status::OK();
  }
};

REGISTER_REDUCTION_OPTIMIZATION(OptimizeFloormodShuffleReductionPass);
}  // namespace hybridbackend
}  // namespace tensorflow

#endif  // HYBRIDBACKEND_TENSORFLOW
#if HYBRIDBACKEND_TENSORFLOW

// NOTE(review): the header name of this include was missing in the source
// (a bare `#include`, which does not compile). <vector> is assumed; confirm
// against the upstream file.
#include <vector>

#include "hybridbackend/common/env.h"
#include "hybridbackend/tensorflow/graph/common/packing.h"
#include "hybridbackend/tensorflow/graph/common/relocation.h"
#include "hybridbackend/tensorflow/graph/common/replacing.h"
#include "hybridbackend/tensorflow/graph/common/rewriting.h"
#include "hybridbackend/tensorflow/graph/op_optimization.h"

namespace tensorflow {
namespace hybridbackend {

namespace {
// Returns the value of HB_OP_PARTITION_BY_MODULO_OPTIMIZATION_DISABLED
// (default false). The function-local static means the environment is read
// only on the first call.
inline bool PartitionByModuloOptimizationDisabled() {
  static const bool kPartitionByModuloOptimizationDisabled =
      ::hybridbackend::EnvVarGetBool(
          "HB_OP_PARTITION_BY_MODULO_OPTIMIZATION_DISABLED", false);
  return kPartitionByModuloOptimizationDisabled;
}

// Returns the value of HB_OP_PARTITION_BY_MODULO_PACKING_DISABLED (default
// false), read once on the first call.
inline bool PartitionByModuloPackingDisabled() {
  static const bool kPartitionByModuloPackingDisabled =
      ::hybridbackend::EnvVarGetBool(
          "HB_OP_PARTITION_BY_MODULO_PACKING_DISABLED", false);
  return kPartitionByModuloPackingDisabled;
}

}  // namespace

// Replacing pass: applies the "PartitionByModulo" -> "HbPartitionByModulo"
// rewrite with an int attribute "num_partitions" (argument 1).
// NOTE(review): exact semantics of Rewrite/WithIntAttr live in rewriting.h.
class OptimizePartitionByModuloReplacingPass : public OpOptimizationPass {
 public:
  // Does nothing when `disabled` is set or the optimization is switched off
  // through the environment; otherwise returns the status of the rewrite.
  Status Optimize(Graph* graph, const SessionOptions* options,
                  const bool disabled) override {
    if (TF_PREDICT_FALSE(disabled || PartitionByModuloOptimizationDisabled())) {
      return Status::OK();
    }

    TF_RETURN_IF_ERROR(Rewrite("PartitionByModulo", "HbPartitionByModulo")
                           .WithIntAttr("num_partitions", 1)
                           .In(graph));

    return Status::OK();
  }
};

REGISTER_REPLACING_OPTIMIZATION(OptimizePartitionByModuloReplacingPass);

// Reduction pass: applies the "HbPartitionByModulo" ->
// "HbPartitionByModuloN" packing for the listed integer types.
// NOTE(review): exact semantics of Pack/WithTypeAttr live in packing.h.
class OptimizePartitionByModuloReductionPass : public OpOptimizationPass {
 public:
  // Does nothing when `disabled` is set, or when either the optimization or
  // the packing step is switched off through the environment.
  Status Optimize(Graph* graph, const SessionOptions* options,
                  const bool disabled) override {
    if (TF_PREDICT_FALSE(disabled || PartitionByModuloOptimizationDisabled())) {
      return Status::OK();
    }

    // Single exit below replaces the two identical `return Status::OK()`
    // statements of the original.
    if (TF_PREDICT_TRUE(!PartitionByModuloPackingDisabled())) {
      TF_RETURN_IF_ERROR(
          Pack("HbPartitionByModulo", "HbPartitionByModuloN")
              .WithTypeAttr("T", {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64})
              .WithIntAttr("num_partitions")
              .In(graph));
    }

    return Status::OK();
  }
};

REGISTER_REDUCTION_OPTIMIZATION(OptimizePartitionByModuloReductionPass);
}  // namespace hybridbackend
}  // namespace tensorflow

#endif  // HYBRIDBACKEND_TENSORFLOW
r'''Dataset that reads tabular data.
'''

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.util.deprecation import deprecated

# pylint: disable=ungrouped-imports
# Prefer the V2 implementation; fall back to V1 when it cannot be imported.
try:
  from hybridbackend.tensorflow.data.tabular.dataset_v2 import \
    TabularDatasetV2 as Dataset
except ImportError:
  from hybridbackend.tensorflow.data.tabular.dataset_v1 import \
    TabularDatasetV1 as Dataset
# Whichever implementation was picked is re-published under this module.
Dataset.__module__ = __name__
Dataset.__name__ = 'TabularDataset'

# The V2 parquet dataset is only used when TensorFlow exposes `DatasetV2`.
try:
  from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset  # pylint: disable=unused-import, line-too-long # noqa: F401

  from hybridbackend.tensorflow.data.tabular.dataset_v2 import \
    ParquetDatasetV2 as ParquetDataset
except ImportError:
  from hybridbackend.tensorflow.data.tabular.dataset_v1 import \
    ParquetDatasetV1 as ParquetDataset
ParquetDataset.__module__ = __name__
ParquetDataset.__name__ = 'ParquetDataset'
# pylint: enable=ungrouped-imports


@deprecated(None, 'Prefer hb.data.Dataset.from_parquet instead.')
def read_parquet(
    batch_size,
    fields=None,
    partition_count=1,
    partition_index=0,
    drop_remainder=False,
    num_parallel_reads=None,
    num_sequential_reads=1):
  r'''Builds a function that turns filenames into a `ParquetDataset`.

  Args:
    batch_size: Maximum number of samples in an output batch.
    fields: (Optional.) List of DataFrame fields.
    partition_count: (Optional.) Count of row group partitions.
    partition_index: (Optional.) Index of row group partitions.
    drop_remainder: (Optional.) If True, only keep batches with exactly
      `batch_size` samples.
    num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
      number of files to read in parallel. Defaults to reading files
      sequentially.
    num_sequential_reads: (Optional.) A `tf.int64` scalar representing the
      number of batches to read in sequential. Defaults to 1.

  Returns:
    A function taking `filenames` and returning a `ParquetDataset` configured
    with the arguments above.
  '''
  dataset_options = dict(
    batch_size=batch_size,
    fields=fields,
    partition_count=partition_count,
    partition_index=partition_index,
    drop_remainder=drop_remainder,
    num_parallel_reads=num_parallel_reads,
    num_sequential_reads=num_sequential_reads)

  def _apply_fn(filenames):
    return ParquetDataset(filenames, **dataset_options)

  return _apply_fn
r'''Functions for optimization
'''

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def lr_with_linear_warmup_and_polynomial_decay(
    global_step,
    initial_value=24.,
    scaling_factor=1.,
    warmup_steps=None,
    decay_steps=None,
    decay_start_step=None,
    decay_exp=2,
    epsilon=1.e-7):
  r'''Calculates learning rate with linear warmup and polynomial decay.

  Three cases, depending on which arguments are given:

  * `warmup_steps` is None: the constant `initial_value * scaling_factor`.
  * only `warmup_steps`: a linear ramp from 0 that is held at the initial
    rate once `global_step` reaches `warmup_steps`.
  * warmup and decay: the warmup ramp while `global_step < warmup_steps`,
    the polynomial decay while `global_step > decay_start_step`, and the
    initial rate in between.

  Args:
    global_step: Variable representing the current step.
    initial_value: Initial value of learning rates.
    scaling_factor: Factor for scaling.
    warmup_steps: Steps of warmup.
    decay_steps: Steps of decay.
    decay_start_step: Start step of decay.
    decay_exp: Exponent part of decay.
    epsilon: Lower bound of the decayed learning rate; also the tolerance
      used to detect that neither warmup nor decay is active.

  Returns:
    New learning rate tensor.
  '''
  initial_lr = tf.constant(initial_value * scaling_factor, tf.float32)

  if warmup_steps is None:
    return initial_lr

  global_step = tf.cast(global_step, tf.float32)
  warmup_steps = tf.constant(warmup_steps, tf.float32)
  warmup_rate = initial_lr / warmup_steps
  # Equals initial_lr * global_step / warmup_steps.
  warmup_lr = initial_lr - (warmup_steps - global_step) * warmup_rate

  if decay_steps is None or decay_start_step is None:
    # Without the clamp the ramp keeps rising past the initial rate for every
    # step after warmup has finished.
    return tf.minimum(warmup_lr, initial_lr)

  decay_start_step = tf.constant(decay_start_step, tf.float32)
  steps_since_decay_start = global_step - decay_start_step
  decay_steps = tf.constant(decay_steps, tf.float32)
  # Stop decaying once `decay_steps` steps have elapsed.
  decayed_steps = tf.minimum(steps_since_decay_start, decay_steps)
  to_decay_rate = (decay_steps - decayed_steps) / decay_steps
  decay_lr = initial_lr * to_decay_rate**decay_exp
  decay_lr = tf.maximum(decay_lr, tf.constant(epsilon))

  # 0/1 masks selecting the active phase for the current step.
  warmup_lambda = tf.cast(global_step < warmup_steps, tf.float32)
  decay_lambda = tf.cast(global_step > decay_start_step, tf.float32)
  initial_lambda = tf.cast(
    tf.math.abs(warmup_lambda + decay_lambda) < epsilon, tf.float32)

  lr = warmup_lambda * warmup_lr
  lr += decay_lambda * decay_lr
  lr += initial_lambda * initial_lr
  return lr


def sgd_decay_optimize(
    loss,
    lr_initial_value,
    lr_warmup_steps,
    lr_decay_start_step,
    lr_decay_steps):
  r'''Optimize using SGD and learning rate decay.

  Args:
    loss: Loss to minimize.
    lr_initial_value: Initial value of the learning rate.
    lr_warmup_steps: Steps of warmup.
    lr_decay_start_step: Start step of decay.
    lr_decay_steps: Steps of decay.

  Returns:
    The result of `GradientDescentOptimizer.minimize`, which also advances
    the global step.
  '''
  step = tf.train.get_or_create_global_step()
  lr = lr_with_linear_warmup_and_polynomial_decay(
    step,
    initial_value=lr_initial_value,
    warmup_steps=lr_warmup_steps,
    decay_start_step=lr_decay_start_step,
    decay_steps=lr_decay_steps)
  opt = tf.train.GradientDescentOptimizer(learning_rate=lr)
  return opt.minimize(loss, global_step=step)
22 | ''' 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | import argparse 29 | import os 30 | import warnings 31 | 32 | import numpy as np 33 | import pandas as pd 34 | import pyarrow as pa 35 | import pyarrow.parquet as pq 36 | import tqdm 37 | 38 | 39 | def main(args): 40 | label_names = [args.label_prefix] 41 | if_names = [f'{args.integer_features_prefix}{i}' for i in range(13)] 42 | cf_names = [f'{args.categorical_features_prefix}{i}' for i in range(26)] 43 | 44 | pa_schema = pa.schema( 45 | [(n, pa.int32()) for n in label_names] 46 | + [(n, pa.int32()) for n in if_names] 47 | + [(n, pa.int64()) for n in cf_names]) 48 | pd_schema = dict( 49 | [(n, np.int32) for n in label_names] 50 | + [(n, np.int32) for n in if_names] 51 | + [(n, np.int64) for n in cf_names] 52 | ) 53 | 54 | converters = dict( 55 | [(n, np.int32) for n in label_names] 56 | + [(n, lambda i: int(i) if i else args.null_value) for n in if_names] 57 | + [(n, lambda i: int(i, 16) if i else args.null_value) for n in cf_names] 58 | ) 59 | 60 | parquet_fname = f'{os.path.splitext(args.fname)[0]}.parquet' 61 | try: 62 | with pq.ParquetWriter( 63 | parquet_fname, pa_schema, 64 | use_dictionary=not args.no_use_dictionary, 65 | compression=args.compression, 66 | flavor=args.flavor) as writer: 67 | for dfc in tqdm.tqdm( 68 | pd.read_csv( 69 | args.fname, 70 | sep='\t', 71 | names=label_names + if_names + cf_names, 72 | converters=converters, 73 | chunksize=args.row_group_size), 74 | desc=f'Prepare dataset from {args.fname}', 75 | unit='blocks'): 76 | pt = pa.Table.from_pandas(dfc.astype(pd_schema), preserve_index=False) 77 | writer.write_table(pt) 78 | del pt 79 | except Exception: 80 | warnings.warn( 81 | f'Failed to prepare dataset from {args.fname}', 82 | RuntimeWarning) 83 | raise 84 | 85 | 86 | if __name__ == '__main__': 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--label-prefix', 
default='label') 89 | parser.add_argument('--integer-features-prefix', default='if') 90 | parser.add_argument('--categorical-features-prefix', default='cf') 91 | parser.add_argument('--compression', default='zstd') 92 | parser.add_argument('--flavor', default='spark') 93 | parser.add_argument('--no-use-dictionary', default=False, action='store_true') 94 | parser.add_argument('--row-group-size', type=int, default=1000000) 95 | parser.add_argument('--null-value', type=int, default=-1 << 16) 96 | parser.add_argument('fname') 97 | main(parser.parse_args()) 98 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tests/parquet_dataset_ragged_nested_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Parquet batch dataset nested ragged tensors test. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import os 24 | import tempfile 25 | import unittest 26 | 27 | import numpy as np 28 | import pyarrow as pa 29 | import pyarrow.parquet as pq 30 | import tensorflow as tf 31 | 32 | import hybridbackend.common.test as hbtest 33 | import hybridbackend.tensorflow as hb 34 | 35 | 36 | # pylint: disable=missing-docstring 37 | class ParquetDatasetRaggedNestedTest(unittest.TestCase): 38 | def setUp(self): # pylint: disable=invalid-name 39 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 40 | self._workspace = tempfile.mkdtemp() 41 | self._filename = os.path.join( 42 | self._workspace, 'ragged_test_pyarrow.parquet') 43 | self._data = pa.array( 44 | [[[1], [2, 3]], [[4], [5]]], pa.list_(pa.list_(pa.int64()))) 45 | table = pa.Table.from_arrays([self._data], ['A']) 46 | pq.write_table(table, self._filename, compression='ZSTD') 47 | 48 | def tearDown(self): # pylint: disable=invalid-name 49 | os.remove(self._filename) 50 | del os.environ['CUDA_VISIBLE_DEVICES'] 51 | 52 | def test_read(self): 53 | with tf.Graph().as_default() as graph: 54 | ds = hb.data.ParquetDataset( 55 | [self._filename], 56 | batch_size=2) 57 | ds = ds.apply(hb.data.rebatch(2)) 58 | ds = ds.prefetch(4) 59 | batch = tf.data.make_one_shot_iterator(ds).get_next() 60 | 61 | with tf.Session(graph=graph) as sess: 62 | actual = sess.run(batch)['A'].to_list() 63 | expected = self._data.to_pylist() 64 | np.testing.assert_equal(actual, expected) 65 | 66 | def test_apply_to_sparse(self): 67 | with tf.Graph().as_default() as graph: 68 | ds = hb.data.Dataset.from_parquet([self._filename]) 69 | ds = ds.batch(2) 70 | batch = tf.data.make_one_shot_iterator(ds).get_next()['A'] 71 | baseline = tf.ragged.constant(self._data.to_pylist()).to_sparse() 72 | 73 | with tf.Session(graph=graph) as sess: 74 | actual, expected = sess.run([batch, baseline]) 75 | 
np.testing.assert_equal(actual.indices, expected.indices) 76 | np.testing.assert_equal(actual.values, expected.values) 77 | np.testing.assert_equal(actual.dense_shape, expected.dense_shape) 78 | 79 | def test_apply_to_tensor(self): 80 | with tf.Graph().as_default() as graph: 81 | ds = hb.data.Dataset.from_parquet([self._filename], to_dense=True) 82 | ds = ds.batch(2) 83 | batch = tf.data.make_one_shot_iterator(ds).get_next()['A'] 84 | baseline = tf.ragged.constant(self._data.to_pylist()).to_tensor() 85 | 86 | with tf.Session(graph=graph) as sess: 87 | actual, expected = sess.run([batch, baseline]) 88 | np.testing.assert_equal(actual, expected) 89 | 90 | 91 | if __name__ == '__main__': 92 | hbtest.main(f'{__file__}.xml') 93 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Test for out-of-range detect. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import os 24 | import unittest 25 | 26 | import numpy as np 27 | 28 | import hybridbackend.common.test as hbtest 29 | 30 | 31 | # pylint: disable=missing-docstring 32 | def _test_single(_): 33 | r'''Testing on a single worker 34 | ''' 35 | # pylint: disable=import-outside-toplevel 36 | import tensorflow as tf 37 | 38 | import hybridbackend.tensorflow as hb 39 | 40 | batch_size = 10 41 | 42 | with tf.Graph().as_default(): 43 | with hb.scope(mode=tf.estimator.ModeKeys.TRAIN): 44 | with tf.device('/cpu:0'): 45 | ds = tf.data.Dataset.range(100) 46 | ds = ds.batch(batch_size=batch_size) 47 | iterator = tf.data.make_one_shot_iterator(ds) 48 | batch = iterator.get_next() 49 | with tf.train.MonitoredTrainingSession('') as sess: 50 | final_result = None 51 | while not sess.should_stop(): 52 | final_result = sess.run(batch) 53 | return final_result 54 | 55 | 56 | def _test_distributed(rank): 57 | r'''Testing on multiple distributed workers 58 | ''' 59 | # pylint: disable=import-outside-toplevel 60 | import tensorflow as tf 61 | 62 | import hybridbackend.tensorflow as hb 63 | 64 | batch_size = 10 65 | 66 | with tf.Graph().as_default(): 67 | with hb.scope( 68 | data_sync_drop_remainder=False, mode=tf.estimator.ModeKeys.TRAIN): 69 | with tf.device('/cpu:0'): 70 | ds = tf.data.Dataset.range(100 + rank * 50) 71 | ds = ds.batch(batch_size=batch_size) 72 | iterator = tf.data.make_one_shot_iterator(ds) 73 | batch = iterator.get_next() 74 | with tf.train.MonitoredTrainingSession('') as sess: 75 | final_result = None 76 | while not sess.should_stop(): 77 | final_result = sess.run(batch) 78 | return final_result 79 | 80 | 81 | @unittest.skipUnless( 82 | os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON', 'GPU required') 83 | @unittest.skipUnless( 84 | os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required') 85 | class 
DetectEndTest(unittest.TestCase): 86 | r'''Tests for the out-of-range sync. 87 | ''' 88 | def setUp(self): # pylint: disable=invalid-name 89 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' 90 | 91 | def test_single(self): 92 | results = hbtest.Spawn()(_test_single) 93 | np.testing.assert_equal( 94 | results[0], [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]) 95 | 96 | def test_parallel(self): 97 | results = hbtest.Spawn(2)(_test_distributed) 98 | np.testing.assert_equal(results[0], []) 99 | np.testing.assert_equal( 100 | results[1], [140, 141, 142, 143, 144, 145, 146, 147, 148, 149]) 101 | 102 | 103 | if __name__ == '__main__': 104 | hbtest.main(f'{__file__}.xml') 105 | -------------------------------------------------------------------------------- /hybridbackend/tensorflow/training/variables.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | 16 | r'''Variable utilities for training. 
17 | ''' 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import contextlib 24 | 25 | from tensorflow.python.framework import ops 26 | from tensorflow.python.keras.backend import reset_uids as reset_keras_uids 27 | from tensorflow.python.ops import math_ops 28 | from tensorflow.python.ops import state_ops 29 | from tensorflow.python.ops import variable_scope as vs 30 | 31 | 32 | class ReuseVariables(object): # pylint: disable=useless-object-inheritance 33 | r'''Variable reusing context. 34 | ''' 35 | def __call__(self, reuse): 36 | reset_keras_uids() 37 | varscope = ops.get_default_graph().get_collection_ref(('__varscope',)) 38 | if varscope: 39 | varscope[0].variable_scopes_count.clear() 40 | vs.get_variable_scope()._reuse = reuse # pylint: disable=protected-access 41 | 42 | 43 | @contextlib.contextmanager 44 | def reuse_variables(reuse=None): 45 | r'''Context manager that reuses variables. 46 | ''' 47 | try: 48 | fn = ReuseVariables() 49 | prev_reuse = vs.get_variable_scope()._reuse # pylint: disable=protected-access 50 | if reuse is not None: 51 | fn(reuse) 52 | yield fn 53 | finally: 54 | vs.get_variable_scope()._reuse = prev_reuse # pylint: disable=protected-access 55 | 56 | 57 | @contextlib.contextmanager 58 | def disable_variable_update(): 59 | r'''Context manager that disable update in state_ops's assign operations 60 | ''' 61 | try: 62 | def wraps_assign(assign_fn): # pylint: disable=unused-argument 63 | r'''Disable the assign op 64 | ''' 65 | def wrapped_assign( 66 | ref, value, validate_shape=None, use_locking=None, name=None): # pylint: disable=unused-argument 67 | return value 68 | return wrapped_assign 69 | 70 | def wraps_assign_sub(assign_sub_fn): # pylint: disable=unused-argument 71 | r'''Disable the assign_sub op 72 | ''' 73 | def wrapped_assign_sub(ref, value, use_locking=None, name=None): # pylint: disable=unused-argument 74 | return math_ops.subtract(ref, 
# Build rules for the TensorFlow part of the library.
# NOTE(review): PYTHON, OS, CXX, NVCC, CFLAGS, CXX_CFLAGS, NVCC_CFLAGS,
# LDFLAGS, LIBNAME, COMMON_LIB, THIRDPARTY_DEPS and TENSORFLOW_LIB are not
# defined here; presumably the including Makefile provides them.
TENSORFLOW_SRC := hybridbackend/tensorflow/

# Optionally bake the TensorFlow version and git version into the build.
ifeq ($(HYBRIDBACKEND_WITH_BUILDINFO),ON)
HYBRIDBACKEND_BUILD_FRAMEWORK := $(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; print('tf{}-{}'.format(tf.__version__, tf.__git_version__))" \
	2>/dev/null)
CFLAGS := $(CFLAGS) \
	-DHYBRIDBACKEND_BUILD_FRAMEWORK="\"$(HYBRIDBACKEND_BUILD_FRAMEWORK)\""
endif

# Compile flags reported by tf.sysconfig.get_compile_flags(); a leading -I is
# rewritten to -isystem.
TENSORFLOW_CFLAGS := \
	-DEIGEN_MPL2_ONLY \
	-DEIGEN_MAX_ALIGN_BYTES=64 \
	-DEIGEN_HAS_TYPE_TRAITS=0 \
	$(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; cflags=tf.sysconfig.get_compile_flags(); print(' '.join([c.replace('-I', '-isystem ', 1) if c.startswith('-I') else c for c in cflags]))" 2>/dev/null)

ifeq ($(HYBRIDBACKEND_WITH_CUDA),ON)
TENSORFLOW_CFLAGS := $(TENSORFLOW_CFLAGS) -DGOOGLE_CUDA=1
endif

# Link flags reported by tf.sysconfig.get_link_flags(). On Darwin the
# versioned dylib reference is replaced by -ltensorflow_framework; elsewhere
# an $ORIGIN-relative rpath is prepended.
ifeq ($(OS),Darwin)
TENSORFLOW_LDFLAGS := \
	$(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; ldflags=tf.sysconfig.get_link_flags(); print(' '.join(ldflags))" 2>/dev/null)
TENSORFLOW_LDFLAGS := $(subst -l:libtensorflow_framework.1.dylib,-ltensorflow_framework,$(TENSORFLOW_LDFLAGS))
else
TENSORFLOW_LDFLAGS := \
	-Wl,-rpath='$$ORIGIN/..:$$ORIGIN/../../tensorflow' \
	$(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; ldflags=tf.sysconfig.get_link_flags(); print(' '.join(ldflags))" 2>/dev/null)
endif

# Every *.cc file under TENSORFLOW_SRC except those whose name matches *.cu*,
# as paths relative to the current directory.
TENSORFLOW_CC_SOURCES := $(shell \
	find $(TENSORFLOW_SRC) -type f \
	\( -name "*.cc" ! -name "*.cu*" \) \
	-exec realpath {} --relative-to . \;)

# Host objects. Both branches compile identically and write a <source>.d
# dependency file; the non-Darwin branch additionally strips site-packages
# entries and collapses blank lines in that file with `sed -i`.
TENSORFLOW_OBJS := $(TENSORFLOW_CC_SOURCES:.cc=.o)
ifeq ($(OS),Darwin)
$(TENSORFLOW_OBJS): %.o:%.cc $(THIRDPARTY_DEPS)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) $(TENSORFLOW_CFLAGS) $(CXX_CFLAGS) \
	-MMD -MP -MF $<.d -o $@ -c $< -fpic
else
$(TENSORFLOW_OBJS): %.o:%.cc $(THIRDPARTY_DEPS)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) $(TENSORFLOW_CFLAGS) $(CXX_CFLAGS) \
	-MMD -MP -MF $<.d -o $@ -c $< -fpic
	sed -i '/site-packages/d' $<.d
	sed -i '/^$$/N;/^\n$$/D' $<.d
endif

# CUDA objects, built from *.cu.cc with NVCC only when CUDA is enabled.
# NOTE(review): unlike the host objects these rules do not list
# $(THIRDPARTY_DEPS) as a prerequisite -- confirm that this is intended.
ifeq ($(HYBRIDBACKEND_WITH_CUDA),ON)
TENSORFLOW_CU_SOURCES := $(shell \
	find $(TENSORFLOW_SRC) -type f \
	\( -name '*.cu.cc' \) \
	-exec realpath {} --relative-to . \;)

TENSORFLOW_CU_OBJS := $(TENSORFLOW_CU_SOURCES:.cc=.o)
ifeq ($(OS),Darwin)
$(TENSORFLOW_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	$(NVCC) $(NVCC_CFLAGS) \
	-o $@ -c $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
	-Xcompiler -fPIC
else
# The first command writes <source>.d from `nvcc -M`, dropping /usr/ and
# site-packages entries and rewriting the object name to its full path.
$(TENSORFLOW_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	@$(NVCC) -M $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
	| grep -v '/usr/' \
	| grep -v 'site-packages' \
	| sed 's|$(notdir $@)|$@|g' \
	| sed 's|^\./||g' \
	> $<.d
	sed -i '/^$$/N;/^\n$$/D' $<.d
	$(NVCC) $(NVCC_CFLAGS) \
	-o $@ -c $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
	-Xcompiler -fPIC
endif
TENSORFLOW_ALL_OBJS := $(TENSORFLOW_OBJS) $(TENSORFLOW_CU_OBJS)
else
TENSORFLOW_ALL_OBJS := $(TENSORFLOW_OBJS)
endif

# Link all objects against the common library. The Darwin branch additionally
# sets an @rpath-relative install name.
ifeq ($(OS),Darwin)
$(TENSORFLOW_LIB): $(TENSORFLOW_ALL_OBJS) $(COMMON_LIB)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) -std=c++11 \
	-install_name @rpath/lib$(LIBNAME)_tensorflow.so \
	-o $@ $(TENSORFLOW_ALL_OBJS) \
	$(LDFLAGS) \
	$(TENSORFLOW_LDFLAGS) \
	-L$(LIBNAME)/ -l$(LIBNAME)
else
$(TENSORFLOW_LIB): $(TENSORFLOW_ALL_OBJS) $(COMMON_LIB)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) -std=c++11 \
	-o $@ $(TENSORFLOW_ALL_OBJS) \
	$(LDFLAGS) \
	$(TENSORFLOW_LDFLAGS) \
	-L$(LIBNAME)/ -l$(LIBNAME)
endif